Source: https://archive.ics.uci.edu/ml/datasets/Drug+Review+Dataset+%28Drugs.com%29

In [None]:
import pandas as pd

In [None]:
# read the tab-separated train / test splits and report their shapes
raw_train, raw_test = (
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in ('./drugsComTrain_raw.tsv', './drugsComTest_raw.tsv')
)
for split in (raw_train, raw_test):
    print(split.shape)

In [None]:
# tag each split, then stack the two into the single .csv the assignment requires
raw_train['cat'], raw_test['cat'] = 'TRAIN', 'TEST'
csv = pd.concat([raw_train, raw_test], axis=0)
csv.to_csv('text-mining-data.csv')

In [None]:
# Drop reviews that are missing or at most one character long.
# - `.str.len()` yields NaN for a missing review (calling `len()` on it would
#   raise a TypeError); filling with 0 makes such rows fail the filter.
# - `.copy()` makes the result an independent frame, so the column
#   assignments made on it later do not raise SettingWithCopyWarning.
# - `reset_index(drop=True)` keeps the index positional (0..n-1) so it lines
#   up with frames that are rebuilt from these rows by position.
raw_train = (
    raw_train[raw_train['review'].str.len().fillna(0) > 1]
    .copy()
    .reset_index(drop=True)
)
raw_test = (
    raw_test[raw_test['review'].str.len().fillna(0) > 1]
    .copy()
    .reset_index(drop=True)
)

In [None]:
# Histogram of the training ratings; labelled so the figure stands on its own.
ax = raw_train['rating'].hist()
ax.set(title='Training ratings', xlabel='rating', ylabel='number of reviews');

In [None]:
# binary target: 1 when the rating is above 8, otherwise 0
for split in (raw_train, raw_test):
    split['outcome'] = (split['rating'] > 8).astype('int64')

In [None]:
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import pandas as pd
import numpy as np
import multiprocessing, os, json
# CPU count reported by multiprocessing; passed to Doc2Vec as `workers` below
cores = multiprocessing.cpu_count()

In [None]:
# Train Doc2Vec on the training reviews themselves. Fitting it on gensim's
# `common_texts` (a tiny demo corpus) gives a vocabulary that has nothing to
# do with drug reviews, so the vectors inferred for them carry no signal.
# Only the training split is used, so nothing from the test set leaks in.
import re

# Same cleaning as `d2v`: runs of non-alphanumerics become a space, then the
# text is split on whitespace into word tokens.
documents = [
    TaggedDocument(re.sub('[^A-Za-z0-9]+', ' ', str(review)).split(), [i])
    for i, review in enumerate(raw_train['review'])
]
model_rev = Doc2Vec(
    documents,
    vector_size=20,
    window=2,
    min_count=1,
    workers=cores)

In [None]:
def d2v(reviews, model=None):
    '''Embed each review with a Doc2Vec model.

    reviews should be an iterable of strings. In every review each run of
    non-alphanumeric characters is replaced by a space, and the text is then
    split on whitespace into word tokens, which is the input `infer_vector`
    expects. (`list(str(x))` would hand it a list of single characters.)

    model is the object whose `infer_vector` is called; when omitted the
    notebook-level `model_rev` is used.

    Returns a DataFrame with one row per review, in input order.
    '''
    import re
    if model is None:
        model = model_rev
    # lets only remove punctuations - stop words and numbers are relevant
    non_alnum = re.compile('[^A-Za-z0-9]+')
    embed = [model.infer_vector(non_alnum.sub(' ', str(x)).split()) for x in reviews]
    return pd.DataFrame(embed)

In [None]:
# one embedding frame per split, rows in the same order as the reviews
embed_train, embed_test = (
    d2v(split['review']) for split in (raw_train, raw_test)
)

In [None]:
# Attach the labels by position, not by index label. The embedding frames have
# a fresh 0..n-1 index while the raw frames may have gaps after filtering;
# assigning the Series directly would align on labels and shift or drop targets.
embed_train['outcome'] = raw_train['outcome'].values
embed_test['outcome'] = raw_test['outcome'].values

In [None]:
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# Fixed seed so the stochastic learners (XGBoost, MLP, decision tree) give the
# same fit on every Restart & Run All.
RANDOM_STATE = 42

# Doc2Vec-feature classifiers, keyed by the name used in the results table.
models = {
    'd2v-xgb' : XGBClassifier(max_depth=14, min_child_weight=0.1, gamma=1.5,nthread=-1,
                              random_state=RANDOM_STATE),
    'd2v-lr'  : LogisticRegression(solver='lbfgs', multi_class='ovr'),
    'd2v-mlp' : MLPClassifier(hidden_layer_sizes=(100,50), random_state=RANDOM_STATE),
    #'knn' : KNeighborsClassifier(n_neighbors=4), # takes too long for inference
    'd2v-dt'  : DecisionTreeClassifier(min_samples_split=2, random_state=RANDOM_STATE)
}

In [None]:
# fit every classifier in the bag on the same Doc2Vec feature matrix
d2v_train_features = embed_train.drop(columns=['outcome']).values
d2v_train_target = embed_train['outcome']
for model_name in models:
    print('Fitting {}'.format(model_name))
    models[model_name].fit(d2v_train_features, d2v_train_target)

In [None]:
# true labels first, then one prediction array per fitted model
d2v_test_features = embed_test.drop(columns=['outcome']).values
results = {'ACTUAL' : embed_test['outcome'].reset_index(drop=True)}
for model_name in models:
    print('Predicting: {}'.format(model_name))
    results[model_name] = models[model_name].predict(d2v_test_features)

In [None]:
from sklearn.metrics import classification_report
# per-model precision / recall / f1 against the held-out labels
for model_name in models:
    print("Performance Metrics for: {}".format(model_name))
    report = classification_report(results['ACTUAL'], results[model_name])
    print(report)
    print("-----------------")

# Word Vectorizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from gensim.parsing.preprocessing import remove_stopwords
def clean(review):
    '''Normalise raw review strings for the bag-of-words features.

    review should be an iterable of strings, one per review. For each string,
    every run of non-alphabetic characters is replaced by a space, the text
    is lowercased, and stop words are removed with gensim's
    `remove_stopwords`.

    Returns a list of cleaned strings in input order.
    '''
    import re
    non_alpha = re.compile('[^A-Za-z]+')
    # keep only alphas. Lowercase BEFORE stop-word removal: gensim's stop-word
    # list is lowercase and matched case-sensitively, so "The" / "It" would
    # otherwise survive here and then be lowercased into features by
    # CountVectorizer (which lowercases by default).
    revs = [non_alpha.sub(' ', x).lower() for x in review]
    cleaned = [remove_stopwords(x) for x in revs]
    return cleaned

In [None]:
# add the cleaned review text to both splits (test first, then train)
for split in (raw_test, raw_train):
    split['cleaned'] = clean(split['review'])

In [None]:
# Learn the 100-term vocabulary from the training reviews ONLY, then reuse the
# same fitted vectorizer for the test reviews. Fitting a second vectorizer on
# the test split gives it its own vocabulary, so column i would be a different
# word in each matrix and a model trained on one could not score the other.
count_vectorizer = CountVectorizer(max_features=100)
train_vec = pd.DataFrame(
    count_vectorizer
    .fit_transform(raw_train['cleaned'])
    .toarray()
    )
test_vec = pd.DataFrame(
    count_vectorizer
    .transform(raw_test['cleaned'])
    .toarray()
    )

In [None]:
# Attach the labels by position, not by index label. The count-vector frames
# have a fresh 0..n-1 index while the raw frames may have gaps after filtering;
# assigning the Series directly would align on labels and shift or drop targets.
test_vec['outcome'] = raw_test['outcome'].values
train_vec['outcome'] = raw_train['outcome'].values

In [None]:
# Fixed seed so the stochastic learners (XGBoost, MLP, decision tree) give the
# same fit on every Restart & Run All.
RANDOM_STATE = 42

# Count-vector-feature classifiers, keyed by the name used in the results table.
models = {
    'vec-xgb' : XGBClassifier(max_depth=14, min_child_weight=0.1, gamma=1.5,nthread=-1,
                              random_state=RANDOM_STATE),
    'vec-lr'  : LogisticRegression(solver='lbfgs', multi_class='ovr'),
    'vec-mlp' : MLPClassifier(hidden_layer_sizes=(100,50), random_state=RANDOM_STATE),
    'vec-dt'  : DecisionTreeClassifier(min_samples_split=2, random_state=RANDOM_STATE)
}
# fit every classifier on the same count-vector feature matrix
vec_train_features = train_vec.drop(columns=['outcome']).values
vec_train_target = train_vec['outcome']
for model_name in models:
    print('Fitting {}'.format(model_name))
    models[model_name].fit(vec_train_features, vec_train_target)
# predict with each count-vector model and add the arrays to `results`,
# alongside the Doc2Vec predictions already stored there
vec_test_features = test_vec.drop(columns=['outcome']).values
for model_name in models:
    print('Predicting: {}'.format(model_name))
    results[model_name] = models[model_name].predict(vec_test_features)

In [None]:
# compute f1-score for all models
from sklearn.metrics import f1_score
# every key in `results` except the ground-truth column is a model name
mods = [name for name in results if name != 'ACTUAL']
# f1 of each model's predictions against the true labels, rounded to 3 places
metrics = {
    name: round(f1_score(results['ACTUAL'], results[name]), 3)
    for name in mods
}

In [None]:
# one row per model (model name as the index), written with the index kept
f1_table = pd.DataFrame.from_dict(metrics, orient='index')
f1_table.to_csv('f1-score.csv', index=True)

In [None]:
# true labels plus one prediction column per model
predictions = pd.DataFrame(results)
predictions.to_csv('pred.csv', index=None)