This report will analyze two approaches to text classification. 

Source: http://localhost:8888/?token=858b3d9f0fa67838d04a899b00e532c7c0b92f3a73ec6357


The dataset provides patient reviews on specific drugs along with related conditions and a 10 star patient rating reflecting overall patient satisfaction. The data was obtained by crawling online pharmaceutical review sites. The intention was to study 
(1) sentiment analysis of drug experience over multiple facets, i.e. sentiments learned on specific aspects such as effectiveness and side effects, 
(2) the transferability of models among domains, i.e. conditions, and 
(3) the transferability of models among different data sources (see 'Drug Review Dataset (Druglib.com)'). 

The purpose is to compare the performance of text classificaiton approaches, using

In [1]:
import pandas as pd

In [2]:
# load data
raw_train = pd.read_csv('./drugsComTrain_raw.tsv', sep='\t')
raw_test = pd.read_csv('./drugsComTest_raw.tsv', sep='\t')
print(raw_train.shape)
print(raw_test.shape)

(161297, 7)
(53766, 7)


In [3]:
# drop empty descriptions
raw_train=raw_train[[len(x)>1 for x in raw_train['review']]]
raw_test=raw_test[[len(x)>1 for x in raw_test['review']]]

In [4]:
raw_train['rating'].hist()

<matplotlib.axes._subplots.AxesSubplot at 0x7fcbb257d9e8>

In [5]:
raw_train['outcome'] = [1 if x > 8 else 0 for x in raw_train['rating'] ]
raw_test['outcome'] = [1 if x > 8 else 0 for x in raw_test['rating'] ]

In [6]:
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import pandas as pd
import numpy as np
import multiprocessing, os, json
cores = multiprocessing.cpu_count()

In [7]:
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(common_texts)]
model_rev = Doc2Vec(
    documents, 
    vector_size=20, 
    window=2, 
    min_count=1, 
    workers=cores)

In [8]:
def d2v(reviews):
    '''reviews should be a list of strings'''
    import re
    # lets only remove punctuations - stop words and numbers are relevant
    revs = [re.sub('[^A-Za-z0-9]+', ' ', x) for x in reviews]
    embed = [model_rev.infer_vector(list(str(x))) for x in revs]
    return pd.DataFrame(embed)

In [9]:
embed_train = d2v(raw_train['review'])
embed_test = d2v(raw_test['review'])

In [10]:
embed_train['outcome'] = raw_train['outcome']
embed_test['outcome'] = raw_test['outcome']

In [11]:
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

models = {
    'd2v-xgb' : XGBClassifier(max_depth=14, min_child_weight=0.1, gamma=1.5,nthread=-1),
    'd2v-lr'  : LogisticRegression(solver='lbfgs', multi_class='ovr'),
    'd2v-mlp' : MLPClassifier(hidden_layer_sizes=(100,50)),
    #'knn' : KNeighborsClassifier(n_neighbors=4), # takes too long for inference
    'd2v-dt'  : DecisionTreeClassifier(min_samples_split=2)
}

In [12]:
# train bag of models
for model_name, model in models.items():
    print('Fitting {}'.format(model_name))
    model.fit(
        embed_train.drop('outcome',axis=1).values,
        embed_train['outcome'])

Fitting d2v-xgb
Fitting d2v-lr
Fitting d2v-mlp
Fitting d2v-dt


In [13]:
# infer w/ each model
results = {'ACTUAL' : embed_test['outcome'].reset_index(drop=True)}
for model_name, model in models.items():
    print('Predicting: {}'.format(model_name))
    results[model_name] = model.predict(embed_test.drop('outcome',axis=1).values)

Predicting: d2v-xgb
Predicting: d2v-lr
Predicting: d2v-mlp
Predicting: d2v-dt


In [14]:
from sklearn.metrics import classification_report
for model_name, model in models.items():
    print("Performance Metrics for: {}".format(model_name))
    print(classification_report(results['ACTUAL'], results[model_name]))
    print("-----------------")

Performance Metrics for: d2v-xgb
              precision    recall  f1-score   support

           0       0.72      0.79      0.75     27573
           1       0.76      0.67      0.71     26193

   micro avg       0.74      0.74      0.74     53766
   macro avg       0.74      0.73      0.73     53766
weighted avg       0.74      0.74      0.73     53766

-----------------
Performance Metrics for: d2v-lr
              precision    recall  f1-score   support

           0       0.51      0.98      0.67     27573
           1       0.48      0.02      0.03     26193

   micro avg       0.51      0.51      0.51     53766
   macro avg       0.49      0.50      0.35     53766
weighted avg       0.49      0.51      0.36     53766

-----------------
Performance Metrics for: d2v-mlp
              precision    recall  f1-score   support

           0       0.51      0.69      0.59     27573
           1       0.49      0.31      0.38     26193

   micro avg       0.51      0.51      0.51     

# Word Vectorizer

In [15]:
from sklearn.feature_extraction.text import CountVectorizer
from gensim.parsing.preprocessing import remove_stopwords
def clean(review):
    import re
    # keep only alphas
    revs = [re.sub('[^A-Za-z]+', ' ', x) for x in review]
    cleaned = [remove_stopwords(x) for x in revs]
    return cleaned

In [16]:
raw_test['cleaned'] = clean(raw_test['review'])
raw_train['cleaned'] = clean(raw_train['review'])

In [31]:
test_vec = pd.DataFrame(
    CountVectorizer(max_features=100)\
    .fit_transform(raw_test['cleaned'])\
    .toarray()
    )
train_vec = pd.DataFrame(
    CountVectorizer(max_features=100)\
    .fit_transform(raw_train['cleaned'])\
    .toarray()
    )

In [32]:
test_vec['outcome'] = raw_test['outcome']
train_vec['outcome'] = raw_train['outcome']

In [None]:
models = {
    'vec-xgb' : XGBClassifier(max_depth=14, min_child_weight=0.1, gamma=1.5,nthread=-1),
    'vec-lr'  : LogisticRegression(solver='lbfgs', multi_class='ovr'),
    'vec-mlp' : MLPClassifier(hidden_layer_sizes=(100,50)),
    'vec-dt'  : DecisionTreeClassifier(min_samples_split=2)
}
for model_name, model in models.items():
    print('Fitting {}'.format(model_name))
    model.fit(
        train_vec.drop('outcome',axis=1).values,
        train_vec['outcome'])
# infer w/ each model
# append count vec results to d2v results
for model_name, model in models.items():
    print('Predicting: {}'.format(model_name))
    results[model_name] = model.predict(test_vec.drop('outcome',axis=1).values)

Fitting vec-xgb
Fitting vec-lr
Fitting vec-mlp


In [None]:
# compute f1-score for all models
from sklearn.metrics import f1_score
mods = list(results.keys())
mods.remove('ACTUAL')
metrics = {}
for mod in mods:
    f1 = f1_score(results['ACTUAL'], results[mod])
    metrics[mod] = round(f1, 3)

In [None]:
metrics