In [1]:
import pandas as pd
import numpy as np

### 1. Read data into dataframe

In [2]:
import os
def data2df (path, label):
    file, text = [], []
    for f in os.listdir(path):
        file.append(f)
        fhr = open(path+f, 'r', encoding='utf-8', errors='ignore') 
        t = fhr.read()
        text.append(t)
        fhr.close()
    return(pd.DataFrame({'file': file, 'text': text, 'class':label}))

dfnonpro = data2df('HealthProNonPro/NonPro/', 0) # NEG- Answer not as good as professional advice
dfpro = data2df('HealthProNonPro/Pro/', 1) # POS- Answer as good as professional advice

df = pd.concat([dfpro, dfnonpro], axis=0)
#df.sample(frac=0.005)

### 2. Setup the data for Training/Testing. Use 20% for testing

In [3]:
X,y = df['text'], df['class']

from sklearn.model_selection import train_test_split
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=1)

Xtrain = Xtrain.copy()
Xtest = Xtest.copy()
ytrain = ytrain.copy()
ytest = ytest.copy()

### 3. Use Spacy to preprocess the data. Explore and pick appropriate preprocessing steps.

In [4]:
!pip install spacy



In [5]:
!python -m spacy download en_core_web_md

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_md')


In [6]:
def custom_tokenizer(doc):
    tokens = [token.lemma_.lower()
             for token in doc
             if(
             not token.is_punct and
             not token.is_space and
             not token.is_stop)]
    return " ".join(tokens)

In [7]:
%%time

import spacy
nlp = spacy.load("en_core_web_md", disable=['parser', 'ner'])
corpus = nlp.pipe(list(Xtrain))
clean_corpus = [custom_tokenizer(doc) for doc in corpus]
Xtrain = pd.Series(clean_corpus)
Xtrain.head()

CPU times: user 48.7 s, sys: 2.81 s, total: 51.5 s
Wall time: 41.2 s


0    carotid artery disease cause build plaque insi...
1                                          baking soda
2                      course break think u break bone
3    antibiotic start show result 24 48 hour take d...
4    male equivalent yeast infection pass forth giv...
dtype: object

### 4. Setup a Pipeline with TfidfVectorizer and Naïve Bayes. 

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

pipeline = Pipeline(steps=[
    ('tfidf',TfidfVectorizer(encoding="utf-8",lowercase=True,ngram_range=(1,1),
                             use_idf=True, smooth_idf=True,sublinear_tf=False,
                            vocabulary=None)),
    ("clf",MultinomialNB(alpha=1.0))
])

In [9]:
X = pipeline.fit(Xtrain,ytrain)
ypred = pipeline.predict(Xtest)

from sklearn import metrics
print (metrics.accuracy_score(ytest, ypred))
print (metrics.confusion_matrix(ytest, ypred))
print (metrics.classification_report(ytest, ypred))

0.7939972714870396
[[344  14]
 [137 238]]
              precision    recall  f1-score   support

           0       0.72      0.96      0.82       358
           1       0.94      0.63      0.76       375

    accuracy                           0.79       733
   macro avg       0.83      0.80      0.79       733
weighted avg       0.83      0.79      0.79       733



### 5. Grid Search with 4-fold Cross Validation to search for the best values for the following two hyper-parameters 
#### a. sublinear_tf in TfidfVectorizer
#### b. alpha in Naïve Bayes

In [10]:
from sklearn.model_selection import GridSearchCV

grid_params = {'tfidf__sublinear_tf': (True,False), 
               'clf__alpha': (0.1,0.25,0.5,1)}

In [11]:
grid_sr = GridSearchCV(estimator=pipeline,param_grid=grid_params,cv=4)

In [12]:
grid_sr.fit(Xtrain,ytrain)

GridSearchCV(cv=4, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('tfidf',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                         

### 6.	Print Evaluation metrics using the Best Estimator resulting from the Grid Search for Prediction/Evaluation

In [13]:
best_parameters = grid_sr.best_params_
print(best_parameters)
print(grid_sr.best_score_)

{'clf__alpha': 0.1, 'tfidf__sublinear_tf': False}
0.9392076502732241


In [14]:
corpus = nlp.pipe(list(Xtest))
clean_corpus = [custom_tokenizer(doc) for doc in corpus]
Xtest = pd.Series(clean_corpus)

In [15]:
ypred1 = grid_sr.best_estimator_.predict(Xtest)

from sklearn import metrics
print (metrics.accuracy_score(ytest, ypred1))
print (metrics.confusion_matrix(ytest, ypred1))
print (metrics.classification_report(ytest, ypred1))

0.9345156889495225
[[319  39]
 [  9 366]]
              precision    recall  f1-score   support

           0       0.97      0.89      0.93       358
           1       0.90      0.98      0.94       375

    accuracy                           0.93       733
   macro avg       0.94      0.93      0.93       733
weighted avg       0.94      0.93      0.93       733



### 7.	Extract the true negatives (TN), false positives (FP), false negatives (FN), and true positives (TP) 

In [16]:
TN, FP, FN, TP = metrics.confusion_matrix(y_true=ytest, y_pred=ypred1).ravel()

In [17]:
accuracy = (TP+TN)/(TP+FP+FN+TN)
print("Accuracy: ",accuracy)

Accuracy:  0.9345156889495225


In [18]:
precision_1 = TP/(TP+FP)
precision_0 = TN/(TN+FN)
print("Precision for 1:",precision_1)
print("Precision for 0:",precision_0)

Precision for 1: 0.9037037037037037
Precision for 0: 0.9725609756097561


In [19]:
recall_1 = TP/(TP+FN)
recall_0 = TN/(TN+FP)
print("Recall for 1:", recall_1)
print("Recall for 0:",recall_0)

Recall for 1: 0.976
Recall for 0: 0.8910614525139665


In [20]:
f1_score_1 = (2*recall_1*precision_1)/(recall_1+precision_1)
f1_score_0 = (2*recall_0*precision_0)/(recall_0+precision_0)
print("f1 score for 1:",f1_score_1)
print("f1 score for 0:",f1_score_0)

f1 score for 1: 0.9384615384615385
f1 score for 0: 0.9300291545189504
