# Classifiers - Naive Bayes and Random Forest Classifier

### Base Models = Neha, Tuned Models = Danny

In [1]:
import pandas as pd
from sklearn import datasets, model_selection, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import GridSearchCV, train_test_split

# Neha:

## Importing Data

In [2]:
#cleaned and processed data using Grace's pipeline
df_train = pd.read_csv('train_split_processed.tsv', delimiter = '\t')
df_test = pd.read_csv('test_split_processed.tsv', delimiter = '\t')

In [3]:
df_train.head()

Unnamed: 0.1,Unnamed: 0,tweet_id,adr_mention,tweet_text
0,0,451439947401031682,0,""" "" "" well , the way i see it , you can either..."
1,1,341735936406732801,0,if i run out of prozac i ama be pissed man ugh...
2,2,498679682779611136,0,my friend told me she liked nexplanon more tha...
3,3,349139925746192384,0,<user> ferry corsten </user> <user> ferry cors...
4,4,349038276369330176,0,<user> sarah tw33ts </user> i am on 40mg of fl...


In [4]:
df_test.head()

Unnamed: 0.1,Unnamed: 0,tweet_id,adr_mention,tweet_text
0,0,342260371878449153,0,<user> joanne __howe </user> so sorry to hear ...
1,1,349023751842709504,0,my right hand is so weak i can not even snap m...
2,2,539972840117002241,0,sorry i suggested imodium for diarrhea of the ...
3,3,340450765535576064,0,do you know what is badass ? ciprofloxacin tha...
4,4,529116068196151296,0,i go out walking on lunesta <hashtag> classic ...


## Naive Bayes Model

### Base Model

In [5]:
model = Pipeline(memory=None, steps=[('tfidfvectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict', encoding='utf-8', input='content',
       lowercase=True, max_df=1.0, max_features=None, min_df=1,
       ngram_range=(1, 3), norm='l1', preprocessor=None, vocabulary=None)), ('multinomialnb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=False))])

model.fit(df_train['tweet_text'], df_train['adr_mention'])

labels = model.predict(df_test['tweet_text'])

In [6]:
# testing code using Bryan's code
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
f1 = f1_score(df_test['adr_mention'], labels)
precision = precision_score(df_test['adr_mention'], labels)
recall = recall_score(df_test['adr_mention'], labels)
accuracy = accuracy_score(df_test['adr_mention'], labels)
print('F1: {}\nPrecision: {}\nRecall: {}\nAccuracy: {}'.format(round(f1,3), round(precision,3), round(recall,3), round(accuracy,3)))

F1: 0.004
Precision: 1.0
Recall: 0.002
Accuracy: 0.906


### Fine-tuned Parameters

In [None]:
model2 = Pipeline(memory=None, steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict', encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 3), norm='l1', preprocessor=None, vocabulary=None)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=False))])

parameters = {   
    'tfidf__use_idf': (True, False),  
    'tfidf__sublinear_tf': (True, False),  
    'tfidf__ngram_range': [(1,1),(1,2),(1,3)],
    'tfidf__norm': ('l1', 'l2'),  
    'clf__alpha': (1, 0.1, 0.01, 0.001, 0.0001, 0.00001)  
    } 

model_tuned = model_selection.GridSearchCV(model2, parameters, cv = 10, n_jobs = -1)

model_tuned.fit(df_train['tweet_text'], df_train['adr_mention'])
labels2 = model_tuned.predict(df_test['tweet_text'])

In [None]:
# testing code using Bryan's code
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
f1 = f1_score(df_test['adr_mention'], labels2)
precision = precision_score(df_test['adr_mention'], labels2)
recall = recall_score(df_test['adr_mention'], labels2)
accuracy = accuracy_score(df_test['adr_mention'], labels2)
print('F1: {}\nPrecision: {}\nRecall: {}\nAccuracy: {}'.format(round(f1,3), round(precision,3), round(recall,3), round(accuracy,3)))

## Random Forest Classifier

### Base Model

In [8]:
model3 = Pipeline(
       [
               ('vect', CountVectorizer(decode_error = 'ignore', strip_accents = 'unicode')),
               ('tfidf', TfidfTransformer(use_idf = False)),
               ('clf', RandomForestClassifier())
               ]
       )
model3 = model3.fit(df_train['tweet_text'], df_train['adr_mention'])
predicted = model3.predict(df_test['tweet_text'])



In [10]:
# testing code using Bryan's code
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
f1 = f1_score(df_test['adr_mention'],predicted)
precision = precision_score(df_test['adr_mention'],predicted)
recall = recall_score(df_test['adr_mention'],predicted)
accuracy = accuracy_score(df_test['adr_mention'],predicted)
print('F1: {}\nPrecision: {}\nRecall: {}\nAccuracy: {}'.format(round(f1,3), round(precision,3), round(recall,3), round(accuracy,3)))

F1: 0.093
Precision: 0.75
Recall: 0.049
Accuracy: 0.909


### Fine-tuned Parameters

In [11]:
model4 = Pipeline(
       [
               ('vect', CountVectorizer(decode_error = 'ignore', strip_accents = 'unicode')),
               ('tfidf', TfidfTransformer(use_idf = True)),
               ('clf', RandomForestClassifier())
               ]
       )
model4 = model4.fit(df_train['tweet_text'], df_train['adr_mention'])
predicted = model4.predict(df_test['tweet_text'])



In [12]:
# testing code using Bryan's code
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
f1 = f1_score(df_test['adr_mention'],predicted)
precision = precision_score(df_test['adr_mention'],predicted)
recall = recall_score(df_test['adr_mention'],predicted)
accuracy = accuracy_score(df_test['adr_mention'],predicted)
print('F1: {}\nPrecision: {}\nRecall: {}\nAccuracy: {}'.format(round(f1,3), round(precision,3), round(recall,3), round(accuracy,3)))

F1: 0.094
Precision: 0.857
Recall: 0.049
Accuracy: 0.909


# Danny: