In [43]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

import drugs
from drugs import DRUGS_DIC, DRUGS_LIST
import adverse
from adverse import ADVERSE_DIC, PATIENT_FRIENDLY_DIC

In [44]:
def build_svm_pipeline(class_weight='balanced'):
    """Build the count-vector -> TF-IDF -> linear SVM text classification pipeline.

    Parameters
    ----------
    class_weight : dict, 'balanced' or None
        Forwarded unchanged to ``SVC(class_weight=...)``. ``None`` is SVC's
        own default, i.e. the same as not passing the argument at all.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Unfitted pipeline with the steps 'vect', 'tfidf' and 'clfr'.
    """
    return Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clfr', SVC(kernel='linear', class_weight=class_weight))
        ])

#F1 recorded for each class_weight setting tried so far:
#   'balanced' (baseline, used below)  -> 0.515
#   None (SVC default class_weight)    -> 0.37
#   {0:.8, 1:.2}                       -> 0
#   {0:.2, 1:.8}                       -> 0.502
SVM_clfr = build_svm_pipeline(class_weight='balanced')

In [45]:
#load the tab-separated training & test splits (files Danny added to GH repo)
common_train, common_test = (
    pd.read_csv(split_path, sep='\t')
    for split_path in ('train-split.tsv', 'test-split.tsv')
)
# common_whole = common_train.append(common_test) #just for EDA purposes

In [46]:
#load both drug-name lists: the one manually identified in train/test Tweets and the longer one
drug_list, longer_drug_list = (
    pd.read_csv(csv_path)
    for csv_path in ('Bryans_drug_list.csv', 'final_drugs.csv')
)

In [47]:
#funct to flag whether a tweet contains at least one known drug name
def flag_drug(tweet_text, drug_names=None):
    """Return 1 if any whitespace-separated token of the tweet is a known drug name, else 0.

    Parameters
    ----------
    tweet_text : str
        Raw tweet text. It is lower-cased and split on whitespace before
        matching, so a token only matches when it equals a drug name exactly
        (attached punctuation such as "humira," prevents a match).
        A value without string methods (such as a missing value) yields 0
        instead of raising.
    drug_names : iterable of str, optional
        Names to match against. They are compared as-is with the lower-cased
        tokens, so they should already be lower case. Defaults to the 'Lower'
        column of the notebook-level ``longer_drug_list``.

    Returns
    -------
    int
        1 when at least one token matches, otherwise 0.
    """
    try:
        tokens = tweet_text.lower().split()
    except AttributeError:
        # not a string (e.g. a missing value): nothing to match
        return 0

    if drug_names is None:
        drug_names = longer_drug_list['Lower'].values
    # one set build per call replaces a full array scan per token
    known_drugs = set(drug_names)
    return int(any(token in known_drugs for token in tokens))

In [48]:
# #funct to iterate through tweet_text & set boolean to True if drug name present, can also return the drug name
# def flag_drug(tweet_text):
#     tweet_text_lower = tweet_text.lower()
#     split_tweet_text = tweet_text_lower.split()
    
#     drug_flag = 0
#     for i in split_tweet_text:
#         if i in drug_list['Lower'].values:
#             drug_flag = 1
#     return drug_flag

In [52]:
#add a drug_flag col (output of flag_drug on tweet_text) to both train & test
for tweet_frame in (common_train, common_test):
    tweet_frame['drug_flag'] = tweet_frame['tweet_text'].apply(flag_drug)
# common_whole['drug_flag'] = common_whole['tweet_text'].apply(flag_drug)

In [56]:
#using longer drug list
#funct to count how many tokens of a tweet are known drug names
def drug_count(tweet_text, drug_names=None):
    """Count the whitespace-separated tokens of the tweet that are known drug names.

    Every matching token counts, so a drug mentioned twice contributes 2.

    Parameters
    ----------
    tweet_text : str
        Raw tweet text. It is lower-cased and split on whitespace before
        matching; a token only matches when it equals a drug name exactly.
        A value without string methods (such as a missing value) yields 0
        instead of raising.
    drug_names : iterable of str, optional
        Names to match against. They are compared as-is with the lower-cased
        tokens, so they should already be lower case. Defaults to the 'Lower'
        column of the notebook-level ``longer_drug_list``.

    Returns
    -------
    int
        Number of matching tokens (0 when there are none).
    """
    try:
        tokens = tweet_text.lower().split()
    except AttributeError:
        # not a string (e.g. a missing value): nothing to count
        return 0

    if drug_names is None:
        drug_names = longer_drug_list['Lower'].values
    # one set build per call replaces a full array scan per token
    known_drugs = set(drug_names)
    return sum(1 for token in tokens if token in known_drugs)

In [53]:
# #funct to iterate through tweet_text & count number of drug mentions
# def drug_count(tweet_text):
#     tweet_text_lower = tweet_text.lower()
#     split_tweet_text = tweet_text_lower.split()
    
#     drug_count = 0
#     for i in split_tweet_text:
#         if i in drug_list['Lower'].values:
#             drug_count += 1
#     return drug_count

In [57]:
#add a drug_count col (output of drug_count on tweet_text) to both train & test
for tweet_frame in (common_train, common_test):
    tweet_frame['drug_count'] = tweet_frame['tweet_text'].apply(drug_count)
# common_whole['drug_count'] = common_whole['tweet_text'].apply(drug_count)

In [58]:
#load both ADR lists (utf-8 encoded): the one manually identified in train/test Tweets and the longer one
adr_list, longer_adr_list = (
    pd.read_csv(csv_path, encoding='utf-8')
    for csv_path in ('Bryans_ADR_list.csv', 'final_adverse.csv')
)

In [97]:
# #funct to iterate through tweet_text & see if ADR present
# def find_ADR(tweet_text):
#     tweet_text_lower = tweet_text.lower()

#     adr_flag = 0
#     for i in adr_list['Lower'].values:
#         if i in tweet_text_lower:
#             adr_flag = 1      
#     return adr_flag

In [61]:
# using longer adverse list
#funct to flag whether a tweet contains at least one known ADR term
def find_ADR(tweet_text, adr_terms=None):
    """Return 1 if any known ADR term occurs as a substring of the tweet, else 0.

    Matching is case-insensitive and by substring, so multi-word terms work
    but a term also matches inside a longer word.

    Parameters
    ----------
    tweet_text : str
        Raw tweet text. A value without string methods (such as a missing
        value) yields 0 instead of raising.
    adr_terms : iterable of str, optional
        ADR terms to look for. Defaults to the 'adverse' column of the
        notebook-level ``longer_adr_list``. Entries without string methods
        (such as missing values) and empty strings are skipped.

    Returns
    -------
    int
        1 when at least one term is found, otherwise 0.
    """
    try:
        tweet_text_lower = tweet_text.lower()
    except AttributeError:
        # not a string (e.g. a missing value): nothing to search
        return 0

    if adr_terms is None:
        adr_terms = longer_adr_list['adverse'].values

    for term in adr_terms:
        try:
            # the tweet is lower-cased, so the term must be too or it can never match
            term_lower = term.lower()
        except AttributeError:
            # non-string entry; `float in str` would raise TypeError
            continue
        # an empty term is a substring of every string, so it must not count
        if term_lower and term_lower in tweet_text_lower:
            return 1
    return 0

In [62]:
#add an adr_flag col (output of find_ADR on tweet_text) to both train & test
for tweet_frame in (common_train, common_test):
    tweet_frame['adr_flag'] = tweet_frame['tweet_text'].apply(find_ADR)
# common_whole['adr_flag'] = common_whole['tweet_text'].apply(find_ADR)

In [63]:
# #used to generate .corr heatmap
# tweet_subset = common_whole.loc[:,['drug_flag', 'drug_count', 'adr_flag', 'adr_mention']]
# import seaborn as sns
# tweet_subset_matrix = tweet_subset.corr()
# sns.heatmap(tweet_subset_matrix, annot=True);

In [65]:
#fit the pipeline on the training tweets (X) and their adr_mention labels (y)
train_texts = common_train['tweet_text']
train_labels = common_train['adr_mention']
SVM_clfr = SVM_clfr.fit(train_texts, train_labels)
# SVM_clfr = SVM_clfr.fit(common_train[['tweet_text','tweet_id']],common_train['adr_mention']) #this throws a shape/# of samples error

In [66]:
#predict labels for the tweets of the test split (common_test)
predicted = SVM_clfr.predict(common_test['tweet_text'])

In [67]:
#store the model's predictions next to the test tweets in a predicted_label col
common_test['predicted_label'] = predicted

In [68]:
#score the test-split predictions against the true adr_mention labels
#(the metric functions are imported with the other sklearn imports at the top of the notebook)
true_labels = common_test['adr_mention']
f1 = f1_score(true_labels, predicted)
precision = precision_score(true_labels, predicted)
recall = recall_score(true_labels, predicted)
accuracy = accuracy_score(true_labels, predicted)

In [69]:
#report the four metrics, each rounded to 3 decimal places
report_template = 'F1: {}\nPrecision: {}\nRecall: {}\nAccuracy: {}'
rounded_scores = [round(score, 3) for score in (f1, precision, recall, accuracy)]
print(report_template.format(*rounded_scores))

F1: 0.515
Precision: 0.431
Recall: 0.639
Accuracy: 0.886
