<a href="https://colab.research.google.com/github/ANTTAY001/EEE4022-Final-Year-Project/blob/main/TF_IDF%2BFeatures.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Utility Functions**

In [5]:
import pandas as pd
import nltk
from nltk.corpus import movie_reviews, stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.preprocessing import Normalizer, StandardScaler, RobustScaler, MinMaxScaler, MaxAbsScaler
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
import pandas as pd
import numpy as np
import json

#Loading additional helper functions not shown here but provided in the folder
#!cp /content\drive/My Drive\4th year - 2020/Final Year Project - Personal\18 10 2020_Method/Classification_Models\nlp_utils_news.ipynb .
#!python nlp_utils_news.ipynb
#!cp "/content/drive/My Drive/4th year - 2020/Final Year Project - Personal/18 10 2020_Method/Classification_Models/nlp_utils_news.py" .
#from nlp_utils_news import *


'''Features'''
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import label_binarize

'''Classifiers'''
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn import svm
from sklearn.linear_model import LogisticRegression

'''Metrics/Evaluation'''
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc, confusion_matrix
from scipy import interp
from itertools import cycle

'''Plotting'''
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')

'''Display'''
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))
import warnings
warnings.filterwarnings('ignore')
pd.options.display.float_format = '{:,.2f}'.format

**Preprocess data**

In [7]:
def preprocess_text(text):
    # Tokenise words while ignoring punctuation
    tokeniser = RegexpTokenizer(r'\w+')
    tokens = tokeniser.tokenize(text)
    
    # Lowercase and lemmatise 
    lemmatiser = WordNetLemmatizer()
    lemmas = [lemmatiser.lemmatize(token.lower(), pos='v') for token in tokens]
    
    # Remove stop words
    keywords= [lemma for lemma in lemmas if lemma not in stopwords.words('english')]
    return keywords

In [18]:
from timeit import default_timer as timer

def get_accuracy(x_train,y_train,model_dict):

  model_name,clf_score_means,clf_score_std,time_elapsed = [],[],[],[]
  for k,v in model_dict.items(): 
    model_name.append(k)
    clf = v
    start = timer()
    clf_score = cross_val_score(clf, x_train, y_train, cv=10)
    end = timer()
    clf_score_means.append(clf_score.mean())
    clf_score_std.append(clf_score.std() * 2)
    time_elapsed.append(end - start) 

    model_comparison_df = pd.DataFrame([model_name, clf_score_means, clf_score_std,time_elapsed]).T
    model_comparison_df.columns = ['model_name', 'accuracy_score', 'std','time_elapsed']
    model_comparison_df = model_comparison_df.sort_values(by='accuracy_score', ascending=False)
    #print(k)
    #print(end - start) 
  return model_comparison_df


**k-fold cross-validation performance**

In [16]:
model_dict = { 'SVM' : svm.SVC(decision_function_shape='ovo'), # SVM for multi-class classification
              'Random Forest': RandomForestClassifier(random_state=123),
              'K Nearest Neighbors': KNeighborsClassifier(),
              'Logistic Regression' : LogisticRegression(multi_class='ovr'),
              'Dummy' : DummyClassifier(random_state=3) #sklearn’s dummy classifier which is just random chance as a baseline
              }

# **Splitting the data: train and validation sets**

In [12]:
# READ IN TRAINING DATA
ds = pd.read_csv("/content/drive/My Drive/4th year - 2020/Final Year Project - Personal/29 10 2020/Training_Data/Merged_NEW.csv") 
da = ds.sample(frac=1) #shuffle the order of the data
da.to_csv('/content/drive/My Drive/4th year - 2020/Final Year Project - Personal/29 10 2020/Training_Data/Merged_NEW_SHUFFLED.csv')

# 80/20 split
# TRAINING
df = pd.read_csv('/content/drive/My Drive/4th year - 2020/Final Year Project - Personal/29 10 2020/Training_Data/Merged_NEW_SHUFFLED.csv')

# Encode classes with numeric labels
label_num_dict = {'Sepsis': 0,
                  'Myocardial Infarcation': 1,
                  'Cardiac Arrest': 2}

df['disease'] = df['disease'].replace(label_num_dict)

# TRAINING 
df_train = df

# TESTING/VALIDATION
df_test = pd.read_csv('/content/drive/My Drive/4th year - 2020/Final Year Project - Personal/29 10 2020/Testing_Data/Merged_VALIDATION_NEW.csv')
df_test['disease'] = df_test['disease'].replace(label_num_dict)
df_test = df_test.sample(frac = 1)



In [13]:
X_train = df_train['description']
X_test = df_test['description']
y_train = df_train['disease']
y_test = df_test['disease']

train_example = X_train[:1] # for report example

print(train_example)
print(f'Train dimensions: {X_train.shape, y_train.shape}')
print(f'Test dimensions: {X_test.shape, y_test.shape}')
# Check out target distribution
print(y_train.value_counts())
print(y_test.value_counts())

0    i ask an ambulance\nmy moms not responsive and...
Name: description, dtype: object
Train dimensions: ((1077,), (1077,))
Test dimensions: ((20,), (20,))
2    378
0    364
1    335
Name: disease, dtype: int64
2    8
0    7
1    5
Name: disease, dtype: int64


# **Featurization**

In [9]:
# Install Magnitude on Google Colab
! echo "Installing Magnitude.... (please wait, can take a while)"
! (curl https://raw.githubusercontent.com/plasticityai/magnitude/master/install-colab.sh | /bin/bash 1>/dev/null 2>/dev/null)
! echo "Done installing Magnitude."

Installing Magnitude.... (please wait, can take a while)
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   137  100   137    0     0    617      0 --:--:-- --:--:-- --:--:--   617
Done installing Magnitude.


In [10]:
from nltk import word_tokenize
from nltk.corpus import wordnet 
import re
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import numpy as np
import multiprocessing
import pandas as pd
import os
from tqdm import tqdm_notebook
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from numpy import hstack
from scipy import sparse
import string
from sklearn.preprocessing import Normalizer, StandardScaler, RobustScaler, MinMaxScaler, MaxAbsScaler
from pymagnitude import *
from nltk.corpus import stopwords 

############### Text Preprocessing ###############

def preprocess_text(text):
    # Tokenise words while ignoring punctuation
    tokeniser = RegexpTokenizer(r'\w+')
    tokens = tokeniser.tokenize(text)
    
    # Lowercase and lemmatise 
    lemmatiser = WordNetLemmatizer()
    lemmas = [lemmatiser.lemmatize(token.lower(), pos='v') for token in tokens]
    
    # Remove stop words
    keywords= [lemma for lemma in lemmas if lemma not in stopwords.words('english')]
    return keywords

############### Features functions ###############

def gastrintestinal_symptoms(df):
    gas_descrips = ['vomiting','diarrhea','nausea','nauseous','indigestion','heartburn','constipated']
    gas_symps = []
    vals = []

    for j in range(len(gas_descrips)): 
     gas_symps.append(preprocess_text(gas_descrips[j])[0]) # Find the lemma of each description

    for i in range(len(gas_symps)):
      syns = wordnet.synsets(gas_symps[i]) # Finds synonyms of descriptions
      for n in range(len(syns)):
        gas_descrips.append(syns[n].lemmas()[0].name())
      
    for description in df.description.values:
     
      if any(word in description for word in gas_descrips):
        vals.append(1)
      else:
        vals.append(0)
    return np.array(vals).reshape(-1,1)

def mental_status_symptoms(df):
    men_descrips = ['unconscious','unresponsive', 'confused','disorientated','stroke','delirium']
    men_symps = []
    vals = []

    for j in range(len(men_descrips)): 
     men_symps.append(preprocess_text(men_descrips[j])[0]) # Find the lemma of each description

    for i in range(len(men_symps)):
      syns = wordnet.synsets(men_symps[i]) # Finds synonyms of descriptions
      for n in range(len(syns)):
        men_descrips.append(syns[n].lemmas()[0].name())

    for description in df.description.values:
     
      if any(word in description for word in men_descrips):
        vals.append(1)
      else:
        vals.append(0)
    return np.array(vals).reshape(-1,1)

def mobility_probs(df):
  weak_legs_descrips_tris = ['not walk', 'unable to stand', 'unable to walk', 'unable to move', 'is lying down', 'need assistance standing','need assistance walking'] 
  weak_legs_descrips_single = ['bedridden','laying']
  res,weak_legs_symps,vals = [],[],[]

  # Finding synonyms of single word descriptors
  for j in range(len(weak_legs_descrips_single)): # finding synonyms for single word descriptors
    weak_legs_symps.append(preprocess_text(weak_legs_descrips_single[j])[0]) # Find the lemma of each description

  for i in range(len(weak_legs_symps)):
    syns = wordnet.synsets(weak_legs_symps[i]) # Find synonyms of descriptions
    for n in range(len(syns)):
      weak_legs_descrips_tris.append(syns[n].lemmas()[0].name()) # adding it to list of tri-gram descriptors

  for description in df.description.values:
    # Checking for single word matches  
    if any(word in description for word in weak_legs_descrips_tris):
      vals.append(1)

    else:
      vals.append(0)
 
  return np.array(vals).reshape(-1,1)


def malaise_symptoms(df):
  mal_descrips = ['sick','ill', 'bad','deteriorated']
  mal_symps = []
  vals = []

  for j in range(len(mal_descrips)): 
     mal_symps.append(preprocess_text(mal_descrips[j])[0]) # Find the lemma of each description

  for i in range(len(mal_symps)):
    syns = wordnet.synsets(mal_symps[i]) # Finds synonyms of descriptions
    for n in range(len(syns)):
      mal_descrips.append(syns[n].lemmas()[0].name())

  for description in df.description.values:
    if any(word in description for word in mal_descrips):
      vals.append(1)
    else:
      vals.append(0)
  return np.array(vals).reshape(-1,1)

def heart_related_pain(df):
  hrp_descrips = ['heart attack', 'heart']
  hrp_symps = []
  vals = []

  for j in range(len(hrp_descrips)): 
     hrp_symps.append(preprocess_text(hrp_descrips[j])[0]) # Find the lemma of each description

  for i in range(len(hrp_symps)):
    syns = wordnet.synsets(hrp_symps[i]) # Finds synonyms of descriptions
    for n in range(len(syns)):
      hrp_descrips.append(syns[n].lemmas()[0].name())

  for description in df.description.values:
    if any(word in description for word in hrp_descrips):
      vals.append(1)
    else:
      vals.append(0)
  return np.array(vals).reshape(-1,1)

def chest_related_no_pain(df):
  ch_descrips = ['pain', 'chest']
  ch_symps = []
  vals= []

  for j in range(len(ch_descrips)): 
     ch_symps.append(preprocess_text(ch_descrips[j])[0]) # Find the lemma of each description

  for i in range(len(ch_symps)):
    syns = wordnet.synsets(ch_symps[i]) # Finds synonyms of descriptions
    for n in range(len(syns)):
      ch_descrips.append(syns[n].lemmas()[0].name())

  for description in df.description.values:
    if any(word in description for word in ch_descrips):
      vals.append(1)
    else:
      vals.append(0)
  return np.array(vals).reshape(-1,1)


def difficult_breathing(df):
  br_descrips = ['not breathing','not breathing','stopped breathing','no breathe','struggling to breathe']
  br_symps,vals = [],[]

  for description in df.description.values:
    if any(word in description for word in br_descrips):  
     vals.append(1)
    else:
     vals.append(0)
  
  return np.array(vals).reshape(-1,1)


def featurize(train_df, test_df):

  print('Gastrointestinal_symptoms...')
  train_gastrintestinal_symptoms = gastrintestinal_symptoms(train_df)
  test_gastrintestinal_symptoms = gastrintestinal_symptoms(test_df)
  
  print('Mental_status_symptoms...')
  train_mental_status_symptoms =  mental_status_symptoms(train_df)
  test_mental_status_symptoms =  mental_status_symptoms(test_df)


  print('Malaise_symptoms...')
  train_malaise_symptoms =malaise_symptoms(train_df)
  test_malaise_symptoms = malaise_symptoms(test_df)

  print('Heart_related_pain...')
  train_heart_related_pain =heart_related_pain(train_df)
  test_heart_related_pain =heart_related_pain(test_df)

  print('Chest_related_no_pain...')
  train_chest_related_no_pain = chest_related_no_pain(train_df)
  test_chest_related_no_pain = chest_related_no_pain(test_df)

  print('Mobility_probs...')
  train_mobility_probs = mobility_probs(train_df)
  test_mobility_probs= mobility_probs(test_df)

  print('Difficult_breathing...')
  train_difficult_breathing = difficult_breathing(train_df)
  test_difficult_breathing= difficult_breathing(test_df)

  print('Tfidf...')
  tfidf_word = TfidfVectorizer(analyzer=preprocess_text)

  train_word_features = tfidf_word.fit_transform(train_df.description.values)
  test_word_features = tfidf_word.transform(test_df.description.values)

  normalizer_tfidf = MinMaxScaler()
  train_embedding_features = sparse.csr_matrix(normalizer_tfidf.fit_transform(train_word_features.todense()))

  test_embedding_features = sparse.csr_matrix(normalizer_tfidf.fit_transform(test_word_features.todense()))
  
  ##############################################################################
  train_features = hstack((train_gastrintestinal_symptoms,
                            train_mental_status_symptoms,
                            train_malaise_symptoms,
                           train_heart_related_pain,
                           train_chest_related_no_pain,
                            train_mobility_probs,
                            train_difficult_breathing,
                            
                            ))
 
  normalizer = MinMaxScaler()
  train_features = normalizer.fit_transform(train_features)
  
  train_features = sparse.csr_matrix(train_features)

  train_features = sparse.hstack((
                          train_features,
                          train_embedding_features
                          ))
  ##############################################################################

  test_features = hstack((test_gastrintestinal_symptoms,
                            test_mental_status_symptoms,
                            test_malaise_symptoms,
                         test_heart_related_pain,
                           test_chest_related_no_pain,
                            test_mobility_probs,
                            test_difficult_breathing,
                            
                            ))
  
  test_features = normalizer.transform(test_features)
  
  test_features = sparse.csr_matrix(test_features)

  test_features = sparse.hstack((
                             test_features,
                             test_embedding_features
                            ))
 
  ##############################################################################

  feature_names = ['gastrintestinal_symptoms','mental_status_symptoms','malaise_symptoms','heart_related_pain','chest_related_no_pain','mobility problems','difficulty breathing']
  feature_names = feature_names + ['tfidf_word_' + col for col in tfidf_word.get_feature_names()] 

  return train_features, test_features, feature_names

# **Get features for TF-IDF + custom features**

In [14]:
train_features, test_features, feature_names = featurize(df_train, df_test)


Gastrointestinal_symptoms...
Mental_status_symptoms...
Malaise_symptoms...
Heart_related_pain...
Chest_related_no_pain...
Mobility_probs...
Difficult_breathing...
Tfidf...


# **K-fold cross validation performance**

In [20]:
features_tfidf = get_accuracy(train_features,y_train,model_dict)
print(features_tfidf)

            model_name accuracy_score  std time_elapsed
0                  SVM           1.00 0.00         4.72
1        Random Forest           1.00 0.00         3.00
2  K Nearest Neighbors           1.00 0.00         0.14
3  Logistic Regression           1.00 0.00         0.55
4                Dummy           0.33 0.08         0.02


# **Testing on unseen data**

In [21]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

for name,model in model_dict.items():
  classifier = model
  classifier.fit(train_features, y_train) 
  y_pred = classifier.predict(test_features)

  print(name)
  #print(confusion_matrix(y_test,y_pred))
 # print(classification_report(y_test,y_pred))
  print(accuracy_score(y_test, y_pred))

SVM
0.85
Random Forest
0.95
K Nearest Neighbors
0.55
Logistic Regression
0.9
Dummy
0.45


# **GridSearchCV**

**SVM**

In [25]:
sgd_clf2 = svm.SVC(decision_function_shape='ovo')

svm_params = {'C' : [10**(x) for x in range(-1,4)],
             'kernel' : ['poly', 'rbf', 'linear'],
              'degree' : [2, 3]
            }

search_log_reg = GridSearchCV(estimator=sgd_clf2, param_grid=svm_params, cv=10)
search_log_reg.fit(train_features, y_train)  
print(search_log_reg.best_params_)
print("Cross-validation:")
print(search_log_reg.best_score_)

classifier = search_log_reg.best_estimator_
start = timer()
classifier.fit(train_features, y_train) 
y_pred = classifier.predict(test_features)
end = timer()
print("Unseen:")
print(end - start)
print("Accuracy:  %0.2f "% accuracy_score(y_test, y_pred))
#print(confusion_matrix(y_test,y_pred))
#print(classification_report(y_test,y_pred))


{'C': 0.1, 'degree': 2, 'kernel': 'poly'}
Cross-validation:
1.0
Unseen:
0.5785553799996705
Accuracy:  0.85 


**KNN**

In [26]:
sgd_clf2 = KNeighborsClassifier(n_jobs = -1)
knn_params = { 'n_neighbors' : [3, 5, 7, 9, 15, 31], 
               'weights' : ['uniform', 'distance']
}


search = GridSearchCV(estimator=sgd_clf2, param_grid=knn_params, cv=10)
search.fit(train_features, y_train)  #test_features, y_test)
print(search.best_params_)

print("Cross-validation:")
print(search.best_score_)

classifier = search.best_estimator_
start = timer()
classifier.fit(train_features, y_train) 
y_pred = classifier.predict(test_features)
end = timer()
print("Unseen:")
print(end - start)
print("Accuracy:  %0.2f "% accuracy_score(y_test, y_pred))
#print(confusion_matrix(y_test,y_pred))
#print(classification_report(y_test,y_pred))



{'n_neighbors': 3, 'weights': 'uniform'}
Cross-validation:
1.0
Unseen:
0.10612198399940098
Accuracy:  0.55 


**Random Forest**

In [27]:
from timeit import default_timer as timer
sgd_clf2 = RandomForestClassifier(n_jobs = -1)
rf_params = { 'n_estimators' : [10, 100, 250, 500, 1000], 
               'max_depth' : [None, 3, 7, 15],
               'min_samples_split' : [2, 5, 15,30]
}

search = GridSearchCV(estimator=sgd_clf2, param_grid=rf_params, cv=10)
search.fit(train_features, y_train)  #test_features, y_test)
print(search.best_params_)

print("Cross-validation:")
print(search.best_score_)

classifier = search.best_estimator_
start = timer()
classifier.fit(train_features, y_train) 
y_pred = classifier.predict(test_features)
end = timer()
print("Unseen:")
print(end - start)
print("Accuracy:  %0.2f "% accuracy_score(y_test, y_pred))
#print(confusion_matrix(y_test,y_pred))
#print(classification_report(y_test,y_pred))


{'max_depth': None, 'min_samples_split': 2, 'n_estimators': 10}
Cross-validation:
1.0
Unseen:
0.21049348700034898
Accuracy:  0.80 


**Logistic Regression**

In [28]:
sgd_clf2 = LogisticRegression()

rf_params= {'penalty' : ['l1', 'l2'],
    'C' : np.logspace(-4, 4, 20),
    'solver' : ['liblinear']}

search = GridSearchCV(estimator=sgd_clf2, param_grid=rf_params, cv=10)
search.fit(train_features, y_train)  #test_features, y_test)
print(search.best_params_)

print("Cross-validation:")
print(search.best_score_)

classifier = search.best_estimator_
start = timer()
classifier.fit(train_features, y_train) 
y_pred = classifier.predict(test_features)
end = timer()
print("Unseen:")
print(end - start)
print("Accuracy:  %0.2f "% accuracy_score(y_test, y_pred))
#print(confusion_matrix(y_test,y_pred))
#print(classification_report(y_test,y_pred))


{'C': 0.012742749857031334, 'penalty': 'l2', 'solver': 'liblinear'}
Cross-validation:
1.0
Unseen:
0.012052637999659055
Accuracy:  0.90 


# **Feature Weights/Importance**

In [None]:
!pip install eli5

In [31]:
from sklearn.linear_model import SGDClassifier
import eli5

# Train a Log Reg Classifier
log_reg = SGDClassifier(loss = 'log', n_jobs = -1, alpha = 5e-2)
log_reg.fit(train_features, y_train)

SGDClassifier(alpha=0.05, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='log', max_iter=1000,
              n_iter_no_change=5, n_jobs=-1, penalty='l2', power_t=0.5,
              random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

In [32]:
#Pass the model instance along with the feature names to ELI5
eli5.show_weights(log_reg, feature_names = feature_names, top = 100)

Weight?,Feature,Unnamed: 2_level_0
Weight?,Feature,Unnamed: 2_level_1
Weight?,Feature,Unnamed: 2_level_2
+0.437,tfidf_word_need,
+0.270,tfidf_word_doctor,
+0.257,tfidf_word_ja,
+0.253,mental_status_symptoms,
+0.236,tfidf_word_well,
+0.213,tfidf_word_umm,
+0.213,tfidf_word_walk,
+0.172,tfidf_word_mother,
+0.171,tfidf_word_anything,
+0.161,tfidf_word_pressure,

Weight?,Feature
+0.437,tfidf_word_need
+0.270,tfidf_word_doctor
+0.257,tfidf_word_ja
+0.253,mental_status_symptoms
+0.236,tfidf_word_well
+0.213,tfidf_word_umm
+0.213,tfidf_word_walk
+0.172,tfidf_word_mother
+0.171,tfidf_word_anything
+0.161,tfidf_word_pressure

Weight?,Feature
+0.586,heart_related_pain
+0.384,tfidf_word_chest
+0.375,tfidf_word_ya
+0.341,tfidf_word_heart
+0.231,tfidf_word_street
+0.215,tfidf_word_attack
+0.215,tfidf_word_nber
+0.204,chest_related_no_pain
+0.179,tfidf_word_code
+0.170,tfidf_word_ill

Weight?,Feature
+0.458,tfidf_word_uhm
+0.304,tfidf_word_man
+0.282,tfidf_word_oky
+0.228,tfidf_word_lie
+0.223,tfidf_word_still
+0.216,tfidf_word_say
+0.191,tfidf_word_c
+0.188,tfidf_word_nothing
+0.185,tfidf_word_n
+0.179,tfidf_word_police


# **Feature Selection - SelectKBest**

In [33]:
from sklearn.feature_selection import SelectKBest
#Lets run a for loop to select the value of K that optimizes F1

f1_scores = []
clf3=SGDClassifier(random_state=3, loss='log')
for k in tqdm_notebook(range(1, train_features.shape[1])):
    selector = SelectKBest(k = k)
    train_features_selected = selector.fit_transform(train_features, y_train)
    test_features_selected = selector.transform(test_features)
    clf_mean = cross_val_score(clf3, train_features_selected, y_train, cv=10)
    f1_scores.append(clf_mean.mean())
f1_scores = np.array(f1_scores)

HBox(children=(FloatProgress(value=0.0, max=2033.0), HTML(value='')))




**Number of best performing features**

In [34]:
new_k = np.argmax(f1_scores) + 1
print(new_k)

112


# **k-fold cross-validation on selected features**

In [35]:
get_accuracy(train_features_selected, y_train, model_dict)

Unnamed: 0,model_name,accuracy_score,std,time_elapsed
0,SVM,1.0,0.0,4.78
1,Random Forest,1.0,0.0,3.16
2,K Nearest Neighbors,1.0,0.0,0.13
3,Logistic Regression,1.0,0.0,0.56
4,Dummy,0.33,0.08,0.02


# **Performance of selected features on unseen data**

In [36]:
selector2 = SelectKBest(k = new_k)
train_features_selected = selector2.fit_transform(train_features, y_train)
test_features_selected = selector2.transform(test_features)
#get_accuracy(train_features_selected, y_train, model_dict)
#get_accuracy(test_features_selected, y_test, model_dict)

for name,model in model_dict.items():
  classifier = model
  classifier.fit(train_features_selected, y_train) 
  y_pred = classifier.predict(test_features_selected)

  print(name)
  #print(confusion_matrix(y_test,y_pred))
 # print(classification_report(y_test,y_pred))
  print(accuracy_score(y_test, y_pred))


SVM
0.9
Random Forest
0.85
K Nearest Neighbors
0.75
Logistic Regression
0.9
Dummy
0.45


In [37]:
np.array(feature_names)[selector2.get_support()]

array(['gastrintestinal_symptoms', 'mental_status_symptoms',
       'malaise_symptoms', 'heart_related_pain', 'chest_related_no_pain',
       'tfidf_word_aid', 'tfidf_word_alphen', 'tfidf_word_also',
       'tfidf_word_answer', 'tfidf_word_anything', 'tfidf_word_arrange',
       'tfidf_word_attack', 'tfidf_word_beg', 'tfidf_word_body',
       'tfidf_word_bonitas', 'tfidf_word_brother', 'tfidf_word_c',
       'tfidf_word_cannot', 'tfidf_word_case', 'tfidf_word_chest',
       'tfidf_word_closest', 'tfidf_word_code', 'tfidf_word_company',
       'tfidf_word_condition', 'tfidf_word_corner', 'tfidf_word_dead',
       'tfidf_word_dementia', 'tfidf_word_dis', 'tfidf_word_doctor',
       'tfidf_word_eat', 'tfidf_word_everything', 'tfidf_word_father',
       'tfidf_word_feel', 'tfidf_word_female', 'tfidf_word_fund',
       'tfidf_word_get', 'tfidf_word_give', 'tfidf_word_gonna',
       'tfidf_word_h', 'tfidf_word_half', 'tfidf_word_happen',
       'tfidf_word_heart', 'tfidf_word_hes', 'tfidf_wo