# Notebook on the Zindi Mental Health Competition

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import os
import random

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
# Read the competition's training and test tables from the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('Train.csv', 'Test.csv'))

In [4]:
# Summarise the training frame: its dimensions plus the per-column count of
# missing values, then preview the first rows.
print(
    'Train shape:', train.shape,
    'and number of null values are:', train.isna().sum(),
)
train.head(5)

Train shape: (616, 3) and number of null values are: ID       0
text     0
label    0
dtype: int64


Unnamed: 0,ID,text,label
0,SUAVK39Z,I feel that it was better I dieAm happy,Depression
1,9JDAGUV3,Why do I get hallucinations?,Drugs
2,419WR1LQ,I am stresseed due to lack of financial suppor...,Depression
3,6UY7DX6Q,Why is life important?,Suicide
4,FYC0FTFB,How could I be helped to go through the depres...,Depression


In [5]:
# Summarise the test frame: its dimensions plus the per-column count of
# missing values, then preview the first rows.
print(
    'Test shape:', test.shape,
    'and number of null values are:', test.isna().sum(),
)
test.head(5)

Test shape: (309, 2) and number of null values are: ID      0
text    0
dtype: int64


Unnamed: 0,ID,text
0,02V56KMO,How to overcome bad feelings and emotions
1,03BMGTOK,I feel like giving up in life
2,03LZVFM6,I was so depressed feel like got no strength t...
3,0EPULUM5,I feel so low especially since I had no one to...
4,0GM4C5GD,can i be successful when I am a drug addict?


# Let's try a CountVectorizer

In [6]:
# Build a bag-of-words document-term matrix from the training text.
vect = CountVectorizer()
X_train_dtm = vect.fit_transform(train['text'])

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer `get_feature_names_out()` and fall back only on older installs so the
# cell runs on either side of that change.
if hasattr(vect, 'get_feature_names_out'):
    vocab_terms = vect.get_feature_names_out()
else:
    vocab_terms = vect.get_feature_names()

# Dense preview of the matrix: one row per training text, one column per
# vocabulary term.
pd.DataFrame(X_train_dtm.toarray(), columns=vocab_terms)

Unnamed: 0,abandoned,able,about,absent,abuse,academic,academics,accept,add,addict,...,worried,worst,worth,would,wronged,yet,you,young,your,youths
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
611,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
612,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
613,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
614,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# Encode the test text with the vectorizer already fitted on the training text
# (transform only, no refit), so both matrices share the same columns.
X_test_dtm=vect.transform(test['text'])

In [8]:
# factorize() returns a (codes, uniques) pair: y_train[0] holds the integer
# class code of every training row and y_train[1] the label names, numbered in
# order of first appearance in the training labels.
y_train=train['label'].factorize()

In [12]:
# Show the label names; position i in this Index is the name for integer code i.
y_train[1]

Index(['Depression', 'Drugs', 'Suicide', 'Alcohol'], dtype='object')

In [14]:
import xgboost as xgb
from sklearn.model_selection import cross_validate

# Baseline: a default XGBoost classifier on the raw count features, evaluated
# with 3-fold cross-validation on accuracy and negative log-loss. Training-fold
# scores are kept as well so over-fitting is visible.
model = xgb.XGBClassifier()
cv_results = cross_validate(
    estimator=model,
    X=X_train_dtm,
    y=y_train[0],
    cv=3,
    scoring=('accuracy', 'neg_log_loss'),
    return_train_score=True,
)

ModuleNotFoundError: No module named 'xgboost'

In [0]:
# List the metrics recorded by cross_validate (iterating a dict yields its keys).
sorted(cv_results)

['fit_time',
 'score_time',
 'test_accuracy',
 'test_neg_log_loss',
 'train_accuracy',
 'train_neg_log_loss']

In [0]:
# Per-fold negative log-loss on the count features: held-out folds first,
# training folds second.
tuple(cv_results[metric] for metric in ('test_neg_log_loss', 'train_neg_log_loss'))

(array([-0.44084763, -0.5742301 , -0.55187825]),
 array([-0.33764278, -0.27203583, -0.29129758]))

In [0]:
# Per-fold accuracy on the count features: held-out folds first, training
# folds second.
tuple(cv_results[metric] for metric in ('test_accuracy', 'train_accuracy'))

(array([0.85436893, 0.80487805, 0.8097561 ]),
 array([0.89512195, 0.9026764 , 0.90024331]))

# Now let us try TF-IDF

In [0]:
# Fit the TF-IDF vectorizer on the training text only, then reuse that fitted
# vectorizer to encode the test text so both matrices share the same columns.
tf_vect = TfidfVectorizer()
X_train_tdtm = tf_vect.fit_transform(train['text'])
X_test_tdtm = tf_vect.transform(test['text'])

In [0]:
# Same evaluation protocol as the count-feature baseline, now on the TF-IDF
# features: 3-fold CV, accuracy and negative log-loss, train scores retained.
model1 = xgb.XGBClassifier()
cv_results_tf = cross_validate(
    estimator=model1,
    X=X_train_tdtm,
    y=y_train[0],
    cv=3,
    scoring=('accuracy', 'neg_log_loss'),
    return_train_score=True,
)

In [0]:
# Per-fold negative log-loss on the TF-IDF features: held-out folds first,
# training folds second.
tuple(cv_results_tf[metric] for metric in ('test_neg_log_loss', 'train_neg_log_loss'))

(array([-0.48974735, -0.60569255, -0.56573606]),
 array([-0.26534189, -0.20991587, -0.22979633]))

In [0]:
# Per-fold accuracy on the TF-IDF features: held-out folds first, training
# folds second.
tuple(cv_results_tf[metric] for metric in ('test_accuracy', 'train_accuracy'))

(array([0.82038835, 0.8097561 , 0.79512195]),
 array([0.92439024, 0.94890511, 0.94403893]))

# Now let us combine the TF-IDF and CountVectorizer features

In [0]:
import scipy.sparse

# Place the count and TF-IDF features side by side (columns of the first
# matrix, then columns of the second). The output format is pinned to CSR:
# hstack's default is left to SciPy (COO on older releases), and COO cannot be
# row-indexed, which is what cross-validation fold splitting needs.
X_train = scipy.sparse.hstack([X_train_dtm, X_train_tdtm], format='csr')

# Same evaluation protocol as the single-feature-set runs: 3-fold CV, accuracy
# and negative log-loss, train scores retained.
model_comb = xgb.XGBClassifier()
cv_results_comb = cross_validate(
    model_comb,
    X_train,
    y_train[0],
    cv=3,
    scoring=('accuracy', 'neg_log_loss'),
    return_train_score=True,
)

In [0]:
# Per-fold accuracy on the combined features: held-out folds first, training
# folds second.
tuple(cv_results_comb[metric] for metric in ('test_accuracy', 'train_accuracy'))

(array([0.83009709, 0.8097561 , 0.79512195]),
 array([0.92439024, 0.94890511, 0.94647202]))

In [0]:
# Per-fold negative log-loss on the combined features: held-out folds first,
# training folds second.
tuple(cv_results_comb[metric] for metric in ('test_neg_log_loss', 'train_neg_log_loss'))

(array([-0.49207947, -0.60488936, -0.55899634]),
 array([-0.26643764, -0.20981581, -0.22932442]))

# Let's make a prediction

In [0]:
# Load the sample submission file to use as the template that the predicted
# probabilities are written into, and preview its first rows.
sub = pd.read_csv('SampleSubmission.csv')
sub.head(5)

Unnamed: 0,ID,Depression,Alcohol,Suicide,Drugs
0,02V56KMO,0,0,0,0
1,03BMGTOK,0,0,0,0
2,03LZVFM6,0,0,0,0
3,0EPULUM5,0,0,0,0
4,0GM4C5GD,0,0,0,0


In [0]:
# Assemble the test features in the same column order as the training matrix
# (count features first, TF-IDF second). The output format is pinned to CSR
# rather than left to SciPy's default (COO on older releases), so the matrix
# supports row indexing.
X_test = scipy.sparse.hstack([X_test_dtm, X_test_tdtm], format='csr')

In [0]:
# Refit the combined-feature model on every training row (fit returns the
# estimator itself), then compute per-class probabilities for the test rows.
preds = model_comb.fit(X_train, y_train[0]).predict_proba(X_test)

In [0]:
# Sanity check: expect one row per test text and one column per class.
preds.shape

(309, 4)

In [0]:
# Write each class's probability column into the matching submission column.
# The model was trained on the integer codes from factorize(), so column i of
# `preds` belongs to code i and y_train[1][i] is that code's label name.
# Deriving the mapping from y_train[1] avoids hard-coded indices, which would
# silently mislabel columns if the order of labels in the training file changed.
label_names = list(y_train[1])
assert preds.shape[1] == len(label_names), \
    'expected one probability column per label'
unknown_labels = [name for name in label_names if name not in sub.columns]
assert not unknown_labels, \
    'labels missing from the submission template: %s' % unknown_labels

for class_code, class_name in enumerate(label_names):
    sub[class_name] = preds[:, class_code]

In [0]:
# Preview the filled-in submission to check the probabilities look sensible.
sub.head()

Unnamed: 0,ID,Depression,Alcohol,Suicide,Drugs
0,02V56KMO,0.520719,0.146882,0.261715,0.070684
1,03BMGTOK,0.974787,0.005327,0.016541,0.003345
2,03LZVFM6,0.985065,0.00575,0.006193,0.002992
3,0EPULUM5,0.845713,0.039992,0.087932,0.026362
4,0GM4C5GD,0.411991,0.080869,0.039919,0.467221


In [0]:
# Write the submission file; index=False keeps the DataFrame index out of the CSV.
sub.to_csv('Sub_CountVect_TFIdf.csv',index=False)