<a href="https://colab.research.google.com/github/9keshu/Depression-Analysis/blob/main/basicModels.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Imports: data handling, text cleaning, feature extraction and the baseline classifiers
import re

import pandas as pd
import xgboost
from sklearn import decomposition, ensemble
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

In [None]:
# Load the labelled training set from the working directory; later cells read
# its `tweet` and `label` columns.
train = pd.read_csv('trainCSV.csv')


In [None]:
# Load the held-out test set from the working directory; later cells read its
# `tweet` and `label` columns.
test = pd.read_csv('testCSV.csv')

In [None]:
# Preview the first 10 rows of the training frame as loaded (labels are still strings).
train.head(10)

Unnamed: 0,id,tweet,label
0,1.0,Feeling a bit depressedI've been in a big low ...,depression
1,2.0,Was going to hang myself but didn't have guts ...,depression
2,3.0,Have you ever maintained a poor friendship jus...,depression
3,4.0,I haven't felt positive feelings in a long tim...,depression
4,5.0,Partners of those who suffer from depression; ...,depression
5,6.0,I'm worthless.I've gotten whinier and weaker a...,depression
6,7.0,What's the best way to say 'Goodbye'?There's a...,depression
7,8.0,I just want to fucking end itI have nothing el...,depression
8,9.0,Help with Nightmares/night terrors?Its bad eno...,depression
9,10.0,"My depression has lasted over a month, and it ...",depression


In [None]:
# Class names in id order: a label's integer id is its position in this list.
labels = ['depression', 'Non-Depressed']


def label_encode(val):
    """Map a class name to its integer id.

    val: one of the strings stored in ``labels``.

    return: the position of ``val`` in ``labels`` (0 for 'depression',
            1 for 'Non-Depressed').

    Raises ValueError when ``val`` is not present in ``labels``.
    """
    class_id = labels.index(val)
    return class_id

In [None]:
#Label Encoding

In [None]:
# Discard every training row that has a missing value in any column, so the
# later label encoding and text cleaning never receive NaN.
train = train.dropna(axis=0)

In [None]:
# Replace the string class names with their integer ids (see label_encode).
# Not idempotent: running this cell a second time feeds integers to
# label_encode, which raises ValueError because they are not in `labels`.
train.label = train.label.apply(label_encode)

In [None]:
# Preview the first 10 rows again to confirm `label` now holds integer ids.
train.head(10)

Unnamed: 0,id,tweet,label
0,1.0,Feeling a bit depressedI've been in a big low ...,0
1,2.0,Was going to hang myself but didn't have guts ...,0
2,3.0,Have you ever maintained a poor friendship jus...,0
3,4.0,I haven't felt positive feelings in a long tim...,0
4,5.0,Partners of those who suffer from depression; ...,0
5,6.0,I'm worthless.I've gotten whinier and weaker a...,0
6,7.0,What's the best way to say 'Goodbye'?There's a...,0
7,8.0,I just want to fucking end itI have nothing el...,0
8,9.0,Help with Nightmares/night terrors?Its bad eno...,0
9,10.0,"My depression has lasted over a month, and it ...",0


In [None]:
#Cleaning training and test data

In [None]:
# Renumber the rows 0..n-1 after dropna removed some; drop=True discards the
# old index instead of keeping it as a column.
train = train.reset_index(drop=True)
# Raw strings are used so that `\[`, `\]` and `\|` reach the regex engine as
# written instead of being flagged as invalid string escape sequences.
# Punctuation that acts as a token separator and is replaced by a space.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Any character other than a lowercase letter, digit, space, '#', '+' or '_'
# is deleted outright.
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
# Words removed by clean_text. The list is empty, so no stop-word filtering
# takes place until entries are added here.
STOPWORDS = []

def clean_text(text):
    """
        Normalise one piece of raw text for vectorisation.

        text: a string

        return: the lowercased string in which separator punctuation has been
                turned into spaces, every other disallowed symbol has been
                removed, words listed in STOPWORDS have been dropped and runs
                of whitespace have been collapsed to single spaces.
                Digits are kept by this function.
    """
    text = text.lower()
    # Separators become spaces first so that "a/b" yields two tokens, not "ab".
    text = REPLACE_BY_SPACE_RE.sub(' ', text)
    text = BAD_SYMBOLS_RE.sub('', text)
    # split() + join() also trims the ends and collapses repeated whitespace.
    return ' '.join(word for word in text.split() if word not in STOPWORDS)
# Normalise every training tweet, then remove the digits that clean_text keeps.
train.tweet = train.tweet.apply(clean_text)
# regex=True must be explicit: pandas >= 2.0 treats the pattern as a literal
# string by default, which would leave all digits in place.
train.tweet = train.tweet.str.replace(r'\d+', '', regex=True)

In [None]:
#Preparing Test Data

In [None]:
# Discard every test row that has a missing value in any column, mirroring
# the treatment of the training frame.
test = test.dropna(axis=0)

In [None]:

# Put the test frame through the same steps as the training frame:
# integer labels, a fresh 0..n-1 index, cleaned text, digits removed.
test.label = test.label.apply(label_encode)
test = test.reset_index(drop=True)
test.tweet = test.tweet.apply(clean_text)
# regex=True must be explicit: pandas >= 2.0 treats the pattern as a literal
# string by default, which would leave all digits in place.
test.tweet = test.tweet.str.replace(r'\d+', '', regex=True)

In [None]:
# Spot-check 10 randomly chosen cleaned tweets; the draw is unseeded, so the
# rows shown differ from run to run.
train.tweet.sample(10)

20      being aloneas i sit here typing all this just ...
3099    hi r baseball have you ever caught a foulball ...
1911    i just want to talk to someone about thishey r...
3510    following the coup detat and installation of a...
1246    i dont know what to dodisclaimer not necessari...
2259    struggling immensely with anxiety and obsessiv...
4475    ok so im a highschool bowler ive improved alot...
3346    im just trying to play a nascar game and this ...
778     its been a long  yearsim turning  next week im...
2963    hey guys played my first game yesterday and my...
Name: tweet, dtype: object

In [None]:
#Building Model

In [None]:

def train_model(classifier, feature_vector_train, label,  feature_vector_valid, valid_y,test_data , test_label ,is_neural_net=False):
    """Fit ``classifier`` and score it on the test set.

    classifier: estimator exposing ``fit(X, y)`` and ``predict(X)``.
    feature_vector_train, label: training features and their targets.
    feature_vector_valid: validation features; they are run through
        ``predict`` but the result is not scored.
    valid_y: validation targets; currently unused.
    test_data, test_label: test features and their true labels.
    is_neural_net: when True, ``predict`` output is reduced to class ids
        with ``argmax`` over the last axis.

    Prints the test F1 score and returns the test accuracy.
    NOTE(review): f1_score is called with its defaults; with this notebook's
    encoding that presumably scores label 1 ('Non-Depressed') - confirm that
    is the intended positive class.
    """
    def _as_class_ids(raw_output):
        # Neural-net output is reduced to the index of the largest value on the last axis.
        return raw_output.argmax(axis=-1) if is_neural_net else raw_output

    classifier.fit(feature_vector_train, label)

    # Validation predictions are still produced (as before) but deliberately
    # not scored; only the test metrics below are reported.
    _as_class_ids(classifier.predict(feature_vector_valid))

    test_predictions = _as_class_ids(classifier.predict(test_data))
    print("f1 score: ",f1_score(test_label,test_predictions))

    return metrics.accuracy_score(test_label,test_predictions)

In [None]:
#1.Splitting the Data into Train and validation

In [None]:

# Hold out 30% of the cleaned training frame for validation.
# The seed is pinned so that Restart & Run All reproduces the same partition
# (and therefore the same downstream scores).
SPLIT_SEED = 42
train_x, valid_x, train_y, valid_y = train_test_split(
    train['tweet'], train['label'], test_size=0.30, random_state=SPLIT_SEED
)

In [None]:
#2. Applying word-level TF-IDF and n-gram (bi-gram/tri-gram) TF-IDF

In [None]:
# Word-level TF-IDF: vocabulary and IDF weights are learned once on the whole
# cleaned training corpus and then applied unchanged to every split.
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(train['tweet'])
xtrain_tfidf =  tfidf_vect.transform(train_x)
xvalid_tfidf =  tfidf_vect.transform(valid_x)
test_tfidf   =  tfidf_vect.transform(test['tweet'])

# N-gram-level TF-IDF; ngram_range=(2,3) means bigrams and trigrams.
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram.fit(train['tweet'])
# Only transform() here, and only with the n-gram vectorizer: re-fitting on a
# split (fit_transform) would give that split its own vocabulary, so column j
# would stand for a different n-gram in the train, validation and test
# matrices and the classifiers would be scored on unrelated features.
xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(train_x)
xvalid_tfidf_ngram =  tfidf_vect_ngram.transform(valid_x)
test_tfidf_ngram   =  tfidf_vect_ngram.transform(test['tweet'])

In [None]:
#Naive Bayes Model

In [None]:
# Multinomial Naive Bayes, scored once per feature space (word-level first,
# then n-gram-level). A fresh estimator is built for each run.
for report_tag, fit_matrix, valid_matrix, test_matrix in (
    ("NB, WordLevel TF-IDF: ", xtrain_tfidf, xvalid_tfidf, test_tfidf),
    ("NB, Bi-Gram Vectors: ", xtrain_tfidf_ngram, xvalid_tfidf_ngram, test_tfidf_ngram),
):
    accuracy = train_model(naive_bayes.MultinomialNB(), fit_matrix, train_y, valid_matrix, valid_y, test_matrix, test['label'])
    print(report_tag, accuracy)

f1 score:  0.8994145822245876
NB, WordLevel TF-IDF:  0.9055
f1 score:  0.5112474437627812
NB, Bi-Gram Vectors:  0.522


In [None]:
#Linear Classifier

In [None]:
# Logistic regression, scored once per feature space (word-level first, then
# n-gram-level). A fresh estimator is built for each run.
for report_tag, fit_matrix, valid_matrix, test_matrix in (
    ("LR, WordLevel TF-IDF: ", xtrain_tfidf, xvalid_tfidf, test_tfidf),
    ("LR, Bi-Gram Vectors: ", xtrain_tfidf_ngram, xvalid_tfidf_ngram, test_tfidf_ngram),
):
    accuracy = train_model(linear_model.LogisticRegression(), fit_matrix, train_y, valid_matrix, valid_y, test_matrix, test['label'])
    print(report_tag, accuracy)

f1 score:  0.9114914425427874
LR, WordLevel TF-IDF:  0.9095
f1 score:  0.6756874727193365
LR, Bi-Gram Vectors:  0.6285


In [None]:
#Bagging Model

In [None]:

# Random forest, scored once per feature space (word-level first, then
# n-gram-level). The forest is randomised, so random_state is pinned to make
# the reported scores reproducible across runs.
for report_tag, fit_matrix, valid_matrix, test_matrix in (
    ("RF, WordLevel TF-IDF: ", xtrain_tfidf, xvalid_tfidf, test_tfidf),
    ("RF, Bi-gram TF-IDF: ", xtrain_tfidf_ngram, xvalid_tfidf_ngram, test_tfidf_ngram),
):
    accuracy = train_model(ensemble.RandomForestClassifier(random_state=42), fit_matrix, train_y, valid_matrix, valid_y, test_matrix, test['label'])
    print(report_tag, accuracy)

f1 score:  0.9199388067312596
RF, WordLevel TF-IDF:  0.9215
f1 score:  0.5659276546091014
RF, Bi-gram TF-IDF:  0.628


In [None]:
#Boosting Model

In [None]:
# Extreme Gradient Boosting, scored once per feature space (word-level first,
# then n-gram-level). Each matrix is converted to CSC right before it is
# handed to train_model, and a fresh estimator is built for each run.
for report_tag, fit_matrix, valid_matrix, test_matrix in (
    ("Xgb, WordLevel TF-IDF: ", xtrain_tfidf, xvalid_tfidf, test_tfidf),
    ("Xgb, Bi-gram TF-IDF: ", xtrain_tfidf_ngram, xvalid_tfidf_ngram, test_tfidf_ngram),
):
    accuracy = train_model(xgboost.XGBClassifier(), fit_matrix.tocsc(), train_y, valid_matrix.tocsc(), valid_y, test_matrix.tocsc(), test['label'])
    print(report_tag, accuracy)

f1 score:  0.9118511263467189
Xgb, WordLevel TF-IDF:  0.91
f1 score:  0.6816745655608214
Xgb, Bi-gram TF-IDF:  0.597


In [None]:
#SVM Model

In [None]:
# Support vector classifier, scored once per feature space (word-level first,
# then n-gram-level). Each matrix is converted to CSC right before it is
# handed to train_model, and a fresh estimator is built for each run.
for report_tag, fit_matrix, valid_matrix, test_matrix in (
    ("SVM, WordLevel TF-IDF: ", xtrain_tfidf, xvalid_tfidf, test_tfidf),
    ("SVM, Bi-gram TF-IDF: ", xtrain_tfidf_ngram, xvalid_tfidf_ngram, test_tfidf_ngram),
):
    accuracy = train_model(svm.SVC(), fit_matrix.tocsc(), train_y, valid_matrix.tocsc(), valid_y, test_matrix.tocsc(), test['label'])
    print(report_tag, accuracy)

f1 score:  0.9226219812715624
SVM, WordLevel TF-IDF:  0.9215
f1 score:  0.666366095581605
SVM, Bi-gram TF-IDF:  0.63
