In [1]:
# import some helping libraries
import pandas as pd                         
import numpy as np

In [2]:
# reading the data
df1 = pd.read_csv("depression_dataset.csv")
df1

Unnamed: 0,clean_text,is_depression
0,we understand that most people who reply immed...,1
1,welcome to r depression s check in post a plac...,1
2,anyone else instead of sleeping more when depr...,1
3,i ve kind of stuffed around a lot in my life d...,1
4,sleep is my greatest and most comforting escap...,1
...,...,...
7726,is that snow,0
7727,moulin rouge mad me cry once again,0
7728,trying to shout but can t find people on the list,0
7729,ughh can t find my red sox hat got ta wear thi...,0


In [3]:
df1= df1.rename(columns={"clean_text": "text", "is_depression": "label"})

In [4]:
# reading the data
df2 = pd.read_csv("unbalanced_depression.csv")

In [5]:
df2 = df2.drop(columns=["Unnamed: 0"])

In [6]:
df2 = df2.drop(columns=["Unnamed: 0.1"])

In [7]:
df2 = df2.rename(columns={"message": "text", "label": "label"})

In [8]:
data = [df1,df2]

In [9]:
df = pd.concat(data)
df.head()

Unnamed: 0,text,label
0,we understand that most people who reply immed...,1
1,welcome to r depression s check in post a plac...,1
2,anyone else instead of sleeping more when depr...,1
3,i ve kind of stuffed around a lot in my life d...,1
4,sleep is my greatest and most comforting escap...,1


In [10]:
# drop duplicates
df.drop_duplicates(inplace=True)

In [11]:
# counting records for each class
df["label"].value_counts() 

0    11885
1     6048
Name: label, dtype: int64

In [12]:
import nltk
import re
import string
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize

In [13]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [14]:
# count the length of the text to compare after cleaning
df['length'] = df['text'].apply(lambda x: len(x))
df.head()

Unnamed: 0,text,label,length
0,we understand that most people who reply immed...,1,4535
1,welcome to r depression s check in post a plac...,1,2394
2,anyone else instead of sleeping more when depr...,1,236
3,i ve kind of stuffed around a lot in my life d...,1,510
4,sleep is my greatest and most comforting escap...,1,281


In [15]:
# Converting text to lower case
def convert_to_lower_remove_links_hashtags(text):
    temp = text.lower()
    temp = re.sub("@[A-Za-z0-9_]+","", temp)
    temp = re.sub("#[A-Za-z0-9_]+","", temp)
    temp = re.sub(r'http\S+', '', temp)
    return temp

df['text'] = df['text'].apply(lambda x: convert_to_lower_remove_links_hashtags(x))

In [16]:
#Removing numbers and other numeric values function
def remove_numbers(text):
    number_pattern = r'\d+'
    without_number = re.sub(pattern=number_pattern, repl=" ", string=text)
    return without_number

df['text'] = df['text'].apply(lambda x: remove_numbers(x))

In [17]:
# Removing punctuations
def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

df['text'] = df['text'].apply(lambda x: remove_punctuation(x))

In [18]:
# Removing stop words
from nltk.corpus import stopwords

def remove_stopwords(text):
    removed = []
    stop_words = list(stopwords.words("english"))
    tokens = word_tokenize(text)
    for i in range(len(tokens)):
        if tokens[i] not in stop_words:
            removed.append(tokens[i])
    return " ".join(removed)

df['text'] = df['text'].apply(lambda x: remove_stopwords(x))

In [19]:
# Remove extra white spaces
def remove_extra_white_spaces(text):
    single_char_pattern = r'\s+[a-zA-Z]\s+'
    without_sc = re.sub(pattern=single_char_pattern, repl=" ", string=text)
    return without_sc

df['text'] = df['text'].apply(lambda x: remove_extra_white_spaces(x))

In [20]:
# Lemmatizing function
def lemmatizing(text):
    lemmatizer = WordNetLemmatizer()
    tokens = word_tokenize(text)
    for i in range(len(tokens)):
        lemma_word = lemmatizer.lemmatize(tokens[i])
        tokens[i] = lemma_word
    return " ".join(tokens)

df['text'] = df['text'].apply(lambda x: lemmatizing(x))

In [21]:
# now compare after cleaning 
df['length_after_cleaning'] = df['text'].apply(lambda x: len(x))
df.head()

Unnamed: 0,text,label,length,length_after_cleaning
0,understand people reply immediately op invitat...,1,4535,3047
1,welcome depression check post place take momen...,1,2394,1549
2,anyone else instead sleeping depressed stay ni...,1,236,153
3,kind stuffed around lot life delaying inevitab...,1,510,296
4,sleep greatest comforting escape whenever wake...,1,281,162


In [22]:
from sklearn.model_selection import train_test_split

In [23]:
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label'], test_size=0.3)

In [24]:
# test data
a = pd.DataFrame(X_test)
b = pd.DataFrame(y_test)
test_data = pd.concat([a, b], ignore_index=True, axis=1)
test_data= test_data.rename(columns = {0:'text', 1:'label'})
test_data

Unnamed: 0,text,label
4901,going toetally clean house bathroom kitchen be...,0
707,today wa late high dchool accident decided cha...,1
2328,depressed depression think lot live life actua...,1
2093,later work money money money make funny,0
949,byl,0
...,...,...
4511,followsavvy never found everytime click twitte...,0
8333,exam tomorrow ill go school even though im bit...,1
602,going bed got ta lot energy play little man pooch,0
5697,thank much following u one first couple follow...,0


In [25]:
from collections import Counter
from imblearn.over_sampling import SMOTE

In [26]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import make_pipeline
from sklearn import metrics

## Count vectorizer

In [27]:
'''
vectorizer = CountVectorizer( )
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)
'''

'\nvectorizer = CountVectorizer( )\nX_train_vect = vectorizer.fit_transform(X_train)\nX_test_vect = vectorizer.transform(X_test)\n'

## TF-IDF vectorizer

In [28]:
vectorizer = TfidfVectorizer( )
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)

## Data balancing

In [29]:
sm = SMOTE()
x_train_sm, y_train_sm = sm.fit_resample(X_train_vect, y_train)
x_test_sm, y_test_sm = sm.fit_resample(X_test_vect, y_test)

In [30]:
from sklearn import model_selection
from sklearn.model_selection import KFold, cross_val_score

k_folds = model_selection.KFold(n_splits=10)

## Model building

# SVM

In [31]:
#"""                                                  #####################GOOOOOOOOOOOOOOOOOOOOOD
clf = SGDClassifier()
clf.fit(x_train_sm,y_train_sm)
#"""

# NaiveBayes

In [32]:
"""
clf = MultinomialNB()
clf.fit(x_train_sm,y_train_sm)
"""

'\nclf = MultinomialNB()\nclf.fit(x_train_sm,y_train_sm)\n'

# K-NN

In [33]:
"""
clf = KNeighborsClassifier()
clf.fit(x_train_sm,y_train_sm)
"""

'\nclf = KNeighborsClassifier()\nclf.fit(x_train_sm,y_train_sm)\n'

In [34]:
y_pred = clf.predict(x_test_sm)

In [35]:
#'''
def print_report(clf, x_test, y_test):
    y_pred = clf.predict(x_test_sm)
    report = metrics.classification_report(y_test_sm, y_pred)
    print(report)
    print("accuracy: {:0.3f}".format(metrics.accuracy_score(y_test_sm, y_pred)))

print_report(clf, X_test , y_test)
#'''

              precision    recall  f1-score   support

           0       0.88      0.97      0.92      3560
           1       0.96      0.87      0.92      3560

    accuracy                           0.92      7120
   macro avg       0.92      0.92      0.92      7120
weighted avg       0.92      0.92      0.92      7120

accuracy: 0.920


# tried but not for text 

In [36]:
"""
clf = DecisionTreeClassifier()
clf.fit(x_train_sm,y_train_sm)
"""

'\nclf = DecisionTreeClassifier()\nclf.fit(x_train_sm,y_train_sm)\n'

In [37]:
"""
clf = LogisticRegression()
clf.fit(x_train_sm,y_train_sm)
"""

'\nclf = LogisticRegression()\nclf.fit(x_train_sm,y_train_sm)\n'

In [38]:
"""
clf = RandomForestClassifier()
clf.fit(x_train_sm,y_train_sm)
"""

'\nclf = RandomForestClassifier()\nclf.fit(x_train_sm,y_train_sm)\n'

## Saving Vectorizer

In [39]:
#'''
import pickle
pickle_out = open("TFidfvectorizer.pkl","wb")
pickle.dump(vectorizer, pickle_out)
pickle_out.close()
#'''

In [40]:
'''
import pickle
pickle_out = open("CountVectorizer.pkl","wb")
pickle.dump(vectorizer, pickle_out)
pickle_out.close()
'''

'\nimport pickle\npickle_out = open("CountVectorizer.pkl","wb")\npickle.dump(vectorizer, pickle_out)\npickle_out.close()\n'

## Model saving

In [41]:
'''
import pickle
pickle_out = open("SGDClassifierSVM.pkl","wb")
pickle.dump(clf, pickle_out)
pickle_out.close()
'''

'\nimport pickle\npickle_out = open("SGDClassifierSVM.pkl","wb")\npickle.dump(clf, pickle_out)\npickle_out.close()\n'

## Testing the model

In [42]:
text = " i am depressed"
text = vectorizer.transform([text])
prediction = clf.predict(text)
prediction[0]

1