# <font size=6px><center> SENTIMENT ANALYSIS FOR DEPRESSION DETECTION </center></font>

# DATA EXTRACTION

In [1]:
import tweepy
from tweepy.streaming import StreamListener
from tweepy import OAuthHandler
from tweepy import Stream

In [2]:
import api_keys
import json

In [3]:
auth=OAuthHandler(api_keys.consumer_key,api_keys.consumer_secret)
auth.set_access_token(api_keys.access_token,api_keys.access_secret)
api=tweepy.API(auth)

In [4]:
class std_listener(StreamListener):
    def on_data(self, data):
        with open('tweets_file.txt','a') as fp:
            fp.write(data)
            
        return True

    def on_error(self, status):
        print(status)

In [7]:
var=std_listener()
stream=Stream(auth,var)
stream.filter(track=['I\'m fine','I want to die','No one cares','I am sad','anxiety','suicide','I hate my life','I want to end my life','I feel sad','tired','helpless','hopeless','aching','lost','worthless','useless','stupid','stuck','adrift','hurting','alone','unsure','insecure','despair','I should','stressed'],languages=['en'])

KeyboardInterrupt: 

# PRE-PROCESSING

In [1]:
import pandas as pd
import json
import re
import string
import unicodedata
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

In [2]:
raw_tweets_text=[]

def get_data(url):
    with open(url) as fp:
        for line in fp:
                if(line!="\n"):
                    tweet_as_dict=json.loads(line)
                    raw_tweets_text.append(tweet_as_dict['text'])
    convert_to_csv()

def convert_to_csv():    
    df=pd.DataFrame(raw_tweets_text)
    df.to_csv('tweets.csv')

In [3]:
get_data('tweets_file.txt')

In [4]:
d = pd.read_csv("tweets.csv")
d = d.drop("Unnamed: 0",axis=1)
d = d.rename(columns={"0":"Text"})
d.drop_duplicates(inplace=True)
d = d.reset_index(drop=True)
d.to_csv('tweets.csv')
d.head(10)

Unnamed: 0,Text
0,RT @sculpturejay: 🐆should we talk about i-land...
1,RT @abetokhi: Yes suicide is haram but making\...
2,"RT @geezybeatz145__: I am stressed, I am depre..."
3,RT @MillsReggie: always learn how to be strong...
4,@CATARllNA Should I add another one then? 🤔
5,RT @evalution_music: If you wanna see me at ba...
6,This is the underside of my world.Of course yo...
7,RT @empresslexiii: I dont care if you are work...
8,RT @Bryan62784488: Within the New York State P...
9,I'm literally tired


In [5]:
d.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6941 entries, 0 to 6940
Data columns (total 1 columns):
Text    6941 non-null object
dtypes: object(1)
memory usage: 54.4+ KB


In [6]:
def process_data(df):
    text_emoji = re.compile('[\U00010000-\U0010ffff]', flags=re.UNICODE)
    text_links = re.compile('httpstco[a-zA-Z0-9]+')
    text_rts = re.compile('RT [A-Za-z0-9]+')
    text_schar = re.compile('[^A-Za-z0-9 ]+')
    text_nums = re.compile('[0-9]+')
    
    for i in range(len(df)):
        txt=df["Text"].iloc[i]
        txt=text_emoji.sub(r'',txt)
        temp=txt.translate(txt.maketrans('','',string.punctuation))
        temp=text_links.sub(r'',temp)
        temp=text_rts.sub(r'',temp)        
        temp=text_schar.sub(r'',temp)
        temp=text_nums.sub(r'',temp)
        temp = unicodedata.normalize('NFKD',temp).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        df["Text"].iloc[i] = temp.lower()
    return df   

In [7]:
df = process_data(d.copy())
df.head(10)

Unnamed: 0,Text
0,should we talk about ilandthere are surprisin...
1,yes suicide is haram but makingsomeones life ...
2,i am stressed i am depressed here we go again...
3,always learn how to be strong alone
4,catarllna should i add another one then
5,if you wanna see me at bass canyon or lost la...
6,this is the underside of my worldof course you...
7,i dont care if you are working i dont care if...
8,within the new york state park police alone ...
9,im literally tired


In [8]:
def further_ops(df):
    for i in range(len(df)):
        tweet = df["Text"].iloc[i]
        token = word_tokenize(tweet)
        token_no_sw = [word for word in token if word not in stopwords.words()]
        lemmatizer = WordNetLemmatizer()
        lemm_token = [lemmatizer.lemmatize(word) for word in token_no_sw]
        df["Text"].iloc[i] = " ".join(lemm_token)
    return df    

In [9]:
dft = further_ops(df.copy())
dft.head(10)

Unnamed: 0,Text
0,talk ilandthere surprisingly lot people like i...
1,yes suicide haram makingsomeones life miserabl...
2,stressed depressed go need somebody talk
3,always learn strong alone
4,catarllna add another
5,wan see bass canyon lost land fill survey thin...
6,underside worldof course dont stupid bles iiiii
7,dont working dont mall brain filled poison edg
8,within new york state park police alone office...
9,literally tired


In [10]:
index = dft[dft["Text"]==""].index
dft.drop(index,inplace=True)
d = dft.reset_index(drop="True")

In [11]:
d.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6920 entries, 0 to 6919
Data columns (total 1 columns):
Text    6920 non-null object
dtypes: object(1)
memory usage: 54.2+ KB


In [12]:
d.to_csv('processed_tweets.csv')

# FEATURE EXTRACTION

In [1]:
import pandas as pd
from textblob import TextBlob

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
d = pd.read_csv("processed_tweets.csv")
d = d.drop(["Unnamed: 0"],axis=1)
d.head(10)

Unnamed: 0,Text
0,talk ilandthere surprisingly lot people like i...
1,yes suicide haram makingsomeones life miserabl...
2,stressed depressed go need somebody talk
3,always learn strong alone
4,catarllna add another
5,see bass canyon lost land fill survey thingy v...
6,underside worldof course stupid bles iiiii
7,working mall brain filled poison edg
8,within new york state park police alone office...
9,I literally tired


In [3]:
d.drop_duplicates(inplace=True)

In [4]:
d.isna().sum()

Text    0
dtype: int64

In [5]:
d.dropna(inplace=True)

In [6]:
d.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6735 entries, 0 to 6918
Data columns (total 1 columns):
Text    6735 non-null object
dtypes: object(1)
memory usage: 105.2+ KB


In [7]:
n = len(d)
d["Sentiment"]=[None for i in range(n)]
for i in range(n):
    
    s = TextBlob(d["Text"].iloc[i]).sentiment
    if(s[0]>=0):
        d["Sentiment"].iloc[i] = 0
    else:
        d["Sentiment"].iloc[i] = 1

In [8]:
len(d[d["Sentiment"]==0]), len(d[d["Sentiment"]==1])

(4519, 2216)

In [9]:
no = len(d[d["Sentiment"]==1])

In [10]:
t = d[d["Sentiment"]==0][:no]
s = d[d["Sentiment"]==1]

In [11]:
df = pd.concat([s,t],ignore_index="True")
len(df)

4432

# MODEL DEVELOPMENT AND EVALUATION

# Splitting into Train and Test Data

In [12]:
from sklearn.model_selection import train_test_split

x= df["Text"]
y = df["Sentiment"].astype("int")
x_train,x_test, y_train, y_test = train_test_split(x,y,test_size=0.2,random_state=42)

In [13]:
cv = CountVectorizer(ngram_range=(1,3))
tf = TfidfVectorizer(ngram_range=(1,3))
x1 = cv.fit_transform(x_train)
x2 = tf.fit_transform(x_train)

In [14]:
from sklearn import metrics

accuracy = {}
model = {}
vectorizer = {"CountVectorizer":cv,"TfidfVectorizer":tf}

# Support Vector Machine

### CountVectorizer

In [15]:
from sklearn import svm

classifier=svm.SVC()
classifier.fit(x1,y_train)
y_predict1=classifier.predict(cv.transform(x_test))
k = "Support Vector Machine with CountVectorizer"
model[k]=classifier

In [16]:
b1=metrics.accuracy_score(y_test, y_predict1)
recall = metrics.recall_score(y_test, y_predict1)
accuracy[k] = float("{0:.4f}".format(b1))
print(k)
print("Accuracy: ",accuracy[k])
print("Recall  : {0:.4f}".format(recall))

Support Vector Machine with CountVectorizer
Accuracy:  0.7813
Recall  : 0.6112


### TfidfVectorizer

In [17]:
classifier=svm.SVC()
classifier.fit(x2,y_train)
y_predict2=classifier.predict(tf.transform(x_test))
k = "Support Vector Machine with TfidfVectorizer"
model[k]=classifier

In [18]:
b2=metrics.accuracy_score(y_test, y_predict2)
recall = metrics.recall_score(y_test, y_predict2)
accuracy[k] = float("{0:.4f}".format(b2))
print(k)
print("Accuracy: ",accuracy[k])
print("Recall  : {0:.4f}".format(recall))

Support Vector Machine with TfidfVectorizer
Accuracy:  0.8196
Recall  : 0.7365


# Multinomial Naive Bayes

### Count Vectorizer

In [19]:
from sklearn.naive_bayes import MultinomialNB

nb = MultinomialNB()
nb.fit(x1,y_train)
y_predict3=classifier.predict(cv.transform(x_test))
k = "Multinomial Naive Bayes with CountVectorizer"
model[k]=nb

In [20]:
b3=metrics.accuracy_score(y_test, y_predict3)
recall = metrics.recall_score(y_test, y_predict3)
accuracy[k] = float("{0:.4f}".format(b3))
print(k)
print("Accuracy: ",accuracy[k])
print("Recall  : {0:.4f}".format(recall))

Multinomial Naive Bayes with CountVectorizer
Accuracy:  0.5524
Recall  : 0.1469


### TfidfVectorizer

In [21]:
nb = MultinomialNB()
nb.fit(x2,y_train)
y_predict4=classifier.predict(tf.transform(x_test))
k = "Multinomial Naive Bayes with TfidfVectorizer"
model[k]=nb

In [22]:
b4=metrics.accuracy_score(y_test,y_predict4)
recall = metrics.recall_score(y_test, y_predict4)
accuracy[k] = float("{0:.4f}".format(b4))
print(k)
print("Accuracy: ",accuracy[k])
print("Recall  : {0:.4f}".format(recall))

Multinomial Naive Bayes with TfidfVectorizer
Accuracy:  0.8196
Recall  : 0.7365


# Decision Tree Classifier

### CountVectorizer

In [23]:
from sklearn.tree import DecisionTreeClassifier

dtc = DecisionTreeClassifier()
dtc.fit(x1,y_train)
y_pred1 = dtc.predict(cv.transform(x_test))
k = "Decision Tree Classifier with CountVectorizer"
model[k]=dtc

In [24]:
a1 = metrics.accuracy_score(y_test, y_pred1)
recall = metrics.recall_score(y_test, y_pred1)
accuracy[k] = a1
print(k)
print("Accuracy: {0:.4f}".format(a1))
print("Recall  : {0:.4f}".format(recall))

Decision Tree Classifier with CountVectorizer
Accuracy: 0.9019
Recall  : 0.8985


### TfidfVectorizer

In [25]:
dtc = DecisionTreeClassifier()
dtc.fit(x2,y_train)
y_pred2 = dtc.predict(tf.transform(x_test))
k = "Decision Tree Classifier with TfidfVectorizer"
model[k]=dtc

In [26]:
a2 = metrics.accuracy_score(y_test, y_pred2)
recall = metrics.recall_score(y_test, y_pred2)
accuracy[k] = a2
print(k)
print("Accuracy: {0:.4f}".format(a2))
print("Recall  : {0:.4f}".format(recall))

Decision Tree Classifier with TfidfVectorizer
Accuracy: 0.8918
Recall  : 0.8877


# Random Forest Classifier

### CountVectorizer

In [27]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier()
rfc.fit(x1,y_train)
y_pred3 = rfc.predict(cv.transform(x_test))
k = "Random Forest Classifier with CountVectorizer"
model[k]=rfc

In [28]:
a3 = metrics.accuracy_score(y_test, y_pred3)
recall = metrics.recall_score(y_test, y_pred3)
accuracy[k] = a3
print(k)
print("Accuracy: {0:.4f}".format(a3))
print("Recall  : {0:.4f}".format(recall))

Random Forest Classifier with CountVectorizer
Accuracy: 0.8433
Recall  : 0.7430


### TfidfVectorizer

In [29]:
rfc = RandomForestClassifier()
rfc.fit(x2,y_train)
y_pred4 = rfc.predict(tf.transform(x_test))
k = "Random Forest Classifier with TfidfVectorizer"
model[k]=rfc

In [30]:
a4 = metrics.accuracy_score(y_test, y_pred4)
recall = metrics.recall_score(y_test, y_pred4)
accuracy[k] = a4
print(k)
print("Accuracy: {0:.4f}".format(a4))
print("Recall  : {0:.4f}".format(recall))

Random Forest Classifier with TfidfVectorizer
Accuracy: 0.8422
Recall  : 0.7603


# K Nearest Neighbors

## CountVectorizer

In [31]:
from sklearn.neighbors import KNeighborsClassifier

classifier = KNeighborsClassifier()
classifier.fit(x1,y_train)
Y_pred1 = classifier.predict(cv.transform(x_test))
k = "KNeighborsClassifier with CountVectorizer"
model[k]=classifier

In [32]:
c1 = metrics.accuracy_score(y_test, Y_pred1)
recall = metrics.recall_score(y_test, Y_pred1)
accuracy[k] = c1
print(k)
print("Accuracy: {0:.4f}".format(c1))
print("Recall  : {0:.4f}".format(recall))

KNeighborsClassifier with CountVectorizer
Accuracy: 0.5829
Recall  : 0.2311


## TfidfVectorizer

In [33]:
classifier = KNeighborsClassifier()
classifier.fit(x2,y_train)
Y_pred2 = classifier.predict(tf.transform(x_test))
k = "KNeighborsClassifier with TfidfVectorizer"
model[k]=classifier

In [34]:
c2 = metrics.accuracy_score(y_test, Y_pred2)
recall = metrics.recall_score(y_test, Y_pred2)
accuracy[k] = c2
print(k)
print("Accuracy: {0:.4f}".format(c2))
print("Recall  : {0:.4f}".format(recall))

KNeighborsClassifier with TfidfVectorizer
Accuracy: 0.7294
Recall  : 0.7430


# Logistic Regression

### CountVectorizer

In [35]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
lr.fit(x1,y_train)
Y_pred3 = lr.predict(cv.transform(x_test))
k = "Logistic Regression with CountVectorizer"
model[k]=lr

In [36]:
c3 = metrics.accuracy_score(y_test, Y_pred3)
recall = metrics.recall_score(y_test, Y_pred3)
accuracy[k] = c3
print(k)
print("Accuracy: {0:.4f}".format(c3))
print("Recall  : {0:.4f}".format(recall))

Logistic Regression with CountVectorizer
Accuracy: 0.8286
Recall  : 0.7257


### TfidfVectorizer

In [37]:
lr = LogisticRegression()
lr.fit(x2,y_train)
Y_pred4 = lr.predict(tf.transform(x_test))
k = "Logistic Regression with TfidfVectorizer"
model[k]=lr

In [38]:
c4 = metrics.accuracy_score(y_test, Y_pred4)
recall = metrics.recall_score(y_test, Y_pred4)
accuracy[k] = c4
print(k)
print("Accuracy: {0:.4f}".format(c4))
print("Recall  : {0:.4f}".format(recall))

Logistic Regression with TfidfVectorizer
Accuracy: 0.8253
Recall  : 0.7732


In [39]:
sorted(accuracy)
ad = pd.DataFrame({"Accuracy":accuracy})
ad

Unnamed: 0,Accuracy
Decision Tree Classifier with CountVectorizer,0.901917
Decision Tree Classifier with TfidfVectorizer,0.89177
KNeighborsClassifier with CountVectorizer,0.582864
KNeighborsClassifier with TfidfVectorizer,0.729425
Logistic Regression with CountVectorizer,0.828636
Logistic Regression with TfidfVectorizer,0.825254
Multinomial Naive Bayes with CountVectorizer,0.5524
Multinomial Naive Bayes with TfidfVectorizer,0.8196
Random Forest Classifier with CountVectorizer,0.843292
Random Forest Classifier with TfidfVectorizer,0.842165


In [40]:
m = max(accuracy,key=accuracy.get)
a = accuracy[m]
print("The model with highest accuracy is",m,"with an Accuracy of","{:.2f}".format(a*100),"%")

The model with highest accuracy is Decision Tree Classifier with CountVectorizer with an Accuracy of 90.19 %


In [41]:
import pickle

if("CountVectorizer" in m):
    v = vectorizer["CountVectorizer"]
elif("TfidfVectorizer" in m):
    v = vectorizer["TfidfVectorizer"]

pickle.dump(v,open("vectorizer.pkl","wb"))
pickle.dump(model[m],open("model.pkl","wb"))