In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string
import nltk
import warnings
%matplotlib inline
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv("D:\SentimentAnalysys\good_tweets_combined_2.csv")
df.head()

Unnamed: 0,ID,Tweet,target
0,1,Be happy. Be confident. Be kind.\n\n #Kissable...,0
1,2,@queenjlouise @mamaw_gereck awe thanks ðŸ˜Š,0
2,3,@Ithika Loving it! ðŸ˜‚,0
3,4,@mir_btc Eherium should make everybody happy,0
4,5,Shitty is the worst feeling ever #depressed #a...,1


In [3]:
df[df['target']==0].head()

Unnamed: 0,ID,Tweet,target
0,1,Be happy. Be confident. Be kind.\n\n #Kissable...,0
1,2,@queenjlouise @mamaw_gereck awe thanks ðŸ˜Š,0
2,3,@Ithika Loving it! ðŸ˜‚,0
3,4,@mir_btc Eherium should make everybody happy,0
5,6,"I didn't know, that there existed such sheer a...",0


In [4]:
df[df['target']==1].head()

Unnamed: 0,ID,Tweet,target
4,5,Shitty is the worst feeling ever #depressed #a...,1
27,28,better care about yourself than about others b...,1
50,51,when you try your best but you don't suceed!!!...,1
65,66,It's #AmazonPrimeDay! And I'm #broke. So in re...,1
77,78,Mentally suffered #iwanttodie #worthless #life...,1


In [5]:
#preprocessing


#url is not done yet

def remove_pattern(input_txt, pattern):
    r = re.findall(pattern, input_txt)
    for word in r:
        input_txt = re.sub(word, "", input_txt)
    return input_txt

In [6]:
#remove twitter handles (@user)
#vectorize means pass each row to the function and return
df['clean_tweet'] = np.vectorize(remove_pattern)(df['Tweet'], "@[\w]*")

In [7]:
df.head()

Unnamed: 0,ID,Tweet,target,clean_tweet
0,1,Be happy. Be confident. Be kind.\n\n #Kissable...,0,Be happy. Be confident. Be kind.\n\n #Kissable...
1,2,@queenjlouise @mamaw_gereck awe thanks ðŸ˜Š,0,awe thanks ðŸ˜Š
2,3,@Ithika Loving it! ðŸ˜‚,0,Loving it! ðŸ˜‚
3,4,@mir_btc Eherium should make everybody happy,0,Eherium should make everybody happy
4,5,Shitty is the worst feeling ever #depressed #a...,1,Shitty is the worst feeling ever #depressed #a...


In [8]:
#remove special characters, numbers and punctuations
df['clean_tweet'] = df['clean_tweet'].str.replace("[^a-zA-Z#]", " ")
df.head()

Unnamed: 0,ID,Tweet,target,clean_tweet
0,1,Be happy. Be confident. Be kind.\n\n #Kissable...,0,Be happy Be confident Be kind n n #Kissable...
1,2,@queenjlouise @mamaw_gereck awe thanks ðŸ˜Š,0,awe thanks
2,3,@Ithika Loving it! ðŸ˜‚,0,Loving it
3,4,@mir_btc Eherium should make everybody happy,0,Eherium should make everybody happy
4,5,Shitty is the worst feeling ever #depressed #a...,1,Shitty is the worst feeling ever #depressed #a...


In [9]:
#remove short words
#if length is more than 3 then it will be added to string otherwise it will be ignored
df['clean_tweet'] = df['clean_tweet'].apply(lambda x: " ".join([w for w in x.split() if len(w)>3]))
df.head()

Unnamed: 0,ID,Tweet,target,clean_tweet
0,1,Be happy. Be confident. Be kind.\n\n #Kissable...,0,happy confident kind #KissablesLoveSMShopmag n...
1,2,@queenjlouise @mamaw_gereck awe thanks ðŸ˜Š,0,thanks
2,3,@Ithika Loving it! ðŸ˜‚,0,Loving
3,4,@mir_btc Eherium should make everybody happy,0,Eherium should make everybody happy
4,5,Shitty is the worst feeling ever #depressed #a...,1,Shitty worst feeling ever #depressed #anxiety


In [10]:
#tokenization (individual word called tokens)
tokenized_tweet = df['clean_tweet'].apply(lambda x: x.split())
tokenized_tweet.head()

0    [happy, confident, kind, #KissablesLoveSMShopm...
1                                             [thanks]
2                                             [Loving]
3            [Eherium, should, make, everybody, happy]
4    [Shitty, worst, feeling, ever, #depressed, #an...
Name: clean_tweet, dtype: object

In [11]:
#stem the words
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

tokenized_tweet = tokenized_tweet.apply(lambda sentence: [stemmer.stem(word) for word in sentence])
tokenized_tweet.head()

0    [happi, confid, kind, #kissableslovesmshopmag,...
1                                              [thank]
2                                               [love]
3            [eherium, should, make, everybodi, happi]
4      [shitti, worst, feel, ever, #depress, #anxieti]
Name: clean_tweet, dtype: object

In [12]:
#combine all words into sentence
for i in range(len(tokenized_tweet)):
    tokenized_tweet[i] = " ".join(tokenized_tweet[i])
    
df['clean_tweet'] = tokenized_tweet
df.head()

Unnamed: 0,ID,Tweet,target,clean_tweet
0,1,Be happy. Be confident. Be kind.\n\n #Kissable...,0,happi confid kind #kissableslovesmshopmag nall...
1,2,@queenjlouise @mamaw_gereck awe thanks ðŸ˜Š,0,thank
2,3,@Ithika Loving it! ðŸ˜‚,0,love
3,4,@mir_btc Eherium should make everybody happy,0,eherium should make everybodi happi
4,5,Shitty is the worst feeling ever #depressed #a...,1,shitti worst feel ever #depress #anxieti


In [13]:
from nltk.corpus import stopwords

In [14]:
#print(set(stopwords.words('english')))

In [15]:
stop_words = set(stopwords.words('english'))

In [16]:
df['clean_tweet'] = df['clean_tweet'].apply(lambda x : ' '.join([w for w in x.split() if not w in stop_words]))

In [17]:
df.head()

Unnamed: 0,ID,Tweet,target,clean_tweet
0,1,Be happy. Be confident. Be kind.\n\n #Kissable...,0,happi confid kind #kissableslovesmshopmag nall...
1,2,@queenjlouise @mamaw_gereck awe thanks ðŸ˜Š,0,thank
2,3,@Ithika Loving it! ðŸ˜‚,0,love
3,4,@mir_btc Eherium should make everybody happy,0,eherium make everybodi happi
4,5,Shitty is the worst feeling ever #depressed #a...,1,shitti worst feel ever #depress #anxieti


In [18]:
#visulization

In [19]:
df.shape #number of (raws, columns)

(4391, 4)

In [20]:
#tfifd

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [22]:
tfidf_vectorizer = TfidfVectorizer(max_df = 0.9 , min_df = 2 , stop_words = 'english' , max_features = 2000)
dff = tfidf_vectorizer.fit_transform(df['clean_tweet'])

In [23]:
dff.shape

(4391, 2000)

In [24]:
#train_x = dff[:3300 , :]
#test_x = dff[3300: , :]

In [28]:
#train_y = df['target'][:3300]
#test_y = df['target'][3300:]

X = dff
Y = df['target']

In [29]:
from sklearn.model_selection import train_test_split

#train_y = Y[:3300]
#test_y = Y[3300:]
X_train, X_test, Y_train, Y_test = train_test_split(dff,df['target'], test_size = 0.20, random_state = 40)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(3512, 2000) (3512,)
(879, 2000) (879,)


In [30]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(X_train , Y_train)
pred = lr.predict(X)

In [31]:
from sklearn import metrics
metrics.confusion_matrix(pred , Y)

array([[3425,  572],
       [  63,  331]], dtype=int64)

In [32]:
metrics.accuracy_score(pred , Y)

0.8553860168526531

In [33]:
print(metrics.classification_report(pred , Y))

              precision    recall  f1-score   support

           0       0.98      0.86      0.92      3997
           1       0.37      0.84      0.51       394

    accuracy                           0.86      4391
   macro avg       0.67      0.85      0.71      4391
weighted avg       0.93      0.86      0.88      4391



In [34]:
from sklearn.preprocessing import binarize
y_prob=lr.predict_proba(X)[:,1]
pred1=binarize([y_prob],0.45)[0]
print(metrics.confusion_matrix(pred1,Y))

[[3399  495]
 [  89  408]]


In [35]:
metrics.accuracy_score(pred1 , Y)

0.8670006832156684

In [36]:
print(metrics.classification_report(pred1 , Y))

              precision    recall  f1-score   support

         0.0       0.97      0.87      0.92      3894
         1.0       0.45      0.82      0.58       497

    accuracy                           0.87      4391
   macro avg       0.71      0.85      0.75      4391
weighted avg       0.92      0.87      0.88      4391



In [34]:
#def preproforsentence(text):
#    ans = lr.predict(text)
#    return ans"""

In [35]:
#res = preproforsentence("depression hate life sadness")
#print(res)