# FEATURE EXTRACTION

In [1]:
import pandas as pd
from textblob import TextBlob

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
d = pd.read_csv("Processed_tweets.csv")
d = d.drop(["Unnamed: 0"],axis=1)
d.head(10)

Unnamed: 0,Text
0,talk ilandthere surprisingly lot people like i...
1,yes suicide haram makingsomeones life miserabl...
2,stressed depressed go need somebody talk
3,always learn strong alone
4,catarllna add another
5,see bass canyon lost land fill survey thingy v...
6,underside worldof course stupid bles iiiii
7,working mall brain filled poison edg
8,within new york state park police alone office...
9,I literally tired


In [3]:
d.drop_duplicates(inplace=True)

In [4]:
d.isna().sum()

Text    0
dtype: int64

In [5]:
d.dropna(inplace=True)

In [6]:
d.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6735 entries, 0 to 6918
Data columns (total 1 columns):
Text    6735 non-null object
dtypes: object(1)
memory usage: 105.2+ KB


In [7]:
n = len(d)
d["Sentiment"]=[None for i in range(n)]
for i in range(n):
    
    s = TextBlob(d["Text"].iloc[i]).sentiment
    if(s[0]>=0):
        d["Sentiment"].iloc[i] = 0
    else:
        d["Sentiment"].iloc[i] = 1

In [8]:
len(d[d["Sentiment"]==0]), len(d[d["Sentiment"]==1])

(4519, 2216)

In [9]:
no = len(d[d["Sentiment"]==1])

In [10]:
t = d[d["Sentiment"]==0][:no]
s = d[d["Sentiment"]==1]

In [11]:
df = pd.concat([s,t],ignore_index="True")
len(df)

4432

# MODEL DEVELOPMENT AND EVALUATION

# Splitting into Train and Test Data

In [12]:
from sklearn.model_selection import train_test_split

x= df["Text"]
y = df["Sentiment"].astype("int")
x_train,x_test, y_train, y_test = train_test_split(x,y,test_size=0.2,random_state=42)

In [13]:
cv = CountVectorizer(ngram_range=(1,3))
tf = TfidfVectorizer(ngram_range=(1,3))
x1 = cv.fit_transform(x_train)
x2 = tf.fit_transform(x_train)

In [14]:
from sklearn import metrics

accuracy = {}
model = {}
vectorizer = {"CountVectorizer":cv,"TfidfVectorizer":tf}

# Support Vector Machine

### CountVectorizer

In [15]:
from sklearn import svm

classifier=svm.SVC()
classifier.fit(x1,y_train)
y_predict1=classifier.predict(cv.transform(x_test))
k = "Support Vector Machine with CountVectorizer"
model[k]=classifier

In [16]:
b1=metrics.accuracy_score(y_test, y_predict1)
recall = metrics.recall_score(y_test, y_predict1)
accuracy[k] = float("{0:.4f}".format(b1))
print(k)
print("Accuracy: ",accuracy[k])
print("Recall  : {0:.4f}".format(recall))

Support Vector Machine with CountVectorizer
Accuracy:  0.7813
Recall  : 0.6112


### TfidfVectorizer

In [17]:
classifier=svm.SVC()
classifier.fit(x2,y_train)
y_predict2=classifier.predict(tf.transform(x_test))
k = "Support Vector Machine with TfidfVectorizer"
model[k]=classifier

In [18]:
b2=metrics.accuracy_score(y_test, y_predict2)
recall = metrics.recall_score(y_test, y_predict2)
accuracy[k] = float("{0:.4f}".format(b2))
print(k)
print("Accuracy: ",accuracy[k])
print("Recall  : {0:.4f}".format(recall))

Support Vector Machine with TfidfVectorizer
Accuracy:  0.8196
Recall  : 0.7365


# Multinomial Naive Bayes

### Count Vectorizer

In [19]:
from sklearn.naive_bayes import MultinomialNB

nb = MultinomialNB()
nb.fit(x1,y_train)
y_predict3=classifier.predict(cv.transform(x_test))
k = "Multinomial Naive Bayes with CountVectorizer"
model[k]=nb

In [20]:
b3=metrics.accuracy_score(y_test, y_predict3)
recall = metrics.recall_score(y_test, y_predict3)
accuracy[k] = float("{0:.4f}".format(b3))
print(k)
print("Accuracy: ",accuracy[k])
print("Recall  : {0:.4f}".format(recall))

Multinomial Naive Bayes with CountVectorizer
Accuracy:  0.5524
Recall  : 0.1469


### TfidfVectorizer

In [21]:
nb = MultinomialNB()
nb.fit(x2,y_train)
y_predict4=classifier.predict(tf.transform(x_test))
k = "Multinomial Naive Bayes with TfidfVectorizer"
model[k]=nb

In [22]:
b4=metrics.accuracy_score(y_test,y_predict4)
recall = metrics.recall_score(y_test, y_predict4)
accuracy[k] = float("{0:.4f}".format(b4))
print(k)
print("Accuracy: ",accuracy[k])
print("Recall  : {0:.4f}".format(recall))

Multinomial Naive Bayes with TfidfVectorizer
Accuracy:  0.8196
Recall  : 0.7365


In [23]:
sorted(accuracy)
ad = pd.DataFrame({"Accuracy":accuracy})
ad

Unnamed: 0,Accuracy
Multinomial Naive Bayes with CountVectorizer,0.5524
Multinomial Naive Bayes with TfidfVectorizer,0.8196
Support Vector Machine with CountVectorizer,0.7813
Support Vector Machine with TfidfVectorizer,0.8196
