# FEATURE EXTRACTION

In [1]:
import pandas as pd
from textblob import TextBlob

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
d = pd.read_csv("Processed_tweets.csv")
d = d.drop(["Unnamed: 0"],axis=1)
d.head(10)

Unnamed: 0,Text
0,talk ilandthere surprisingly lot people like i...
1,yes suicide haram makingsomeones life miserabl...
2,stressed depressed go need somebody talk
3,always learn strong alone
4,catarllna add another
5,see bass canyon lost land fill survey thingy v...
6,underside worldof course stupid bles iiiii
7,working mall brain filled poison edg
8,within new york state park police alone office...
9,I literally tired


In [3]:
d.drop_duplicates(inplace=True)

In [4]:
d.isna().sum()

Text    0
dtype: int64

In [5]:
d.dropna(inplace=True)

In [6]:
d.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6735 entries, 0 to 6918
Data columns (total 1 columns):
Text    6735 non-null object
dtypes: object(1)
memory usage: 105.2+ KB


In [7]:
n = len(d)
d["Sentiment"]=[None for i in range(n)]
for i in range(n):
    
    s = TextBlob(d["Text"].iloc[i]).sentiment
    if(s[0]>=0):
        d["Sentiment"].iloc[i] = 0
    else:
        d["Sentiment"].iloc[i] = 1

In [8]:
len(d[d["Sentiment"]==0]), len(d[d["Sentiment"]==1])

(4519, 2216)

In [9]:
no = len(d[d["Sentiment"]==1])

In [10]:
t = d[d["Sentiment"]==0][:no]
s = d[d["Sentiment"]==1]

In [11]:
df = pd.concat([s,t],ignore_index="True")
len(df)

4432

# MODEL DEVELOPMENT AND EVALUATION

# Splitting into Train and Test Data

In [12]:
from sklearn.model_selection import train_test_split

x= df["Text"]
y = df["Sentiment"].astype("int")
x_train,x_test, y_train, y_test = train_test_split(x,y,test_size=0.2,random_state=42)

In [13]:
cv = CountVectorizer(ngram_range=(1,3))
tf = TfidfVectorizer(ngram_range=(1,3))
x1 = cv.fit_transform(x_train)
x2 = tf.fit_transform(x_train)

In [14]:
from sklearn import metrics

accuracy = {}
model = {}
vectorizer = {"CountVectorizer":cv,"TfidfVectorizer":tf}

# Decision Tree Classifier

### CountVectorizer

In [15]:
from sklearn.tree import DecisionTreeClassifier

dtc = DecisionTreeClassifier()
dtc.fit(x1,y_train)
y_pred1 = dtc.predict(cv.transform(x_test))
k = "Decision Tree Classifier with CountVectorizer"
model[k]=dtc

In [16]:
a1 = metrics.accuracy_score(y_test, y_pred1)
recall = metrics.recall_score(y_test, y_pred1)
accuracy[k] = a1
print(k)
print("Accuracy: {0:.4f}".format(a1))
print("Recall  : {0:.4f}".format(recall))

Decision Tree Classifier with CountVectorizer
Accuracy: 0.9008
Recall  : 0.8942


### TfidfVectorizer

In [17]:
dtc = DecisionTreeClassifier()
dtc.fit(x2,y_train)
y_pred2 = dtc.predict(tf.transform(x_test))
k = "Decision Tree Classifier with TfidfVectorizer"
model[k]=dtc

In [18]:
a2 = metrics.accuracy_score(y_test, y_pred2)
recall = metrics.recall_score(y_test, y_pred2)
accuracy[k] = a2
print(k)
print("Accuracy: {0:.4f}".format(a2))
print("Recall  : {0:.4f}".format(recall))

Decision Tree Classifier with TfidfVectorizer
Accuracy: 0.8963
Recall  : 0.8963


# Random Forest Classifier

### CountVectorizer

In [19]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier()
rfc.fit(x1,y_train)
y_pred3 = rfc.predict(cv.transform(x_test))
k = "Random Forest Classifier with CountVectorizer"
model[k]=rfc

In [20]:
a3 = metrics.accuracy_score(y_test, y_pred3)
recall = metrics.recall_score(y_test, y_pred3)
accuracy[k] = a3
print(k)
print("Accuracy: {0:.4f}".format(a3))
print("Recall  : {0:.4f}".format(recall))

Random Forest Classifier with CountVectorizer
Accuracy: 0.8579
Recall  : 0.7754


### TfidfVectorizer

In [21]:
rfc = RandomForestClassifier()
rfc.fit(x2,y_train)
y_pred4 = rfc.predict(tf.transform(x_test))
k = "Random Forest Classifier with TfidfVectorizer"
model[k]=rfc

In [22]:
a4 = metrics.accuracy_score(y_test, y_pred4)
recall = metrics.recall_score(y_test, y_pred4)
accuracy[k] = a4
print(k)
print("Accuracy: {0:.4f}".format(a4))
print("Recall  : {0:.4f}".format(recall))

Random Forest Classifier with TfidfVectorizer
Accuracy: 0.8399
Recall  : 0.7711


In [23]:
sorted(accuracy)
ad = pd.DataFrame({"Accuracy":accuracy})
ad

Unnamed: 0,Accuracy
Decision Tree Classifier with CountVectorizer,0.900789
Decision Tree Classifier with TfidfVectorizer,0.89628
Random Forest Classifier with CountVectorizer,0.857948
Random Forest Classifier with TfidfVectorizer,0.83991
