# FEATURE EXTRACTION

In [2]:
import pandas as pd
from textblob import TextBlob

In [3]:
d = pd.read_csv("processed_tweets.csv")
d = d.drop(["Unnamed: 0","index"],axis=1)
d = d.rename(columns={"0":"Text"})
d.head(10)

Unnamed: 0,Text
0,account grow difficult keep track people activ...
1,release lyric video collab song bipolar sunshi...
2,stargalaxy genuinely cant even imagine second ...
3,theres call call spark call world always
4,scary plausible
5,track trace utterly useless proof covid spread...
6,camp never lose faith team proud aggie remembe...
7,bhpv merge bhel say announce lok sabha ncbn lo...
8,check result last two weeks arround time sum n...
9,see cabinet minister speak television deep thi...


In [4]:
d.isna().sum()

Text    0
dtype: int64

In [5]:
d.dropna(inplace=True)

In [6]:
d.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6294 entries, 0 to 6293
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Text    6294 non-null   object
dtypes: object(1)
memory usage: 98.3+ KB


In [7]:
n = len(d)
d["Sentiment"]=[None for i in range(n)]
for i in range(n):
    s = TextBlob(d["Text"].iloc[i]).sentiment
    if(s[0]>0):
        d["Sentiment"].iloc[i] = 0
    else:
        d["Sentiment"].iloc[i] = 1
d
#0 sentiment means positive, 1 sentiment means negative

Unnamed: 0,Text,Sentiment
0,account grow difficult keep track people activ...,1
1,release lyric video collab song bipolar sunshi...,0
2,stargalaxy genuinely cant even imagine second ...,0
3,theres call call spark call world always,1
4,scary plausible,1
...,...,...
6289,yes champ tysonfury,1
6290,wonderful sikh taxi driver nyc mention previou...,0
6291,tire fellow brights bring back tl bbrightvcbbr...,1
6292,reminder onces dont mind haters theyll always ...,1


In [8]:
len(d[d["Sentiment"]==0]), len(d[d["Sentiment"]==1])

(2153, 4141)

In [9]:
no = len(d[d["Sentiment"]==0])
no

2153

In [10]:
t = d[d["Sentiment"]==0]
s = d[d["Sentiment"]==1][:no]

In [11]:
df = pd.concat([s,t],ignore_index="True")
len(df)

4306

In [12]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

cv = CountVectorizer()
tf = TfidfVectorizer()
x1 = cv.fit_transform(df["Text"])
x2 = tf.fit_transform(df["Text"])
y = df["Sentiment"]
y=y.astype('int')
x1

<4306x8829 sparse matrix of type '<class 'numpy.int64'>'
	with 38279 stored elements in Compressed Sparse Row format>

# MODEL DEVELOPMENT AND EVALUATION

# Splitting into Train and Test Data

In [13]:
from sklearn.model_selection import train_test_split

x_train,x_test, y_train, y_test = train_test_split(x1,y,test_size=0.18,random_state=42)
X_train, X_test, Y_train, Y_test = train_test_split(x2,y,test_size=0.2,random_state=42)

# Model Training and Testing

In [14]:
from sklearn import metrics

accuracy = {}
model = {}
vectorizer = {"CountVectorizer":cv,"TfidfVectorizer":tf}

# Decision Tree Classifier

### CountVectorizer

In [15]:
from sklearn.tree import DecisionTreeClassifier

dtc = DecisionTreeClassifier()
dtc.fit(x_train,y_train)
y_pred1 = dtc.predict(x_test)
k = "Decision Tree Classifier with CountVectorizer"
model[k] = dtc

In [16]:
a1 = metrics.accuracy_score(y_test, y_pred1)
recall = metrics.recall_score(y_test, y_pred1)
accuracy[k] = float("{0:.4f}".format(a1))
print(k)
print("Accuracy:",accuracy[k])
print("Recall  : {0:.4f}".format(recall))

Decision Tree Classifier with CountVectorizer
Accuracy: 0.9162
Recall  : 0.9373


### TfidfVectorizer

In [1]:
dtc = DecisionTreeClassifier()
dtc.fit(X_train,Y_train)
Y_pred1 = dtc.predict(X_test)
k = "Decision Tree Classifier with TfidfVectorizer"
model[k] = dtc

NameError: name 'DecisionTreeClassifier' is not defined

In [None]:
a2 = metrics.accuracy_score(Y_test, Y_pred1)
recall = metrics.recall_score(Y_test, Y_pred1)
accuracy[k] = float("{0:.4f}".format(a2))
print(k)
print("Accuracy:",accuracy[k])
print("Recall  : {0:.4f}".format(recall))

Decision Tree Classifier with TfidfVectorizer
Accuracy: 0.9094
Recall  : 0.9026


# Random Forest Classifier

### CountVectorizer

In [None]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier()
rfc.fit(x_train,y_train)
y_pred2 = rfc.predict(x_test)
k = "Random Forest Classifier with CountVectorizer"
model[k] = rfc

In [None]:
a3 = metrics.accuracy_score(y_test, y_pred2)
recall = metrics.recall_score(y_test, y_pred2)
accuracy[k] = float("{0:.4f}".format(a3))
print(k)
print("Accuracy:",accuracy[k])
print("Recall  : {0:.4f}".format(recall))

Random Forest Classifier with CountVectorizer
Accuracy: 0.914
Recall  : 0.8739


### TfidfVectorizer

In [None]:
rfc = RandomForestClassifier()
rfc.fit(X_train,Y_train)
Y_pred2 = rfc.predict(X_test)
k = "Random Forest Classifier with TfidfVectorizer"
model[k] = rfc

In [None]:
a4 = metrics.accuracy_score(Y_test, Y_pred2)
recall = metrics.recall_score(Y_test, Y_pred2)
accuracy[k] = float("{0:.4f}".format(a4))
print(k)
print("Accuracy:",accuracy[k])
print("Recall  : {0:.4f}".format(recall))

Random Forest Classifier with TfidfVectorizer
Accuracy: 0.9124
Recall  : 0.8653


In [None]:
ad={}
for i in accuracy:
    ad[i] = {"Accuracy":accuracy[i]}
ad = pd.DataFrame(ad)
ad

Unnamed: 0,Decision Tree Classifier with CountVectorizer,Decision Tree Classifier with TfidfVectorizer,Random Forest Classifier with CountVectorizer,Random Forest Classifier with TfidfVectorizer
Accuracy,0.9232,0.9094,0.914,0.9124
