# FEATURE EXTRACTION

In [None]:
import pandas as pd
from textblob import TextBlob

In [None]:
# Load the pre-processed tweets, discard the two leftover index columns and
# give the text column (stored under the header "0") a readable name.
d = (
    pd.read_csv("processed_tweets.csv")
    .drop(columns=["Unnamed: 0", "index"])
    .rename(columns={"0": "Text"})
)
d.head(10)

Unnamed: 0,Text
0,account grow difficult keep track people activ...
1,release lyric video collab song bipolar sunshi...
2,stargalaxy genuinely cant even imagine second ...
3,theres call call spark call world always
4,scary plausible
5,track trace utterly useless proof covid spread...
6,camp never lose faith team proud aggie remembe...
7,bhpv merge bhel say announce lok sabha ncbn lo...
8,check result last two weeks arround time sum n...
9,see cabinet minister speak television deep thi...


In [None]:
# Count missing values per column before building features.
d.isna().sum()

Text    0
dtype: int64

In [None]:
# Remove every row that contains at least one missing value (modifies `d` in place).
d.dropna(axis=0, how="any", inplace=True)

In [None]:
# Summarise row count, column dtypes and memory usage after dropping missing rows.
d.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6294 entries, 0 to 6293
Data columns (total 1 columns):
Text    6294 non-null object
dtypes: object(1)
memory usage: 98.3+ KB


In [None]:
def polarity_label(text):
    """Map a tweet to a binary label from its TextBlob polarity.

    Returns 0 when the polarity score is >= 0 (neutral or positive) and
    1 when it is negative, so label 1 marks the negative class.
    """
    return 0 if TextBlob(text).sentiment.polarity >= 0 else 1


# Assign the whole column in a single step. The previous row-by-row
# `d["Sentiment"].iloc[i] = ...` was chained assignment: it raises
# SettingWithCopyWarning on older pandas and writes to a temporary copy
# (leaving every label as None) once pandas Copy-on-Write is active.
d["Sentiment"] = d["Text"].map(polarity_label)

In [None]:
# Class balance as (rows labelled 0, rows labelled 1).
tuple(int((d["Sentiment"] == label).sum()) for label in (0, 1))

(4667, 1627)

In [None]:
# Number of rows labelled 1; used below as the size of the label-0 subset.
no = int((d["Sentiment"] == 1).sum())

In [None]:
# Build two equally sized subsets: all rows labelled 1, and the first `no`
# rows (in file order, not a random sample) of those labelled 0.
s = d.loc[d["Sentiment"] == 1]
t = d.loc[d["Sentiment"] == 0].iloc[:no]

In [None]:
# Stack the two subsets into one frame with a fresh 0..n-1 index.
# `ignore_index` takes a boolean; the string "True" used previously only
# worked because any non-empty string is truthy ("False" would behave the same).
df = pd.concat([s, t], ignore_index=True)
len(df)

3254

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Two alternative text representations of the same tweets:
# raw token counts (x1) and TF-IDF weights (x2).
# NOTE(review): both vectorizers are fitted on the full corpus before the
# train/test split performed later, so vocabulary/IDF statistics include the
# test rows - consider fitting on the training portion only.
cv, tf = CountVectorizer(), TfidfVectorizer()
x1 = cv.fit_transform(df["Text"])
x2 = tf.fit_transform(df["Text"])

# Target vector, cast to an integer dtype for scikit-learn.
y = df["Sentiment"].astype("int")

# MODEL DEVELOPMENT AND EVALUATION

# Splitting into Train and Test Data

In [None]:
from sklearn.model_selection import train_test_split

# 80/20 hold-out split with a fixed seed.
# Lower-case names hold the CountVectorizer features, upper-case names the
# TfidfVectorizer features; both calls use the same test_size and random_state.
x_train, x_test, y_train, y_test = train_test_split(
    x1,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, Y_train, Y_test = train_test_split(
    x2,
    y,
    test_size=0.2,
    random_state=42,
)

# Model Training and Testing

In [None]:
from sklearn import metrics

# Registries filled in by the cells below, keyed by a descriptive
# "<model> with <vectorizer>" string.
accuracy, model = {}, {}

# Fitted vectorizers, looked up by class name.
vectorizer = dict(CountVectorizer=cv, TfidfVectorizer=tf)

# Decision Tree Classifier

### CountVectorizer

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Fit a decision tree on the CountVectorizer features and predict the hold-out set.
# A fixed random_state (same seed as the train/test split) makes the fitted
# tree, and therefore the reported scores, reproducible across re-runs;
# without it scikit-learn permutes candidate features randomly at each split.
dtc = DecisionTreeClassifier(random_state=42)
dtc.fit(x_train, y_train)
y_pred1 = dtc.predict(x_test)

# Register the fitted model under its descriptive key.
k = "Decision Tree Classifier with CountVectorizer"
model[k] = dtc

In [None]:
# Score the CountVectorizer decision tree on the hold-out set.
a1 = metrics.accuracy_score(y_test, y_pred1)
recall = metrics.recall_score(y_test, y_pred1)

# Keep the accuracy rounded to four decimals for the summary table.
accuracy[k] = float(f"{a1:.4f}")

print(k)
print("Accuracy:", accuracy[k])
print(f"Recall  : {recall:.4f}")

Decision Tree Classifier with CountVectorizer
Accuracy: 0.9232
Recall  : 0.9054


### TfidfVectorizer

In [None]:
# Fit a decision tree on the TfidfVectorizer features and predict the hold-out set.
# A fixed random_state (same seed as the train/test split) makes the fitted
# tree, and therefore the reported scores, reproducible across re-runs;
# without it scikit-learn permutes candidate features randomly at each split.
dtc = DecisionTreeClassifier(random_state=42)
dtc.fit(X_train, Y_train)
Y_pred1 = dtc.predict(X_test)

# Register the fitted model under its descriptive key.
k = "Decision Tree Classifier with TfidfVectorizer"
model[k] = dtc

In [None]:
# Score the TfidfVectorizer decision tree on the hold-out set.
a2 = metrics.accuracy_score(Y_test, Y_pred1)
recall = metrics.recall_score(Y_test, Y_pred1)

# Keep the accuracy rounded to four decimals for the summary table.
accuracy[k] = float(f"{a2:.4f}")

print(k)
print("Accuracy:", accuracy[k])
print(f"Recall  : {recall:.4f}")

Decision Tree Classifier with TfidfVectorizer
Accuracy: 0.9094
Recall  : 0.9026


# Random Forest Classifier

### CountVectorizer

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a random forest on the CountVectorizer features and predict the hold-out set.
# A fixed random_state (same seed as the train/test split) pins the bootstrap
# samples and per-split feature subsets, so the reported scores are
# reproducible across re-runs instead of drifting from run to run.
rfc = RandomForestClassifier(random_state=42)
rfc.fit(x_train, y_train)
y_pred2 = rfc.predict(x_test)

# Register the fitted model under its descriptive key.
k = "Random Forest Classifier with CountVectorizer"
model[k] = rfc

In [None]:
# Score the CountVectorizer random forest on the hold-out set.
a3 = metrics.accuracy_score(y_test, y_pred2)
recall = metrics.recall_score(y_test, y_pred2)

# Keep the accuracy rounded to four decimals for the summary table.
accuracy[k] = float(f"{a3:.4f}")

print(k)
print("Accuracy:", accuracy[k])
print(f"Recall  : {recall:.4f}")

Random Forest Classifier with CountVectorizer
Accuracy: 0.914
Recall  : 0.8739


### TfidfVectorizer

In [None]:
# Fit a random forest on the TfidfVectorizer features and predict the hold-out set.
# A fixed random_state (same seed as the train/test split) pins the bootstrap
# samples and per-split feature subsets, so the reported scores are
# reproducible across re-runs instead of drifting from run to run.
rfc = RandomForestClassifier(random_state=42)
rfc.fit(X_train, Y_train)
Y_pred2 = rfc.predict(X_test)

# Register the fitted model under its descriptive key.
k = "Random Forest Classifier with TfidfVectorizer"
model[k] = rfc

In [None]:
# Score the TfidfVectorizer random forest on the hold-out set.
a4 = metrics.accuracy_score(Y_test, Y_pred2)
recall = metrics.recall_score(Y_test, Y_pred2)

# Keep the accuracy rounded to four decimals for the summary table.
accuracy[k] = float(f"{a4:.4f}")

print(k)
print("Accuracy:", accuracy[k])
print(f"Recall  : {recall:.4f}")

Random Forest Classifier with TfidfVectorizer
Accuracy: 0.9124
Recall  : 0.8653


In [None]:
# Summary table: one column per model/vectorizer combination and a single
# "Accuracy" row holding the rounded hold-out accuracy.
ad = pd.DataFrame(
    {name: {"Accuracy": score} for name, score in accuracy.items()}
)
ad

Unnamed: 0,Decision Tree Classifier with CountVectorizer,Decision Tree Classifier with TfidfVectorizer,Random Forest Classifier with CountVectorizer,Random Forest Classifier with TfidfVectorizer
Accuracy,0.9232,0.9094,0.914,0.9124
