# FEATURE EXTRACTION

In [1]:
import pandas as pd
from textblob import TextBlob

In [2]:
# Load the pre-processed tweets, discard the two leftover index columns and
# give the text column (named "0" in the CSV) a readable name.
d = (
    pd.read_csv("processed_tweets.csv")
    .drop(["Unnamed: 0", "index"], axis="columns")
    .rename(columns={"0": "Text"})
)
d.head(10)

Unnamed: 0,Text
0,account grow difficult keep track people activ...
1,release lyric video collab song bipolar sunshi...
2,stargalaxy genuinely cant even imagine second ...
3,theres call call spark call world always
4,scary plausible
5,track trace utterly useless proof covid spread...
6,camp never lose faith team proud aggie remembe...
7,bhpv merge bhel say announce lok sabha ncbn lo...
8,check result last two weeks arround time sum n...
9,see cabinet minister speak television deep thi...


In [3]:
# Count the missing values in every column.
d.isnull().sum()

Text    0
dtype: int64

In [4]:
# Remove every row that contains a missing value, modifying `d` in place.
d.dropna(inplace=True)

In [5]:
# Print a summary of `d`: index range, columns, non-null counts, dtypes and memory usage.
d.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6294 entries, 0 to 6293
Data columns (total 1 columns):
Text    6294 non-null object
dtypes: object(1)
memory usage: 98.3+ KB


In [6]:
# Label every tweet with a binary sentiment class derived from TextBlob polarity:
#   0 -> polarity >= 0 (neutral or positive)
#   1 -> polarity <  0 (negative)
n = len(d)  # row count; kept as a notebook-level name, as in the original cell


def _polarity_label(text):
    """Return 1 when TextBlob gives `text` a negative polarity, otherwise 0."""
    return 0 if TextBlob(text).sentiment.polarity >= 0 else 1


# Build the whole column with a single assignment. The previous per-row
# `d["Sentiment"].iloc[i] = ...` was chained indexing: pandas flags it with
# SettingWithCopyWarning and, with Copy-on-Write enabled, the write never
# reaches `d`, leaving the column filled with None.
d["Sentiment"] = [_polarity_label(text) for text in d["Text"]]

In [7]:
# (rows labelled 0, rows labelled 1)
tuple(len(d[d["Sentiment"] == label]) for label in (0, 1))

(4667, 1627)

In [8]:
# Number of rows labelled 1 (negative); used as the size cap when sampling class 0.
no = len(d.loc[d["Sentiment"] == 1])

In [9]:
# s: every row labelled 1.
# t: the first `no` rows labelled 0 (taken in file order, not sampled at random).
s = d[d["Sentiment"] == 1]
t = d[d["Sentiment"] == 0].iloc[:no]

In [10]:
# Stack the label-1 rows (`s`) on top of the label-0 rows (`t`) and renumber the
# index from 0. `ignore_index` expects a boolean; the string "True" used before
# only worked because any non-empty string is truthy ("False" would have behaved
# the same way).
df = pd.concat([s, t], ignore_index=True)
len(df)

3254

In [11]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Bag-of-words (x1) and TF-IDF (x2) feature matrices for the balanced tweets.
# NOTE(review): both vectorizers are fitted on all of `df`, i.e. before the
# train/test split performed further down, so their vocabulary / IDF statistics
# also see the rows that end up in the test split. Consider fitting on the
# training rows only.
cv, tf = CountVectorizer(), TfidfVectorizer()
x1, x2 = cv.fit_transform(df["Text"]), tf.fit_transform(df["Text"])

# Target vector: the sentiment labels cast to integers.
y = df["Sentiment"].astype('int')

# MODEL DEVELOPMENT AND EVALUATION

# Splitting into Train and Test Data

In [12]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing, with the same seed for both feature matrices.
# Lower-case names hold the CountVectorizer split, upper-case names the TF-IDF split.
split_options = {"test_size": 0.2, "random_state": 42}
x_train, x_test, y_train, y_test = train_test_split(x1, y, **split_options)
X_train, X_test, Y_train, Y_test = train_test_split(x2, y, **split_options)

In [13]:
from sklearn import metrics

# Registries filled in by the model cells: label -> accuracy, label -> fitted model.
accuracy, model = {}, {}
# Fitted vectorizers, looked up by class name.
vectorizer = dict(CountVectorizer=cv, TfidfVectorizer=tf)

# K Nearest Neighbors

## CountVectorizer

In [14]:
from sklearn.neighbors import KNeighborsClassifier

# Fit a default-parameter KNN on the CountVectorizer features, predict the test
# split, and register the fitted model under its label `x`.
x = "KNeighborsClassifier with CountVectorizer"
classifier = KNeighborsClassifier().fit(x_train, y_train)
y_pred = classifier.predict(x_test)
model[x] = classifier

In [15]:
# Score the CountVectorizer KNN predictions: a1 = accuracy, r1 = recall.
a1, r1 = (
    score(y_test, y_pred)
    for score in (metrics.accuracy_score, metrics.recall_score)
)
# Keep the accuracy at 4 decimal places for the summary table.
accuracy[x] = float(format(a1, ".4f"))
print(x)
print("Accuracy: ", accuracy[x])
print("Recall  : {0:.4f}".format(r1))

KNeighborsClassifier with CountVectorizer
Accuracy:  0.7158
Recall  : 0.5129


## TfidfVectorizer

In [16]:
# Fit a default-parameter KNN on the TF-IDF features, predict the test split,
# and register the fitted model under its label `x`.
x = "KNeighborsClassifier with TfidfVectorizer"
classifier = KNeighborsClassifier().fit(X_train, Y_train)
Y_pred = classifier.predict(X_test)
model[x] = classifier

In [17]:
# Score the TF-IDF KNN predictions: a2 = accuracy, r2 = recall.
a2, r2 = (
    score(Y_test, Y_pred)
    for score in (metrics.accuracy_score, metrics.recall_score)
)
# Keep the accuracy at 4 decimal places for the summary table.
accuracy[x] = float(format(a2, ".4f"))
print(x)
print("Accuracy: ", accuracy[x])
print("Recall  : {0:.4f}".format(r2))

KNeighborsClassifier with TfidfVectorizer
Accuracy:  0.7481
Recall  : 0.7393


In [18]:
# One-row summary table: a column per model label, with its score in the "Accuracy" row.
ad = pd.DataFrame(
    {label: {"Accuracy": score} for label, score in accuracy.items()}
)
ad

Unnamed: 0,KNeighborsClassifier with CountVectorizer,KNeighborsClassifier with TfidfVectorizer
Accuracy,0.7158,0.7481
