**Installing Dependencies and loading the data**.[https://www.kaggle.com/datasets/cosmos98/twitter-and-reddit-sentimental-analysis-dataset]

In [1]:
! pip install kaggle



In [2]:
! mkdir ~/.kaggle

In [3]:
#upload the kaggle json before executing this
! cp kaggle.json ~/.kaggle/

In [4]:
! chmod 600 ~/.kaggle/kaggle.json


In [5]:
! kaggle datasets download cosmos98/twitter-and-reddit-sentimental-analysis-dataset

Downloading twitter-and-reddit-sentimental-analysis-dataset.zip to /content
  0% 0.00/10.0M [00:00<?, ?B/s] 80% 8.00M/10.0M [00:00<00:00, 82.1MB/s]
100% 10.0M/10.0M [00:00<00:00, 97.2MB/s]


In [6]:
!unzip "/content/twitter-and-reddit-sentimental-analysis-dataset.zip"


Archive:  /content/twitter-and-reddit-sentimental-analysis-dataset.zip
  inflating: Reddit_Data.csv         
  inflating: Twitter_Data.csv        


In [7]:
pip install git+https://github.com/laxmimerit/preprocess_kgptalkie.git --upgrade --force-reinstall

Collecting git+https://github.com/laxmimerit/preprocess_kgptalkie.git
  Cloning https://github.com/laxmimerit/preprocess_kgptalkie.git to /tmp/pip-req-build-kviat6rc
  Running command git clone -q https://github.com/laxmimerit/preprocess_kgptalkie.git /tmp/pip-req-build-kviat6rc
Building wheels for collected packages: preprocess-kgptalkie
  Building wheel for preprocess-kgptalkie (setup.py) ... [?25l[?25hdone
  Created wheel for preprocess-kgptalkie: filename=preprocess_kgptalkie-0.1.3-py3-none-any.whl size=11756 sha256=0cf72d32e9344e74dd53c75c13c211219614d250864ade5c520df8675f73264f
  Stored in directory: /tmp/pip-ephem-wheel-cache-s5cqtued/wheels/0d/b3/29/bfe3deffda68980088d17b81331be6667e837ffb4a071bae82
Successfully built preprocess-kgptalkie
Installing collected packages: preprocess-kgptalkie
Successfully installed preprocess-kgptalkie-0.1.3


Importing Packages

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
import preprocess_kgptalkie as ps
import re
import pickle
from sklearn.ensemble import RandomForestClassifier

In [9]:
l = ["Reddit_Data.csv", "Twitter_Data.csv"]
for i in l:
    df = pd.read_csv("/content/"+i+"")

In [10]:
df.isnull().any()

clean_text    True
category      True
dtype: bool

In [11]:
df = df.dropna()
df = df.replace(-1,2)

In [12]:
df.isnull().any()

clean_text    False
category      False
dtype: bool

In [13]:
df.head()

Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,2.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0


In [14]:
def get_clean(x):
    x = str(x).lower().replace('\\', '').replace('_', ' ')
    x = ps.cont_exp(x)
    x = ps.remove_emails(x)
    x = ps.remove_urls(x)
    x = ps.remove_html_tags(x)
    x = ps.remove_accented_chars(x)
    x = ps.remove_special_chars(x)
    x = re.sub("(.)\\1{2,}", "\\1", x)
    return x

In [15]:
df['clean_text'] = df['clean_text'].apply(lambda x: get_clean(x))

In [16]:
df.head()

Unnamed: 0,clean_text,category
0,when modi promised minimum government maximum ...,2.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp to...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0


In [17]:
tfidf = TfidfVectorizer(max_features=5000)

In [18]:
x = df['clean_text']
y = df['category']

In [19]:
x = tfidf.fit_transform(x)

In [20]:
X_train, X_test, y_train, y_test = train_test_split(x,y,test_size=0.2,random_state=0)

LinearSVC

In [29]:
model_svc = LinearSVC()
model_svc.fit(X_train, y_train)
model_svc.score(X_test, y_test)

0.9383015278885685

In [22]:
y_pred = model_svc.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.94      0.98      0.96     11015
         1.0       0.95      0.94      0.94     14547
         2.0       0.91      0.87      0.89      7032

    accuracy                           0.94     32594
   macro avg       0.93      0.93      0.93     32594
weighted avg       0.94      0.94      0.94     32594



**XGBoost**

In [23]:
model_xgb = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
max_depth=1, random_state=0).fit(X_train, y_train)
model_xgb.score(X_test, y_test)

0.8616616555194208

In [24]:
y_pred_xgb = model_xgb.predict(X_test)
print(classification_report(y_test, y_pred_xgb))

              precision    recall  f1-score   support

         0.0       0.81      0.98      0.88     11015
         1.0       0.92      0.84      0.88     14547
         2.0       0.85      0.72      0.78      7032

    accuracy                           0.86     32594
   macro avg       0.86      0.85      0.85     32594
weighted avg       0.87      0.86      0.86     32594



Random Forest

In [25]:
model_rf = RandomForestClassifier(n_estimators=100,max_depth=30, random_state=0)
model_rf.fit(X_train, y_train)
model_rf.score(X_test,y_test)

0.7160520341167086

In [26]:
y_pred_rf = model_rf.predict(X_test)
print(classification_report(y_test, y_pred_rf))

              precision    recall  f1-score   support

         0.0       0.73      0.79      0.76     11015
         1.0       0.69      0.89      0.77     14547
         2.0       0.94      0.24      0.38      7032

    accuracy                           0.72     32594
   macro avg       0.78      0.64      0.64     32594
weighted avg       0.75      0.72      0.68     32594



In [27]:
from sklearn import model_selection
pickle.dump(model_svc,open('model_svc','wb'))
pickle.dump(model_xgb,open('model_xgb','wb'))
pickle.dump(model_rf,open('model_rf','wb'))
pickle.dump(tfidf, open("vectorizer.pickle", "wb"))

Testing

In [28]:
x = 'Hm I think modi is going to win the election too bad for congress'
x = get_clean(x)
vec = tfidf.transform([x])
vec.shape
model_rf.predict(vec)        #updating the model name to 

array([1.])