In [None]:
!pip install kaggle



In [None]:
# Copy the Kaggle API token into ~/.kaggle/ for the kaggle command used in the next cell.
# kaggle.json must already be in the current working directory; if it is not,
# the cp and chmod steps below fail with "No such file or directory".
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
# 600 = read/write for the owner only
!chmod 600 ~/.kaggle/kaggle.json

cp: cannot stat 'kaggle.json': No such file or directory
chmod: cannot access '/root/.kaggle/kaggle.json': No such file or directory


In [None]:
# API to fetch the dataset from kaggle
!kaggle datasets download -d kazanova/sentiment140

Dataset URL: https://www.kaggle.com/datasets/kazanova/sentiment140
License(s): other
Downloading sentiment140.zip to /content
 84% 68.0M/80.9M [00:00<00:00, 178MB/s]
100% 80.9M/80.9M [00:00<00:00, 169MB/s]


In [None]:
# Extracting the compressed dataset into the current working directory

from zipfile import ZipFile
dataset = '/content/sentiment140.zip'

# The handle is named `archive` rather than `zip`: a notebook-level `zip`
# would shadow the built-in zip() for every cell executed afterwards.
with ZipFile(dataset,'r') as archive:
  archive.extractall()
  print('The dataset is extracted')

The dataset is extracted


Importing the Dependencies

In [None]:
import pandas as pd
import numpy as np

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [None]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

Data Processing

In [None]:
# Column names are passed explicitly through `names`, so the first row of the
# file is read as data rather than as a header.
df = pd.read_csv(
    '/content/training.1600000.processed.noemoticon.csv',
    encoding='ISO-8859-1',
    names=['target','id','date','flag','user','text'],
)

In [None]:
df.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [None]:
# Keep only the label and the tweet text. Selecting the two wanted columns
# yields the same frame as dropping id/date/flag/user, but the cell can be
# re-run safely (a second drop() would raise KeyError on the missing columns).
df = df[['target','text']]

In [None]:
df.head()

Unnamed: 0,target,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [None]:
df.shape

(1600000, 2)

In [None]:
df.isnull().sum()

Unnamed: 0,0
target,0
text,0


In [None]:
# Checking the distribution of the labels

df['target'].value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
0,800000
4,800000


In [None]:
# The data is equally distributed


-  0 means negative tweet
-  4 means positive tweet (remapped to 1 in the next cell)

In [None]:
# Recode the positive label from 4 to 1 so the target is binary 0/1
df = df.assign(target=df['target'].replace({4: 1}))

In [None]:
df.target.value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
0,800000
1,800000


In [None]:
# Work on a 200,000-row random subset of the tweets.
# random_state pins the draw so the subset, and every number computed from it
# further down, is the same after Restart & Run All.
new_df = df.sample(200000, random_state=2)

### Stemming

- Stemming is the process of reducing a word to its root form (its stem), e.g. "enjoying" → "enjoy"

In [None]:
new_df.shape

(200000, 2)

In [None]:
new_df['target'].value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
0,100175
1,99825


In [None]:
new_df.head()

Unnamed: 0,target,text
1121818,1,SHAUN SMITH TO WIN :O X
316024,0,Ugh. Not feeling so good. This ac unit is so c...
1066280,1,@emilypayne Thatls megaHot
985786,1,Got up way to late today ... now catching up o...
1024651,1,@DeryaMetin hahaha as long as you wash my bra ...


In [None]:
port_stem = PorterStemmer()

In [None]:
 def stemming(content):
   """Normalise raw text into a space-joined string of stemmed tokens.

   Every character that is not an ASCII letter is replaced by a space, the
   text is lower-cased and split on whitespace, English stop words are
   dropped, and each remaining token is stemmed with `port_stem`.

   Parameters
   ----------
   content : str
       Raw text to clean.

   Returns
   -------
   str
       The stemmed tokens joined by single spaces; an empty string when no
       token survives the filtering.
   """
   # The stop-word list used to be re-evaluated and scanned linearly for every
   # single token. Build it once, keep it on the function object, and use a
   # frozenset so each membership test is a constant-time lookup.
   english_stopwords = getattr(stemming, '_english_stopwords', None)
   if english_stopwords is None:
     english_stopwords = frozenset(stopwords.words('english'))
     stemming._english_stopwords = english_stopwords

   # removing all the things which are not alphabets
   letters_only = re.sub('[^a-zA-Z]',' ',content)
   # converting to lower case and tokenizing on whitespace
   tokens = letters_only.lower().split()
   # stemming every token that is not a stop word
   stemmed_tokens = [port_stem.stem(word) for word in tokens if word not in english_stopwords]
   # joining them on spaces
   return ' '.join(stemmed_tokens)


In [None]:
new_df['text'] = new_df['text'].apply(stemming)

In [None]:
new_df.head()

Unnamed: 0,target,text
1121818,1,shaun smith win x
316024,0,ugh feel good ac unit cold stomach act like ju...
1066280,1,emilypayn thatl megahot
985786,1,got way late today catch news
1024651,1,deryametin hahaha long wash bra wear mean take...


In [None]:
new_df.shape

(200000, 2)

In [None]:
X = new_df['text'].values
y = new_df['target'].values

In [None]:
X

array(['shaun smith win x',
       'ugh feel good ac unit cold stomach act like jump bean',
       'emilypayn thatl megahot', ...,
       'nanti sarrmm take look facebook group http tinyurl com c c r come x',
       'willclarkfan yeah love well part reason',
       'dammit hd adapt blew new movi night'], dtype=object)

In [None]:
y

array([1, 0, 1, ..., 1, 1, 0])

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify=y, random_state=2)


# stratify=y keeps the class proportions of y (positive vs. negative) the same in y_train and y_test

In [None]:
X_train

array(['huntyhunt threadless unlimit reprint x infinit reprint reprint',
       'jaxraghibtrail good afternoon haha',
       'zlynnb jlamond hope know forev', ...,
       'watch tenni earlier day come roger',
       'http tinyurl com nfcqpd umm lindsay lohan bold like',
       'yay four day weekend'], dtype=object)

In [None]:
y_train

array([1, 1, 1, ..., 1, 1, 1])

In [None]:
print(X.shape, X_train.shape, X_test.shape)

(200000,) (160000,) (40000,)


In [None]:
# Converting the textual data to numerical data

In [None]:
# TF-IDF vectorizer used to turn the cleaned tweets into numeric features
vectorizer = TfidfVectorizer()


# Fit on the training tweets only, then reuse the fitted vectorizer on the
# test tweets so that nothing is learned from the test set.
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

In [None]:
print(X_train_vectorized)

  (0, 41600)	0.2876333171358223
  (0, 96916)	0.23494351327168053
  (0, 101554)	0.23896554024173955
  (0, 80083)	0.8628999514074669
  (0, 43441)	0.24569852438093318
  (1, 45720)	0.7952495729362994
  (1, 36267)	0.25145510936118576
  (1, 1348)	0.44787613004338567
  (1, 37947)	0.3221108766529748
  (2, 109768)	0.6179185261467126
  (2, 47451)	0.6179185261467126
  (2, 40916)	0.2293856204014058
  (2, 52259)	0.22088758126219876
  (2, 32734)	0.36734766051595674
  (3, 72507)	0.4757217713403674
  (3, 67929)	0.21206732932095745
  (3, 109733)	0.4488909423508799
  (3, 64468)	0.242049832910715
  (3, 98256)	0.197879116946091
  (3, 14523)	0.4600267425182551
  (3, 10114)	0.4331959135287676
  (3, 84879)	0.1737400245117646
  (4, 52259)	0.20966231557697879
  (4, 31179)	0.21266665313620733
  (4, 79187)	0.2167813739992337
  :	:
  (159995, 86071)	0.44611767198303465
  (159996, 100327)	0.31488126416355855
  (159996, 67951)	0.30840880183632835
  (159996, 32525)	0.33629363850809496
  (159996, 89472)	0.33524102639

In [None]:
model = LogisticRegression(max_iter = 1000)

In [None]:
model.fit(X_train_vectorized,y_train)

In [None]:
y_pred = model.predict(X_test_vectorized)

In [None]:
y_pred

array([0, 0, 0, ..., 0, 0, 1])

In [None]:
accuracy_score(y_test,y_pred)

0.7666

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random-forest baseline to compare against the logistic regression above.
# random_state fixes the randomness used while growing the trees, so the
# accuracy shown below can be reproduced on a re-run.
rf = RandomForestClassifier(n_estimators=100,max_depth=12,random_state=2)
rf.fit(X_train_vectorized,y_train)
y_pred_rf = rf.predict(X_test_vectorized)
accuracy_score(y_test,y_pred_rf)

0.7093

In [None]:
from sklearn.model_selection import GridSearchCV


# 5-fold cross-validated search over C and the solver of the logistic regression
params = {'C': [0.1, 1, 10], 'solver': ['liblinear', 'lbfgs']}
grid = GridSearchCV(model, param_grid=params, cv=5)
grid.fit(X_train_vectorized, y_train)
# NOTE(review): the best parameters are only printed here; the cells below keep
# using `model`, not `grid.best_estimator_` — confirm that is intended.
print(grid.best_params_, grid.best_score_)

{'C': 1, 'solver': 'liblinear'} 0.7643500000000001


In [None]:
X_test[5]

'bradske hey cuz read tweet away listen hope ur well'

In [None]:
y_test[5]

1

In [None]:
model.predict(X_test_vectorized[5])

array([1])

In [None]:
import pickle

with open('model.pkl','wb') as f:
  pickle.dump(model,f)

In [None]:
with open('vectorizer.pkl','wb') as f:
  pickle.dump(vectorizer,f)

In [62]:
# Reload the classifier saved to model.pkl earlier in this notebook.
# pickle.load can execute arbitrary code while deserialising, so only load
# pickle files that come from a trusted source.
with open('model.pkl', 'rb') as f:
    loaded_model = pickle.load(f)

In [61]:
# Reload the fitted vectorizer saved to vectorizer.pkl earlier in this notebook.
# pickle.load can execute arbitrary code while deserialising, so only load
# pickle files that come from a trusted source.
with open('vectorizer.pkl', 'rb') as f:
    loaded_vectorizer = pickle.load(f)

In [63]:
text = 'Life is good! I’m enjoying every moment of this beautiful journey. #happy'

In [64]:
stemmed_text = stemming(text)

In [65]:
stemmed_text

'life good enjoy everi moment beauti journey happi'

In [67]:
vectorized_text = loaded_vectorizer.transform([stemmed_text])

In [68]:
loaded_model.predict(vectorized_text)

array([1])

In [69]:
def make_prediction(text):
  """Classify a raw piece of text as 'Negative' or 'Positive'.

  The text is cleaned with `stemming`, transformed by `loaded_vectorizer`
  and passed to `loaded_model`. A predicted label of 0 is reported as
  'Negative'; any other label is reported as 'Positive'.
  """
  features = loaded_vectorizer.transform([stemming(text)])
  label = loaded_model.predict(features)[0]
  return 'Negative' if label == 0 else 'Positive'

In [70]:
text = "I’m completely heartbroken. Nothing makes sense anymore. #depressed"

In [71]:
make_prediction(text)

'Negative'

In [72]:
text_2 = "Feeling so sick and tired of dealing with everything. I just need a break from life."

In [73]:
make_prediction(text_2)

'Negative'

In [74]:
text_3 = "I’m so excited for the weekend! Can’t wait to relax and have fun."

In [75]:
make_prediction(text_3)

'Positive'