<a href="https://colab.research.google.com/github/Arsh901/Sentiment-Analysis-of-Twitter-data/blob/main/Twitter_Sentiment_Analysis_using_ML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Installing Kaggle Library. Generally, in colab this library is pre-installed
! pip install kaggle



In [1]:
# Configure the location of the kaggle.json file: copy it from the current
# working directory into ~/.kaggle/ (presumably where the Kaggle CLI looks for it).
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
# Mode 600: readable and writable by the owning user only.
!chmod 600 ~/.kaggle/kaggle.json

In [2]:
# Download the kazanova/sentiment140 dataset through the Kaggle CLI.
# In the recorded run the archive is saved as sentiment140.zip under /content.
!kaggle datasets download -d kazanova/sentiment140

Dataset URL: https://www.kaggle.com/datasets/kazanova/sentiment140
License(s): other
Downloading sentiment140.zip to /content
 98% 79.0M/80.9M [00:03<00:00, 33.6MB/s]
100% 80.9M/80.9M [00:03<00:00, 24.5MB/s]


In [4]:
# Extract the zip file
from zipfile import ZipFile

dataset = '/content/sentiment140.zip'
# Bind the archive to `archive`, not `zip`: in a notebook the `with` target is a
# global, so calling it `zip` would shadow the built-in zip() in every later cell.
with ZipFile(dataset, 'r') as archive:
  archive.extractall()   # extracts into the current working directory
  print("Dataset extracted")

Dataset extracted


In [7]:
# Import all necessary libraries
import pandas as pd
import numpy as np
import re           # Regular expression
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [8]:
import nltk
# Fetch the NLTK stopword corpus used by the preprocessing step.
nltk.download('stopwords')   # Stopwords are removed later because they add little meaning to the data


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [9]:
# Print the English stopword list (these are the words the stemming step filters out).
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

# Data Preprocessing

In [10]:
# Load the dataset.
# NOTE: no `names`/`header` argument is passed, so pandas uses the first CSV row
# as the column header (visible in the head() output below). The file is re-read
# with explicit column names further down.
data = pd.read_csv('/content/training.1600000.processed.noemoticon.csv', encoding = 'ISO_8859-1')

In [13]:
# Check the number of rows and columns, shown as a (rows, columns) tuple
data.shape

(1599999, 6)

In [15]:
# Display the first five rows of the dataset
data.head(5)

Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [16]:
# The CSV file has no header row, so the column names are supplied explicitly
# when reading it; the first row is then kept as data.
column_names = ['target', 'id', 'date', 'flag', 'user', 'text']
data = pd.read_csv(
    '/content/training.1600000.processed.noemoticon.csv',
    names=column_names,
    encoding='ISO_8859-1',
)

In [17]:
# Display the first five rows to confirm the column names have been applied
data.head(5)

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [18]:
data.shape  # The row count is one higher than before: the first CSV row is now data instead of the header

(1600000, 6)

In [19]:
# Count the missing values in each column
data.isnull().sum()

Unnamed: 0,0
target,0
id,0
date,0
flag,0
user,0
text,0


In [21]:
# Check the distribution of the target column.
# In this dataset the target value is either 0 or 4:
# 0 marks a negative tweet and 4 marks a positive one.
data['target'].value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
0,800000
4,800000


0 ---> For negative tweet
4 ---> For positive tweet

**Stemming**
: Refers to the process of reducing a word to its root form

---

Example: "Acting" → "Act" and "Actor" → "Act"

In [None]:
# Porter stemmer instance.
# NOTE(review): stemming() below uses a SnowballStemmer, not `port_stem`, and no
# other visible cell references this variable - it looks unused; confirm before removing.
port_stem = PorterStemmer()

In [27]:
# Input-independent helpers are built once here rather than inside stemming():
# the original created a new stemmer for every tweet and re-loaded the stopword
# list (then scanned it linearly) for every single word.
_STEMMER = nltk.SnowballStemmer('english')
_STOP_WORDS = frozenset(stopwords.words('english'))   # set => constant-time membership test
_NON_LETTERS = re.compile('[^a-zA-Z]')                # anything that is not an ASCII letter


def stemming(content):
  """Normalise a tweet into a space-separated string of stemmed words.

  Every character that is not an ASCII letter is replaced by a space, the
  text is lower-cased and split on whitespace, English stopwords are
  dropped, and each remaining word is reduced to its stem.

  Args:
    content: the raw tweet text (str).

  Returns:
    str: the stems joined by single spaces; '' when no word survives.
  """
  words = _NON_LETTERS.sub(' ', content).lower().split()
  return ' '.join(_STEMMER.stem(word) for word in words if word not in _STOP_WORDS)

In [28]:
# Run stemming() on every tweet and store the result in a new 'stemmed_content' column
data['stemmed_content'] = data['text'].apply(stemming)


In [29]:
data.head(5)    # Display the first five rows to check the new stemmed_content column

Unnamed: 0,target,id,date,flag,user,text,stemmed_content
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",switchfoot http twitpic com zl awww bummer sho...
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,upset updat facebook text might cri result sch...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,kenichan dive mani time ball manag save rest g...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,whole bodi feel itchi like fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",nationwideclass behav mad see


In [30]:
# Separate label and data
X = data['stemmed_content'].values
Y = data['target'].values

In [31]:
# Show the array of stemmed tweets
print(X)

['switchfoot http twitpic com zl awww bummer shoulda got david carr third day'
 'upset updat facebook text might cri result school today also blah'
 'kenichan dive mani time ball manag save rest go bound' ...
 'readi mojo makeov ask detail'
 'happi th birthday boo alll time tupac amaru shakur'
 'happi charitytuesday thenspcc sparkschar speakinguph h']


In [32]:
# Show the array of labels
print(Y)

[0 0 0 ... 4 4 4]


Splitting the data into training and testing data

In [34]:
# Hold out 20% of the samples for testing (test_size = 0.2).
# stratify = Y keeps the class proportions of Y in both splits; random_state
# fixes the shuffling so the split is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 2, stratify = Y)

Converting Text data to numerical data

In [35]:
# Fit the TF-IDF vectorizer on the training text only, then reuse the fitted
# vectorizer to transform the test text.
# NOTE: X_train / X_test are overwritten with the vectorized (sparse) output, so
# re-running this cell requires re-running the train/test split cell first.
vector = TfidfVectorizer()
X_train = vector.fit_transform(X_train)    # Converts each text into a vector of TF-IDF weights
X_test = vector.transform(X_test)


In [36]:
# Show the vectorized training data; the output lists (row, column) positions with their TF-IDF weight
print(X_train)

  (0, 437327)	0.27260257574575814
  (0, 355060)	0.3588073039837249
  (0, 185373)	0.5277651743808424
  (0, 109427)	0.37537891073862373
  (0, 235336)	0.4199661032868353
  (0, 443683)	0.4484732104353878
  (1, 160815)	1.0
  (2, 109427)	0.4591297818076123
  (2, 124625)	0.18919764718444246
  (2, 407860)	0.18709335245289355
  (2, 129525)	0.290741873827107
  (2, 406955)	0.3210545358833526
  (2, 434167)	0.3296595291954385
  (2, 78021)	0.3128407499881532
  (2, 444047)	0.334859905461785
  (2, 267072)	0.24122380810429364
  (2, 409709)	0.15169279546260175
  (2, 178266)	0.16190098117925839
  (2, 150875)	0.188038471261484
  (2, 132432)	0.20289711973764335
  (2, 288880)	0.16786946511607387
  (3, 406955)	0.2903341520952007
  (3, 158896)	0.44548180757177724
  (3, 151928)	0.2785925026963104
  (3, 56501)	0.5201078827619504
  :	:
  (1279996, 318754)	0.21254905106589583
  (1279996, 434622)	0.2718971435147747
  (1279996, 390649)	0.2206495629250393
  (1279996, 373661)	0.3521284267827432
  (1279996, 238368)	0.

In [40]:
# Show the vectorized test data; the output lists (row, column) positions with their TF-IDF weight
print(X_test)

  (0, 15092)	0.17193814618360403
  (0, 31142)	0.1624799467501681
  (0, 67883)	0.26800821447405354
  (0, 106189)	0.36556058591397544
  (0, 132485)	0.2552591390766229
  (0, 138283)	0.2368868663023513
  (0, 171577)	0.28058629179957767
  (0, 271372)	0.4535737902031523
  (0, 279470)	0.17816398599905964
  (0, 388873)	0.21985442082832507
  (0, 399451)	0.34911019926746645
  (0, 409709)	0.3143099385716381
  (0, 421601)	0.17915660466637195
  (1, 6454)	0.30734805789432307
  (1, 15092)	0.21106122917392603
  (1, 145532)	0.5753277895857637
  (1, 217803)	0.4026637942995928
  (1, 257098)	0.2875482540971773
  (1, 348650)	0.47398136148831926
  (1, 366714)	0.24598333818805238
  (2, 22509)	0.3532582957477176
  (2, 34372)	0.37916255084357414
  (2, 89555)	0.36340369428387626
  (2, 183491)	0.5892069252021465
  (2, 257155)	0.2564939661498776
  :	:
  (319994, 444411)	0.2782284096210794
  (319995, 107987)	0.3339930120683257
  (319995, 109501)	0.3020892095396257
  (319995, 155667)	0.2770678807048461
  (319995, 2

# Training the model

In [41]:
model = LogisticRegression(max_iter = 1000)   # max_iter caps the number of solver iterations allowed to reach convergence

In [42]:
# Train the logistic regression model on the vectorized training data and its labels
model.fit(X_train, Y_train)

# Model Evaluation

Accuracy score

In [43]:
# Accuracy score on the training data.
# Despite its name, X_train_pred holds the labels predicted for X_train.
X_train_pred = model.predict(X_train)
training_accuracy = accuracy_score(Y_train, X_train_pred)
print('Accuracy:' , training_accuracy)

Accuracy: 0.7942765625


In [45]:
# Accuracy score on the held-out test data.
# Despite its name, X_test_pred holds the labels predicted for X_test.
X_test_pred = model.predict(X_test)
test_accuracy = accuracy_score(Y_test, X_test_pred)
print('Accuracy:' , test_accuracy)

Accuracy: 0.776746875


Saving the model

In [46]:
import pickle

In [47]:
filename = 'trained_model.sav'
# Write through a context manager so the file handle is flushed and closed;
# the original passed a bare open() to pickle.dump and never closed it.
with open(filename, 'wb') as model_file:
  pickle.dump(model, model_file)
# NOTE(review): only the classifier is saved. Predicting on new raw text later
# also needs the fitted TfidfVectorizer (`vector`), which is not persisted here.

## Testing the model

In [59]:
# Pick one test sample and print its true label.
# A single index drives both lookups: the original read the features at index
# 191 but printed the label at index 200, i.e. for a different tweet.
sample_index = 200
X_new = X_test[sample_index]
print(Y_test[sample_index])

4


In [64]:
# Classify a single test sample and report the predicted sentiment
X_new = X_test[3]
prediction = model.predict(X_new)
print(prediction)

# One row was passed in, so prediction holds one label: 0 means negative,
# any other value is reported as positive.
print("Negative Tweet" if prediction[0] == 0 else "Positive tweet")


[0]
Negative Tweet
