<a href="https://colab.research.google.com/github/Annamjohn/Twitter-Sentiment-Analysis/blob/main/Sentiment%20Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Install kaggle library
! pip install kaggle



Upload your `kaggle.json` API token file to the current working directory before running the next cell.

In [3]:
# Configure the Kaggle credentials: copy the uploaded kaggle.json from the
# working directory into ~/.kaggle/ and restrict it to owner read/write (mode 600).

!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [4]:
# Download the Sentiment140 Twitter dataset (kazanova/sentiment140) with the Kaggle CLI.
# The archive is saved as sentiment140.zip in the current working directory.
!kaggle datasets download -d kazanova/sentiment140

Dataset URL: https://www.kaggle.com/datasets/kazanova/sentiment140
License(s): other
Downloading sentiment140.zip to /content
 96% 78.0M/80.9M [00:03<00:00, 32.7MB/s]
100% 80.9M/80.9M [00:03<00:00, 24.9MB/s]


In [5]:
# Extract the compressed dataset

from zipfile import ZipFile
dataset= '/content/sentiment140.zip'

# Bind the archive to a name other than `zip`: at notebook top level the `as`
# target is a global, so `as zip` would shadow the built-in zip() in every later cell.
with ZipFile(dataset, 'r') as archive:
  archive.extractall()  # no path given: extracts into the current working directory
  print("The dataset has been extracted")

The dataset has been extracted


In [6]:
#import the dependencies
import numpy as np
import pandas as pd
import re #pattern matching, search through data
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [7]:
import nltk
# Fetch the NLTK stop-word corpus so that stopwords.words() can be used.
nltk.download('stopwords')
# Show the English stop words that stemming() filters out of each tweet.
print(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

Data Processing

In [8]:
# Load the CSV into a pandas DataFrame.
# encoding='ISO-8859-1' tells pandas how to decode the file's bytes into characters.
# NOTE: no column names are passed here, so pandas takes the first tweet as the header row
# (the frame ends up with 1599999 rows). The file is re-read with explicit names further down.
twitter_data=pd.read_csv('/content/training.1600000.processed.noemoticon.csv', encoding= 'ISO-8859-1')

In [9]:
# Number of rows and columns (one row short: the first tweet was consumed as the header)
twitter_data.shape

(1599999, 6)

In [10]:
# Preview the first rows; the column labels are really the first tweet's values.
twitter_data.head()

Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [11]:
# The CSV has no header line, so supply the column names explicitly and re-read the file.
column_names=['target','id','date','flag','user','text']
twitter_data=pd.read_csv('/content/training.1600000.processed.noemoticon.csv', names= column_names, encoding= 'ISO-8859-1')

In [12]:
# Number of rows and columns after re-reading with explicit column names.
twitter_data.shape

(1600000, 6)

In [13]:
# Preview the first rows with the named columns.
twitter_data.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [14]:
# Count the missing (null) values in each column.
twitter_data.isnull().sum()

target    0
id        0
date      0
flag      0
user      0
text      0
dtype: int64

In [15]:
# Number of tweets per value of the target label.
twitter_data['target'].value_counts()

target
0    800000
4    800000
Name: count, dtype: int64

In [16]:
# Relabel the positive class from 4 to 1 so the targets are 0 (negative) and 1 (positive).
# Assigning the remapped column back updates twitter_data itself; no new DataFrame is created.
twitter_data['target'] = twitter_data['target'].replace({4: 1})
twitter_data['target'].value_counts()

target
0    800000
1    800000
Name: count, dtype: int64

0=> Negative Tweet

1=> Positive Tweet

**Stemming**

In [17]:
# Single Porter stemmer instance, used by stemming() for every word.
port_stem=PorterStemmer()

In [18]:
def stemming(content):
  """Clean one tweet and return its stemmed, stop-word-free tokens as one string.

  Steps: replace URLs, e-mail-like tokens, @mentions and every character that
  is neither a letter nor whitespace with a space; lower-case; split on
  whitespace; drop English stop words; Porter-stem the remaining words;
  re-join them with single spaces.

  Args:
    content: raw tweet text (str).

  Returns:
    str: space-separated stemmed tokens ('' when no token survives).
  """
  combined_pattern = r'http[s]?://\S+|\S+@\S+|@\w+|[^a-zA-Z\s]'

  # Load the stop-word list once and keep it as a set on the function object.
  # Previously stopwords.words('english') was called, and the resulting list
  # scanned linearly, for every single word of every tweet.
  stop_words = getattr(stemming, '_stop_words', None)
  if stop_words is None:
    stop_words = frozenset(stopwords.words('english'))
    stemming._stop_words = stop_words

  # Remove unwanted patterns (each match becomes a space so words stay separated)
  cleaned = re.sub(combined_pattern, ' ', content)
  tokens = cleaned.lower().split() # split on any run of whitespace
  stemmed_tokens = [port_stem.stem(word) for word in tokens if word not in stop_words]

  return ' '.join(stemmed_tokens)

In [19]:
# Earlier draft of stemming(), kept for reference only (superseded by the definition above).
# def stemming(content):

#     # Remove unwanted patterns
#   stemmed_content = re.sub('[^a-zA-Z]', ' ', content) # ^ =>not i.e. replace all non-alphabetic characters in content with a space
#   stemmed_content= stemmed_content.lower()
#   stemmed_content= stemmed_content.split() # Split the string into a list of words based on whitespace
#   stemmed_content= [port_stem.stem(word) for word in stemmed_content if word not in stopwords.words('english')]
#   stemmed_content= ' '.join(stemmed_content) # Joins the list of stemmed words back into a single string for each tweet, with words separated by spaces.

#   return stemmed_content

In [20]:
# Run stemming() on every tweet and store the result in a new 'stemmed_content' column.
twitter_data['stemmed_content'] = twitter_data['text'].apply(stemming)

In [28]:
# Preview the frame, now including the 'stemmed_content' column.
twitter_data.head()

Unnamed: 0,target,id,date,flag,user,text,stemmed_content
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",awww bummer shoulda got david carr third day
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,upset updat facebook text might cri result sch...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,dive mani time ball manag save rest go bound
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,whole bodi feel itchi like fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",behav mad see


In [22]:
# Inspect the cleaned and stemmed text column.
print(twitter_data['stemmed_content'])

0               awww bummer shoulda got david carr third day
1          upset updat facebook text might cri result sch...
2               dive mani time ball manag save rest go bound
3                            whole bodi feel itchi like fire
4                                              behav mad see
                                 ...                        
1599995                           woke school best feel ever
1599996              thewdb com cool hear old walt interview
1599997                         readi mojo makeov ask detail
1599998    happi th birthday boo alll time tupac amaru sh...
1599999                                 happi charitytuesday
Name: stemmed_content, Length: 1600000, dtype: object


In [23]:
# Inspect the label column (0 = negative, 1 = positive after the remap).
print(twitter_data['target'])

0          0
1          0
2          0
3          0
4          0
          ..
1599995    1
1599996    1
1599997    1
1599998    1
1599999    1
Name: target, Length: 1600000, dtype: int64


In [32]:
# Separate the features (stemmed text) from the labels, both as NumPy arrays

X= twitter_data['stemmed_content'].values
Y=twitter_data['target'].values

In [35]:
# The printout is abbreviated: only the first three and last three entries are shown.
print(X)

['awww bummer shoulda got david carr third day'
 'upset updat facebook text might cri result school today also blah'
 'dive mani time ball manag save rest go bound' ...
 'readi mojo makeov ask detail'
 'happi th birthday boo alll time tupac amaru shakur'
 'happi charitytuesday']


In [36]:
# Labels in the same order as X (abbreviated printout).
print(Y)

[0 0 0 ... 1 1 1]


In [63]:
# Split the dataset into train and test sets: 5% is held out for testing (test_size=0.05),
# the split is stratified on Y, and random_state=2 makes it reproducible.
X_train, X_test, Y_train, Y_test= train_test_split(X,Y, test_size=0.05, stratify= Y, random_state=2)

In [64]:
# Sizes of the full, training and test sets: a 95% / 5% split (test_size=0.05)
print(X.shape, X_train.shape, X_test.shape)

(1600000,) (1520000,) (80000,)


In [65]:
# Training tweets, still as stemmed text at this point.
print(X_train)

['woke' 'finish studio new song myspac tomorrow may th'
 'hey love good hear voic miss much hope fun meet xoxo' ...
 'want say hi follow tweeti day'
 'six pack juli ab beer even like beer punish'
 'love wake folger bad voic deeper']


In [66]:
# Test tweets, still as stemmed text at this point.
print(X_test)

['hope wait anoth christen parti see u'
 'ohai petra mr sexi bass delet twitter life hollow shell'
 'ate dinner yum work go boyfrannn' ...
 'jason mraz anoop alex love child right sad stori day'
 'tonight hilar fun long time also thunder scare dog scare'
 'phone broken txt bleh need new phone anyway']


In [67]:
# Convert the text to numeric features with TF-IDF.
# TF-IDF weights a word by how often it occurs in a tweet, scaled down for words that are
# common across all tweets. It never looks at the sentiment labels.
# NOTE: this cell rebinds X_train / X_test from text to sparse matrices, so it cannot be
# re-run on its own; re-run the train/test split cell first.
vectorizer=TfidfVectorizer()

# Learn the vocabulary and IDF weights from the training tweets only, then transform them.
X_train=vectorizer.fit_transform(X_train)
# Transform the test tweets with the vocabulary and IDF weights learned from the training data.
# Fitting on the test set as well would leak test information into the features.
X_test=vectorizer.transform(X_test)

In [68]:
# Sparse TF-IDF matrix for the training set, printed as (row, column index) followed by the weight.
print(X_train)

  (0, 205181)	1.0
  (1, 182108)	0.3482657480621285
  (1, 113898)	0.35654105159936617
  (1, 186299)	0.27536344665616547
  (1, 123656)	0.4170653661808539
  (1, 169581)	0.3302465226105228
  (1, 126579)	0.2646968176940179
  (1, 174937)	0.4595487670632621
  (1, 62292)	0.333929123507017
  (2, 207925)	0.41573165351182517
  (2, 115184)	0.3355073177707557
  (2, 67103)	0.2756693127459844
  (2, 83196)	0.25016282936557266
  (2, 122010)	0.2609106046789555
  (2, 118310)	0.23785015615498517
  (2, 198390)	0.40393497726216165
  (2, 79148)	0.32641295786817537
  (2, 72376)	0.21219064972691573
  (2, 108428)	0.21921460835484427
  (2, 80630)	0.2976427037908759
  (3, 75901)	0.40055468819971907
  (3, 55462)	0.32304154839962157
  (3, 188342)	0.6748254335428774
  (3, 31348)	0.5289713924329635
  (4, 147117)	0.6607332931021982
  :	:
  (1519995, 186299)	0.22893922061729496
  (1519996, 12167)	0.5449981126944131
  (1519996, 76977)	0.5299854393615543
  (1519996, 183848)	0.4135887005181912
  (1519996, 205181)	0.501035

In [69]:
# Sparse TF-IDF matrix for the test set, printed as (row, column index) followed by the weight.
print(X_test)

  (0, 199523)	0.304427592034166
  (0, 160539)	0.2765360423733405
  (0, 137310)	0.3716866393494849
  (0, 83196)	0.2841091124397432
  (0, 33870)	0.6974379415217449
  (0, 7758)	0.35434874144050216
  (1, 191198)	0.16450539504384182
  (1, 162984)	0.33339714492766404
  (1, 161817)	0.26749413937927063
  (1, 139489)	0.4346690321525171
  (1, 131632)	0.41725318814784124
  (1, 121519)	0.24560331352859055
  (1, 105347)	0.19114265204485772
  (1, 82420)	0.3962793437949972
  (1, 45061)	0.2674730614677787
  (1, 15417)	0.3189955417319289
  (2, 211176)	0.4051973500718169
  (2, 206035)	0.20205716842825075
  (2, 71783)	0.17933067044263976
  (2, 47354)	0.3247841921194308
  (2, 23645)	0.7222494538828408
  (2, 11047)	0.36837409482127736
  (3, 211315)	0.4481106494920969
  (3, 166517)	0.5181373027954738
  (3, 139019)	0.4174607640969708
  :	:
  (79997, 156806)	0.20991086919782048
  (79997, 153618)	0.2121273207602883
  (79997, 121530)	0.4359177342906387
  (79997, 108428)	0.16869561104002614
  (79997, 92306)	0.36

In [70]:
# Train ML model - Logistic Regression
# NOTE(review): max_iter=1000 presumably gives the solver room to converge on this large input - confirm.

model=LogisticRegression(max_iter=1000)

In [71]:
# Fit the classifier on the TF-IDF training matrix and its labels.
model.fit(X_train,Y_train)

In [72]:
# Model evaluation on the training data

# X_train_pred holds the predicted labels for the training rows (despite the X_ prefix).
X_train_pred= model.predict(X_train)
# Fraction of training tweets whose predicted label matches the true label.
training_data_accuracy= accuracy_score(Y_train, X_train_pred)

In [73]:
# Report the accuracy measured on the training set.
print('Accuracy of training data:' ,training_data_accuracy)

Accuracy of training data: 0.7919736842105263


In [74]:
# Model evaluation on the held-out test data

# X_test_pred holds the predicted labels for the test rows (despite the X_ prefix).
X_test_pred= model.predict(X_test)
# Fraction of test tweets whose predicted label matches the true label.
test_data_accuracy= accuracy_score(Y_test, X_test_pred)

In [75]:
# Report the accuracy measured on the held-out test set.
print('Accuracy of testing data:' ,test_data_accuracy)


Accuracy of testing data: 0.77265


# Save the Trained Model

In [76]:
import pickle

In [77]:
filename= 'trained_model.sav'
# Use a context manager so the file is flushed and closed as soon as the dump finishes;
# the bare open() left the handle open.
# NOTE: only the classifier is saved here. The fitted `vectorizer` is also needed to turn
# new raw text into features for this model.
with open(filename, 'wb') as model_file: #wb-> writing in binary format
  pickle.dump(model, model_file)

In [78]:
#Using saved model for future predictions
# Only unpickle files you created yourself: pickle.load can execute arbitrary code.
# NOTE(review): assumes the working directory is /content, where trained_model.sav was written - confirm.
# The context manager closes the file handle once loading is done.
with open('/content/trained_model.sav', 'rb') as model_file:
  loaded_model= pickle.load(model_file)

In [79]:
# Score one held-out tweet (row 200 of the test matrix) with the reloaded model.
X_new=X_test[200]
print(Y_test[200]) # true label for that tweet

prediction= loaded_model.predict(X_new)
print(prediction)

# Label 0 means negative; any other label is reported as positive.
print("Negative tweet" if prediction[0]==0 else "Positive Tweet")

0
[0]
Negative tweet
