<a href="https://colab.research.google.com/github/Annamjohn/Twitter-Sentiment-Analysis/blob/main/Sentiment%20Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
# Install kaggle library
! pip install kaggle



Upload your kaggle.json file

In [5]:
# Configure the Kaggle API credentials from the uploaded kaggle.json file.
# Assumes kaggle.json is present in the current working directory.

# Create the config directory (-p: no error if it already exists).
!mkdir -p ~/.kaggle
# Copy the token into the config directory.
!cp kaggle.json ~/.kaggle/
# Restrict the token to owner read/write only.
!chmod 600 ~/.kaggle/kaggle.json

In [6]:
# fetch and import Twitter Sentiment dataset using API
!kaggle datasets download -d jp797498e/twitter-entity-sentiment-analysis

Dataset URL: https://www.kaggle.com/datasets/jp797498e/twitter-entity-sentiment-analysis
License(s): CC0-1.0
Downloading twitter-entity-sentiment-analysis.zip to /content
  0% 0.00/1.99M [00:00<?, ?B/s]
100% 1.99M/1.99M [00:00<00:00, 92.0MB/s]


In [7]:
# Extract the downloaded dataset archive into the current working directory.

from zipfile import ZipFile
dataset= '/content/twitter-entity-sentiment-analysis.zip'

# Bind the archive to `archive` rather than `zip`: in a notebook the `with`
# target becomes a kernel-wide global, so naming it `zip` would shadow the
# builtin zip() for every later cell.
with ZipFile(dataset, 'r') as archive:
  archive.extractall()  # no path given -> extracts into the current working directory
  print("The dataset has been extracted")

The dataset has been extracted


In [8]:
#import the dependencies
import numpy as np
import pandas as pd
import re #pattern matching, search through data
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [9]:
import nltk
nltk.download('stopwords')
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Data Processing

In [36]:
# Load the training and validation CSV files into pandas DataFrames.
# Column names are supplied via `names=` rather than read from the files.
cols=['tweetid', 'entity', 'target', 'content']

data0 = pd.read_csv("/content/twitter_training.csv",names=cols)
data1 = pd.read_csv("/content/twitter_validation.csv",names=cols)

# Stack both frames into one; ignore_index=True renumbers the rows from 0.
twitter_data = pd.concat([data0,data1], ignore_index=True)

In [37]:
#no. of rows and columns
twitter_data.shape

(75682, 4)

In [38]:
twitter_data.head()

Unnamed: 0,tweetid,entity,target,content
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [40]:
twitter_data.isnull().sum() # sum of no. of null vals in each column



tweetid      0
entity       0
target       0
content    686
dtype: int64

In [43]:
twitter_data['content'] = twitter_data['content'].fillna('')

In [44]:
twitter_data.isnull().sum()

tweetid    0
entity     0
target     0
content    0
dtype: int64

In [45]:
twitter_data['target'].value_counts()

target
Negative      22808
Positive      21109
Neutral       18603
Irrelevant    13162
Name: count, dtype: int64

**Stemming**

In [47]:
port_stem=PorterStemmer()

In [48]:
def stemming(content, stop_words=None, stemmer=None):
  """Normalise one tweet: keep letters only, lowercase, drop stop words, stem.

  Args:
    content: Raw tweet text (str).
    stop_words: Optional collection of lowercase words to discard. Defaults
      to NLTK's English stop-word list, which is loaded once and cached on
      this function instead of being re-read for every word.
    stemmer: Optional object exposing ``stem(word) -> str``. Defaults to the
      notebook-level ``port_stem`` PorterStemmer.

  Returns:
    The surviving stemmed words joined by single spaces ('' if none survive).
  """
  if stop_words is None:
    # stopwords.words('english') re-reads the word list on every call; the
    # previous version called it once per word inside the filter. Load it
    # once and keep it as a frozenset for O(1) membership tests.
    cached = getattr(stemming, '_english_stop_words', None)
    if cached is None:
      cached = frozenset(stopwords.words('english'))
      stemming._english_stop_words = cached
    stop_words = cached
  if stemmer is None:
    stemmer = port_stem

  # Replace every non-alphabetic character with a space, then lowercase and
  # split on whitespace to get the individual words.
  words = re.sub('[^a-zA-Z]', ' ', content).lower().split()

  # Drop stop words, stem what remains, and rebuild a single string.
  return ' '.join(stemmer.stem(word) for word in words if word not in stop_words)

In [49]:
twitter_data['stemmed_content'] = twitter_data['content'].apply(stemming)

In [50]:
twitter_data.head()

Unnamed: 0,tweetid,entity,target,content,stemmed_content
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...,im get borderland murder
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...,come border kill
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...,im get borderland kill
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...,im come borderland murder
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...,im get borderland murder


In [51]:
print(twitter_data['stemmed_content'])

0                                 im get borderland murder
1                                         come border kill
2                                   im get borderland kill
3                                im come borderland murder
4                                 im get borderland murder
                               ...                        
75677    toronto art cultur capit canada wonder want st...
75678    actual good move tot bring viewer one peopl go...
75679    today suck time drink wine n play borderland s...
75680            bought fraction microsoft today small win
75681    johnson johnson stop sell talc babi powder u c...
Name: stemmed_content, Length: 75682, dtype: object


In [52]:
print(twitter_data['target'])

0          Positive
1          Positive
2          Positive
3          Positive
4          Positive
            ...    
75677    Irrelevant
75678    Irrelevant
75679      Positive
75680      Positive
75681       Neutral
Name: target, Length: 75682, dtype: object


In [53]:
# Separate the features (preprocessed tweet text) from the labels (sentiment class).

X= twitter_data['stemmed_content'].values
Y=twitter_data['target'].values

In [54]:
print(X) # 1st 3 and last 3

['im get borderland murder' 'come border kill' 'im get borderland kill'
 ...
 'today suck time drink wine n play borderland sun come hate day tomorrow'
 'bought fraction microsoft today small win'
 'johnson johnson stop sell talc babi powder u canada j mp e ytdv reuter http co dsautgb p']


In [55]:
print(Y)

['Positive' 'Positive' 'Positive' ... 'Positive' 'Positive' 'Neutral']


In [56]:
#Split dataset into train and test set
# test_size=0.05 holds out 5% of the rows for testing; stratify=Y keeps the class
# proportions the same in both splits; random_state=42 makes the shuffle repeatable.
X_train, X_test, Y_train, Y_test= train_test_split(X,Y, test_size=0.05, stratify= Y, random_state=42)

In [57]:
print(X.shape, X_train.shape, X_test.shape) # full / train / test sizes; test_size=0.05 gives a 95-5 split (not 80-20)

(75682,) (71897,) (3785,)


In [58]:
print(X_train)

['n k ld k callm get guy ban pic wikipedia org fvhuk tayp'
 'instal undead nightmar xbox e right realis like gb like base game keep forget basic stand alon game expans pack'
 'want friend know read msg app perfect bit ly qemcad link ft ly hcopld http co l pew ab'
 ... 'imma act like see'
 'work someth sinc morn glitch slow draft sound insid someth done end'
 'great amazon fun chanc win could also win particip amazon daili quiz pm pic twitter com qxjihkv ke']


In [59]:
print(X_test)

['red dead redempt amzn xdxrpd http co k n dfwtk'
 'time despit fact current million peopl live poverti line access health servic access health care access health care'
 'hey rhandlerr fuck good think mask fuck snag pic walk around talk associ custom without plexiglass separ howard lane locat austin tx pic twitter com z x amaix'
 ...
 'aye amazon realli run shit lmfao bulli congress tlkn bout nigga year make rule keep wana afraid jeff bezo lol nigga differ af'
 'watch part'
 'real jackboy front toni scott stolen slogan sell shit merch oooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo']


In [60]:
#Convert textual data to numerical data
vectorizer=TfidfVectorizer() # weights each word by TF-IDF (frequency within a tweet, discounted by how common the word is across tweets); the sentiment labels are not used here

# NOTE: this cell overwrites X_train/X_test (text -> sparse matrix); re-running it on its
# own would feed the already-vectorized matrices back into the vectorizer.
X_train=vectorizer.fit_transform(X_train) # learns the vocabulary and IDF weights from the training tweets only, then vectorizes them
X_test=vectorizer.transform(X_test) # reuses the vocabulary and IDF values learned from the training data, so no information from the test set leaks into the features

In [61]:
print(X_train)

  (0, 18925)	0.43643017903739983
  (0, 7088)	0.43643017903739983
  (0, 13761)	0.2578943485335108
  (0, 21490)	0.2836522883839648
  (0, 14465)	0.16173854206749264
  (0, 1370)	0.2042230509004866
  (0, 8048)	0.19979515081961197
  (0, 7392)	0.1543006583903037
  (0, 2607)	0.46554222188899497
  (0, 10835)	0.3518046554531456
  (1, 14003)	0.23853302311035593
  (1, 6015)	0.26694303312733547
  (1, 482)	0.24400893276916705
  (1, 18242)	0.251399282465922
  (1, 1441)	0.23885139784858622
  (1, 6745)	0.24833820756448757
  (1, 10309)	0.1853038075765972
  (1, 7154)	0.21234570816860907
  (1, 1433)	0.21552144162089903
  (1, 7278)	0.23728757735248812
  (1, 11037)	0.26359585856363676
  (1, 15709)	0.286364898860486
  (1, 16236)	0.17818711434578763
  (1, 21863)	0.16418987937075957
  (1, 13073)	0.28015713166692524
  :	:
  (71895, 7576)	0.3106088105066404
  (71895, 17704)	0.3094818268381356
  (71895, 17977)	0.2632303235309792
  (71895, 5011)	0.24528887502227587
  (71895, 5662)	0.24466007514981364
  (71895, 175

In [62]:
print(X_test)

  (0, 21911)	0.5300998124039256
  (0, 15801)	0.25687858885375325
  (0, 15788)	0.2411500581321397
  (0, 8877)	0.22684415755403708
  (0, 4653)	0.5394007797045655
  (0, 4364)	0.23707128948423226
  (0, 3369)	0.22155847279260318
  (0, 595)	0.3836037054233076
  (1, 19465)	0.11930780242267115
  (1, 17137)	0.16059310178758623
  (1, 14819)	0.20563189957799843
  (1, 14299)	0.12722814661082876
  (1, 12173)	0.17915550101547342
  (1, 11111)	0.14351544395509194
  (1, 11072)	0.1752261160331894
  (1, 8357)	0.5554677672893699
  (1, 6125)	0.17338709797212343
  (1, 4597)	0.19217653873723042
  (1, 4082)	0.17372323625206032
  (1, 2718)	0.33087086483403816
  (1, 94)	0.5503742511199581
  (2, 21561)	0.16284575209974425
  (2, 21123)	0.2008198361052675
  (2, 20045)	0.2673803477912168
  (2, 20030)	0.1046393436377666
  :	:
  (3782, 11229)	0.1527358718278577
  (3782, 11164)	0.22244154133842228
  (3782, 10309)	0.14544202419879274
  (3782, 9820)	0.21869881080474496
  (3782, 4706)	0.17110139917221956
  (3782, 3624)	0

In [63]:
# Train ML model- Logistic Regression

model=LogisticRegression(max_iter=1000)

In [64]:
model.fit(X_train,Y_train)

In [65]:
#Model Evaluation on training data
# X_train_pred holds the labels the model predicts for the training rows (despite the X_ prefix).

X_train_pred= model.predict(X_train)
training_data_accuracy= accuracy_score(Y_train, X_train_pred)

In [66]:
print('Accuracy of training data:' ,training_data_accuracy)

Accuracy of training data: 0.8362796778725121


In [67]:
#Model Evaluation on test data
# X_test_pred holds the labels the model predicts for the held-out rows (despite the X_ prefix).

X_test_pred= model.predict(X_test)
test_data_accuracy= accuracy_score(Y_test, X_test_pred)

In [68]:
print('Accuracy of testing data:' ,test_data_accuracy)


Accuracy of testing data: 0.7968295904887714


# Save the Trained Model

In [69]:
import pickle

In [70]:
filename= 'trained_model.sav'
# Use a context manager so the file is flushed and closed as soon as the model
# is written; a bare open() leaves the handle open until garbage collection.
# NOTE: only the classifier is saved. The fitted `vectorizer` is also needed to
# turn new text into features for this model and is not persisted here.
with open(filename, 'wb') as model_file: #wb-> writing in binary format
  pickle.dump(model, model_file)

In [71]:
#Using saved model for future predictions
# The absolute path assumes the working directory is /content, which is where the
# relative 'trained_model.sav' written by the save cell ends up on Colab.
# Only unpickle files you created yourself: pickle.load can execute arbitrary code.
with open('/content/trained_model.sav', 'rb') as model_file: #rb-> reading in binary format
  loaded_model= pickle.load(model_file)

In [78]:
# Sanity-check the reloaded model on one held-out example: row 700 of the vectorized test set.
X_new=X_test[700]
print(Y_test[700]) # true label for that row

prediction= loaded_model.predict(X_new)
print(prediction) # label predicted by the reloaded model



Neutral
['Neutral']
