In [None]:
# Download the kazanova/sentiment140 dataset archive with the Kaggle CLI
# (saved as sentiment140.zip in the working directory).
# NOTE(review): presumably requires Kaggle API credentials to be configured — confirm.
! kaggle datasets download -d kazanova/sentiment140

Dataset URL: https://www.kaggle.com/datasets/kazanova/sentiment140
License(s): other
Downloading sentiment140.zip to /content
 98% 79.0M/80.9M [00:02<00:00, 41.7MB/s]
100% 80.9M/80.9M [00:02<00:00, 32.0MB/s]


In [None]:
from zipfile import ZipFile

# Archive produced by the Kaggle download step.
dataset = 'sentiment140.zip'

# Unpack every member of the archive into the current working directory.
with ZipFile(dataset, mode='r') as archive:
    archive.extractall()
    print('Extracted all files from', dataset)

Extracted all files from sentiment140.zip


In [None]:
import pandas as pd
import re

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords # and, the, a
from nltk.stem.porter import PorterStemmer # running - > run

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# The CSV has no header row. Without header=None pandas consumes the first
# tweet as the column names, which silently drops one record (1,599,999 rows
# instead of 1,600,000) and produces meaningless column labels.
twitter_sentiment_df = pd.read_csv(
    'training.1600000.processed.noemoticon.csv',
    encoding='ISO-8859-1',
    header=None,
)

print('Number of rows', twitter_sentiment_df.shape[0])
print('Number of columns', twitter_sentiment_df.shape[1])

twitter_sentiment_df.head()

Number of rows 1599999
Number of columns 6


Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [None]:
# Names assigned to the six columns of the header-less CSV, in file order.
columns = ['target', 'id', 'date', 'flag', 'user', 'text']

# Supplying `names` makes pandas treat the first line as data rather than a header.
twitter_sentiment_df = pd.read_csv(
    'training.1600000.processed.noemoticon.csv',
    names=columns,
    encoding='ISO-8859-1',
)

row_total, column_total = twitter_sentiment_df.shape
print('Number of rows', row_total)
print('Number of columns', column_total)

twitter_sentiment_df.head()

Number of rows 1600000
Number of columns 6


Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [None]:
# Relabel target value 4 as 1 so the label column is binary (0 / 1);
# the class-count cell further down treats 0 as negative and 1 as positive.
relabelled_target = twitter_sentiment_df['target'].replace(to_replace=4, value=1)
twitter_sentiment_df['target'] = relabelled_target
twitter_sentiment_df.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [None]:
# Total number of missing cells across every column of the frame.
null_total = twitter_sentiment_df.isnull().sum().sum()

if null_total != 0:
    print('Null values')
else:
    print('No null values')

No null values


In [None]:
# Count the classes with vectorized comparisons instead of a Python-level loop
# over every row of the frame.
target_values = twitter_sentiment_df['target']
negative_count = int((target_values == 0).sum())
positive_count = int((target_values == 1).sum())

# Anything that is neither 0 nor 1 (including missing values) is reported
# once per offending row, matching the original per-row message.
unknown_count = len(target_values) - negative_count - positive_count
for _ in range(unknown_count):
    print("Valoare necunoscuta")

print("Number of positive tweets:", positive_count)
print("Number of negative tweets:", negative_count)

Number of positive tweets: 800000
Number of negative tweets: 800000


In [None]:
english_stopwords = set(stopwords.words('english'))

# Built once and reused for every tweet: previously a new PorterStemmer was
# constructed for each individual token.
_porter_stemmer = PorterStemmer()
_non_letter_pattern = re.compile('[^A-Za-z]')


def stem_text(text):
    """Normalize a tweet into a space-separated string of stemmed tokens.

    Every character that is not an ASCII letter is replaced with a space, the
    result is lower-cased and split on whitespace, tokens found in
    ``english_stopwords`` are dropped, and each remaining token is stemmed
    with the Porter stemmer.

    Args:
        text: The raw tweet text (a ``str``).

    Returns:
        The stemmed tokens joined by single spaces; an empty string when no
        token survives the filtering.
    """
    letters_only = _non_letter_pattern.sub(' ', text).lower()
    return ' '.join(
        _porter_stemmer.stem(token)
        for token in letters_only.split()
        if token not in english_stopwords
    )

In [None]:
# Run the stemming pipeline over every tweet and store the result in a new column.
tweet_texts = twitter_sentiment_df['text']
twitter_sentiment_df['stemmed_tweet'] = tweet_texts.apply(stem_text)

In [None]:
# Show the raw and stemmed text side by side for the first few tweets.
preview_columns = ['text', 'stemmed_tweet']
twitter_sentiment_df[preview_columns].head()

Unnamed: 0,text,stemmed_tweet
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",switchfoot http twitpic com zl awww bummer sho...
1,is upset that he can't update his Facebook by ...,upset updat facebook text might cri result sch...
2,@Kenichan I dived many times for the ball. Man...,kenichan dive mani time ball manag save rest g...
3,my whole body feels itchy and like its on fire,whole bodi feel itchi like fire
4,"@nationwideclass no, it's not behaving at all....",nationwideclass behav mad see


In [None]:
# Features are the stemmed tweets; labels are the binary target column.
Y = twitter_sentiment_df['target'].values
X = twitter_sentiment_df['stemmed_tweet'].values

# Hold out 20% for testing, stratified on the label; the fixed seed keeps
# the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

In [None]:
vectorizer = TfidfVectorizer()

# Fit the vocabulary / IDF weights on the training tweets only, then apply the
# same fitted transform to the test tweets.
# NOTE: X_train and X_test are rebound to their vectorized forms here, so
# re-running this cell on its own would feed the already-vectorized matrices
# back into the vectorizer; re-run the split cell first.
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [None]:
# Logistic regression classifier with the iteration cap set to 1000.
model = LogisticRegression(max_iter=1000)
model.fit(X=X_train, y=Y_train)

In [None]:
# Score the fitted model on the same data it was trained on.
X_train_predictions = model.predict(X_train)
training_accuracy = accuracy_score(
    y_true=Y_train,
    y_pred=X_train_predictions,
)

In [None]:
# Report the training accuracy as a percentage with two decimals.
training_accuracy_pct = training_accuracy * 100
print('Accuracy score on training data: {:.2f}%'.format(training_accuracy_pct))

Accuracy score on training data: 79.87%


In [None]:
# Score the fitted model on the held-out test split.
X_test_predictions = model.predict(X_test)
testing_accuracy = accuracy_score(
    y_true=Y_test,
    y_pred=X_test_predictions,
)

In [None]:
# Report the held-out accuracy as a percentage with two decimals.
testing_accuracy_pct = testing_accuracy * 100
print('Accuracy score on testing data: {:.2f}%'.format(testing_accuracy_pct))

Accuracy score on testing data: 77.67%
