In [2]:
!pip install kaggle




In [13]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json


In [14]:
!kaggle --version


Kaggle API 1.7.4.5


In [15]:
!kaggle datasets download -d kazanova/sentiment140
# it is used to download .zip dataset from kaggle

Dataset URL: https://www.kaggle.com/datasets/kazanova/sentiment140
License(s): other
Downloading sentiment140.zip to /content
  0% 0.00/80.9M [00:00<?, ?B/s]
100% 80.9M/80.9M [00:00<00:00, 1.50GB/s]


In [16]:
!unzip sentiment140.zip  # extracting the compressed dataset


Archive:  sentiment140.zip
  inflating: training.1600000.processed.noemoticon.csv  


In [74]:
#importing the dependecies
import numpy as np
import pandas as pd
import re            # used for pattern matching
from nltk.corpus import stopwords  # we will use it to remove stopwords
from nltk.stem.porter import PorterStemmer  # reduce the words to asimple formate
from sklearn.feature_extraction.text import TfidfVectorizer  # using this we will convert text to vectors(numerial)
from sklearn.model_selection import train_test_split  # to split the data in train and test
from sklearn.linear_model import LogisticRegression # mode for learning
from sklearn.metrics import accuracy_score  # to calculate accuracy
from nltk.stem import WordNetLemmatizer  # used for lemmatization improvement
from sklearn.metrics import classification_report  # improvemtn

In [95]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/shrishmishra/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/shrishmishra/nltk_data...
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/shrishmishra/nltk_data...


True

In [96]:
# printing stopwords in english
print(stopwords.words('english'))  #general stop words that we don't need to understand the context of the sentence and we will remove from our dataset

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

In [97]:
# Data processing

In [98]:
# loading the data from csv file thrugh pandas dataframe
twitter_data  = pd.read_csv('training.1600000.processed.noemoticon.csv', encoding  = 'ISO-8859-1')

In [99]:
#checking the number of rows and columns
twitter_data.shape # so there are 16lakh tweets and 6 columns of it

(1599999, 6)

In [100]:
# printing first 5 rows of dataframe
twitter_data.head() # we can see that the columns name is not read by pandas and hence we will change it to read the column names

Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [101]:
# naming the columns and reading the dataset again
column_names = ['target', 'id', 'date', 'flag' , 'user' , 'text']
twitter_data  = pd.read_csv('training.1600000.processed.noemoticon.csv', names = column_names,  encoding  = 'ISO-8859-1')

In [102]:
twitter_data.shape

(1600000, 6)

In [103]:
twitter_data.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [104]:
# counting the number of missing values in dataset
twitter_data.isnull().sum()  # no values are missing in dataset and hence no need to clean it

target    0
id        0
date      0
flag      0
user      0
text      0
dtype: int64

In [105]:
# checking the istribution of target columns
twitter_data['target'].value_counts()
# target is basically the sentimental label of each tweet
# 0 -> negative , 2 -> neutral , 4 -> positive
# and we can see that the data is distributed equally in 0 & 4
# if it wasn't equally distributed then we need to perform upsampling and downsampling

target
0    800000
4    800000
Name: count, dtype: int64

In [106]:
# we will convert the label of 4 to 1 so that 0 = negative and 1 = positive

twitter_data.replace({'target': {4:1}}, inplace = True)

# inplace = true saves the changes to the original dataset or otherwise it will create a new copy of the datset and make changes there

In [107]:
twitter_data['target'].value_counts() # now the target value of 4 is converted to 1

target
0    800000
1    800000
Name: count, dtype: int64

In [108]:
# 0 -> negative
# 1 -> positive

In [109]:
# Stemming is the process of reducing a word to its Root word

In [110]:
port_stem = PorterStemmer() # this will load the instance of PorterStemmer to variable

In [111]:
stop_words = set(stopwords.words('english'))

In [112]:
lemmatizer = WordNetLemmatizer()

In [113]:
def stemming(content):
    # Remove non-alphabetic characters
    stemmed_content = re.sub(r"http\S+", "", content) 
    stemmed_content = re.sub(r"@\w+", "", content) 
    stemmed_content = re.sub(r"#", "", content)
    stemmed_content = re.sub('[^a-zA-Z]', ' ', content)
    
    # Convert to lowercase
    stemmed_content = stemmed_content.lower()
    
    # Split into words
    words = stemmed_content.split()
    
    # Remove stopwords and apply stemming
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    
    # Join back into a sentence
    stemmed_content = ' '.join(words)
    
    return stemmed_content

In [114]:
twitter_data['stemmed_content'] = twitter_data['text'].apply(stemming) # creating new column and putting data of text after applying stemming funtion to it

In [115]:
twitter_data.head()

Unnamed: 0,target,id,date,flag,user,text,stemmed_content
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",switchfoot http twitpic com zl awww bummer sho...
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,upset update facebook texting might cry result...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,kenichan dived many time ball managed save res...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,whole body feel itchy like fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",nationwideclass behaving mad see


In [116]:
print(twitter_data['stemmed_content'])

0          switchfoot http twitpic com zl awww bummer sho...
1          upset update facebook texting might cry result...
2          kenichan dived many time ball managed save res...
3                            whole body feel itchy like fire
4                           nationwideclass behaving mad see
                                 ...                        
1599995                        woke school best feeling ever
1599996    thewdb com cool hear old walt interview http b...
1599997                       ready mojo makeover ask detail
1599998    happy th birthday boo alll time tupac amaru sh...
1599999    happy charitytuesday thenspcc sparkscharity sp...
Name: stemmed_content, Length: 1600000, dtype: object


In [117]:
# seperating data and label removing unnecessary labels

In [118]:
X = twitter_data['stemmed_content'].values
Y = twitter_data['target'].values

In [119]:
print(X)

['switchfoot http twitpic com zl awww bummer shoulda got david carr third day'
 'upset update facebook texting might cry result school today also blah'
 'kenichan dived many time ball managed save rest go bound' ...
 'ready mojo makeover ask detail'
 'happy th birthday boo alll time tupac amaru shakur'
 'happy charitytuesday thenspcc sparkscharity speakinguph h']


In [120]:
print(Y)

[0 0 0 ... 1 1 1]


In [121]:
# Splitting the data to train and test

In [122]:
X_train, X_test , Y_train , Y_test = train_test_split(X,Y, test_size = 0.2 , stratify=Y, random_state = 2)
# since we are splitting data to 80 and 20 hence there are chaces that all the -ve or +ve goes to X or Y and hance to prevent it we use (stratify = y) to tell split that we want equal split of 0 and 1 in both train and test data
# since the ml model includes randomeness while training we use random_state to control it

#random_state sets the seed for the random number generator.
#Same random_state → same results
#Different random_state → different results
#No random_state → results change on every run

In [123]:
print(X.shape , X_train.shape , X_test.shape)

(1600000,) (1280000,) (320000,)


In [124]:
# using feature extraction ( vectorizer) converting test to vectors(numricals)

In [125]:
vectorizer = TfidfVectorizer(ngram_range=(1,2),  max_features=300000, sublinear_tf=True)  # this is unigram + bigram
# this will check how many times a word is repeated and assign importance to it in form of nmerical values and hence like this each word will be given numeric value of importance

X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# hene by using this our moddel can identify which important words are there in +ve tweets and which are there in -ve tweets and then by learning it we can perform further classification

In [126]:
print(X_test)

  (np.int32(0), np.int32(6444))	0.11303416105760496
  (np.int32(0), np.int32(15011))	0.10692331198940076
  (np.int32(0), np.int32(15949))	0.27409129406945576
  (np.int32(0), np.int32(36507))	0.18268324356711782
  (np.int32(0), np.int32(63322))	0.28253703939405195
  (np.int32(0), np.int32(84630))	0.16801065276139915
  (np.int32(0), np.int32(84713))	0.321245027292135
  (np.int32(0), np.int32(90719))	0.15573969270629467
  (np.int32(0), np.int32(90978))	0.23794376764856595
  (np.int32(0), np.int32(130255))	0.18482652242326722
  (np.int32(0), np.int32(130260))	0.3025788873313616
  (np.int32(0), np.int32(174123))	0.298113514218058
  (np.int32(0), np.int32(178645))	0.11715962938099221
  (np.int32(0), np.int32(179275))	0.22290833893454956
  (np.int32(0), np.int32(249601))	0.14451577707097396
  (np.int32(0), np.int32(249612))	0.27622235454411
  (np.int32(0), np.int32(255385))	0.273586291741099
  (np.int32(0), np.int32(262182))	0.17524641765473256
  (np.int32(0), np.int32(262338))	0.283268190560

In [127]:
# Training the ML model ( logistic regression ) it is a classification model

In [128]:
model = LogisticRegression(max_iter = 1000 , C=2.0, solver="liblinear" , class_weight="balanced")

In [129]:
model.fit(X_train , Y_train)

In [130]:
# Model Evaluation using accuracy score on training data

In [131]:
X_train_prediction = model.predict(X_train)  # this will store the values predictd by our model on the training data
training_data_accuracy = accuracy_score(Y_train , X_train_prediction)  # comparing X_train data with the true labels 

In [132]:
print(training_data_accuracy)

0.84328046875


In [133]:
X_test_prediction = model.predict(X_test)  # this will store the values predictd by our model on the test data
test_data_accuracy = accuracy_score(Y_test , X_test_prediction)

In [134]:
print(test_data_accuracy)

0.7940125


In [135]:
# Previous Model Accuracy = 77.66 %
# Current Model Accuracy  = 79.40 %

In [137]:
print(classification_report(Y_test, X_test_prediction))

              precision    recall  f1-score   support

           0       0.80      0.78      0.79    160000
           1       0.79      0.81      0.80    160000

    accuracy                           0.79    320000
   macro avg       0.79      0.79      0.79    320000
weighted avg       0.79      0.79      0.79    320000



In [138]:
# saving the model
import pickle

In [140]:
filename = "trianed_model.sav"
pickle.dump(model, open(filename, 'wb'))