In [1]:
!pip install kaggle



In [2]:
# Copy the Kaggle API token (kaggle.json, expected in the working directory) into ~/.kaggle
# and restrict it to owner read/write only (chmod 600).
# NOTE(review): presumably ~/.kaggle/kaggle.json is where the kaggle CLI looks for credentials - confirm.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [3]:
# API to fetch the dataset from kaggle

In [4]:
# Download the kazanova/sentiment140 dataset archive with the Kaggle CLI.
!kaggle datasets download -d kazanova/sentiment140

Dataset URL: https://www.kaggle.com/datasets/kazanova/sentiment140
License(s): other
Downloading sentiment140.zip to /content
 90% 73.0M/80.9M [00:00<00:00, 121MB/s]
100% 80.9M/80.9M [00:00<00:00, 126MB/s]


In [5]:
# Extracting the compressed dataset

In [6]:
!unzip sentiment140.zip

Archive:  sentiment140.zip
  inflating: training.1600000.processed.noemoticon.csv  


In [7]:
# Importing the dependencies for the Projects

In [8]:
import numpy as np
import pandas as pd
import re # regular expressions - pattern matching
from nltk.corpus import stopwords # common words with little meaning of their own; they are filtered out of the tweets
from nltk.stem.porter import PorterStemmer # reduces words to their root form
from sklearn.feature_extraction.text import TfidfVectorizer # converts the textual data into numerical features
from sklearn.model_selection import train_test_split # splits the data into a training set and a test set
from sklearn.linear_model import LogisticRegression # the Logistic Regression ML model
from sklearn.metrics import accuracy_score # for determining the accuracy

In [9]:
import nltk
# Fetch the NLTK 'stopwords' corpus; stopwords.words('english') is used by the cells below.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [10]:
# Printing the stopwords in English
print(stopwords.words('english'))
# Stopwords are words in a sentence that carry little meaning of their own (e.g. 'the', 'is', 'and'),
# so they are removed from the tweets before the text is converted to features.

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

Data Processing

In [11]:
# loading the data

In [13]:
# Read the CSV that the unzip step extracts into the working directory
# (no cell in this notebook renames it to twitter_data.csv).
# No header/names are supplied here, so pandas uses the first record as the header row.
twitter_data = pd.read_csv('training.1600000.processed.noemoticon.csv', encoding='ISO-8859-1')

In [14]:
# Checking the shape of the loaded frame: (number of rows, number of columns)
twitter_data.shape

(1599999, 6)

In [15]:
# Preview the first rows. The column labels shown are really the first tweet record,
# because the file has no header row and none was declared when reading it.
twitter_data.head()

Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [16]:
# The file has no header row, so the column names are supplied manually;
# passing names= also keeps the first record as data instead of using it as the header.
columns=['target', 'id', 'date', 'flag', 'user', 'text']
# Read the CSV that the unzip step extracts into the working directory
# (no cell in this notebook renames it to twitter_data.csv).
twitter_data = pd.read_csv('training.1600000.processed.noemoticon.csv', names=columns, encoding='ISO-8859-1')

In [17]:
# Preview again to confirm the manually supplied column names are in place.
twitter_data.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [18]:
# Counting the number of missing values in each column of the dataset
twitter_data.isnull().sum()

Unnamed: 0,0
target,0
id,0
date,0
flag,0
user,0
text,0


In [19]:
# checking the distribution of the target column
twitter_data['target'].value_counts()
# label encoding in the raw data:
#   0 -> negative tweet
#   4 -> positive tweet

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
0,800000
4,800000


In [20]:
# Recode the positive label 4 -> 1 so the two classes read as 0 (negative) and 1 (positive).
# replace() returns a new frame that is rebound to the same name; this avoids inplace=True,
# which mutates a frame that earlier cells have already displayed.
twitter_data = twitter_data.replace({'target': {4: 1}})

In [21]:
# Re-check the label distribution after recoding 4 -> 1.
twitter_data['target'].value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
0,800000
1,800000


0 --> Negative Tweet

1 --> Positive Tweet

# **Stemming**

# Stemming is the process of reducing a word to its root word (its stem)

In [22]:
# example : actor, actress, acting (words with similar meaning) = act (root word)

In [23]:
# Single PorterStemmer instance, used by the stemming() function defined in the next cell.
port_stem = PorterStemmer()

In [24]:
# Built once up front: calling stopwords.words('english') for every token rebuilds the
# word list each time and then scans it linearly, which dominates the runtime when this
# function is applied to every tweet. A frozenset gives constant-time membership tests.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))

# Anything that is not an ASCII letter (digits, punctuation, symbols, ...) is replaced by a space.
_NON_LETTER_RE = re.compile('[^a-zA-Z]')


def stemming(content):
  """Clean one tweet and reduce its words to their stems.

  Steps: replace every non-letter character with a space, lowercase the text,
  split it into words, drop English stopwords, stem the remaining words with
  the module-level ``port_stem`` stemmer, and join them back with single spaces.

  Args:
    content: The raw tweet text (one value of the 'text' column).

  Returns:
    A single string of space-separated stemmed words; empty if no word survives.
  """
  words = _NON_LETTER_RE.sub(' ', content).lower().split()

  # The stopword check is made on the lowercased, unstemmed word; only surviving words are stemmed.
  stemmed_words = [port_stem.stem(word) for word in words if word not in _ENGLISH_STOPWORDS]

  return ' '.join(stemmed_words)

In [None]:
# Run stemming() on every tweet and keep the result in a new column next to the raw text.
twitter_data['stemmed_content'] = twitter_data['text'].map(stemming)

In [None]:
# NOTE(review): temp_df is not referenced by any later cell visible in this notebook -
# it looks like a leftover; confirm before removing.
temp_df = twitter_data['stemmed_content']

In [None]:
 print(twitter_data['stemmed_content'])

In [None]:
# Inspect the sentiment labels (0 = negative, 1 = positive after the recoding above).
print(twitter_data['target'])

In [None]:
# Separating the data and label
# X: the preprocessed (stemmed) tweet text, Y: the sentiment labels
X = twitter_data['stemmed_content'].values
Y = twitter_data['target'].values

In [None]:
 print(X)

In [None]:
# Inspect the label values.
print(Y)

# Splitting the data to training data & test data

In [None]:
# Hold out 20% of the tweets for testing; stratifying on Y keeps the class
# proportions the same in both parts, and the fixed random_state makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

## **test_size = 0.2**

'test_size = 0.2' means 20% of the data is held out as test data and the remaining 80% is used as training data

## **stratify = Y**

we are performing stratify on the data 'Y' because we need equal proportion of 2 classes (0 & 1) to go to both training & test data.

we don't want it to be like all positive (1) go to training and all the negatives (0) to go to test data

## **random_state = 2**

It seeds the random shuffling that is done before the data is separated, so the same train/test split is reproduced every time the notebook is run

In [None]:
# Sizes of the full, training and test feature arrays.
print(X.shape, X_train.shape, X_test.shape)

In [None]:
# Sizes of the full, training and test label arrays.
print(Y.shape, Y_train.shape, Y_test.shape)

# Converting the textual data to numerical data

converting the text to Numerical Values for ML model to understand ,
here we are going to use a process called "Feature Extraction"

## using the (vectorizer.fit_transform()) process, a vocabulary is learned from the training data and each word in a tweet is given a TF-IDF weight/importance: the weight grows with how many times the word is repeated in that tweet and shrinks the more tweets the word appears in. The test data is then converted with (vectorizer.transform()) using what was learned from the training data.
Example : if the word 'happy' is present 5 times in a tweet and the word 'fine' is present 2 times, then 'happy' gets the larger term-frequency component, and both values are then scaled down according to how common each word is across all tweets. While training, the model combines these numerical values with the labels to learn whether a word such as 'happy' points to a positive or a negative tweet.

In [None]:
vectorizer = TfidfVectorizer()

# fit_transform performs two operations on the training tweets:
# (a) fitting      - learning from the training text what the vectorizer needs for the conversion
# (b) transforming - converting the text into numerical values
X_train = vectorizer.fit_transform(X_train)
# The test data is only transformed, not fitted: it is converted using what was
# already learned from the training data.
X_test = vectorizer.transform(X_test)
# NOTE(review): X_train / X_test are rebound from text to vectorized output here, so
# re-running this cell without re-running the split cell passes already-vectorized
# data back into the vectorizer.

In [None]:
# Inspect the vectorized training data.
print(X_train)

In [None]:
# Inspect the vectorized test data.
print(X_test)

# **Training the Machine Learning Model**

**Logistic Regression**

In [None]:
# Logistic Regression classifier with max_iter set to 1000.
# NOTE(review): presumably raised above the library default so the solver converges on this data - confirm.
model = LogisticRegression(max_iter = 1000)

In [None]:
# Train the classifier.
model.fit(X_train, Y_train)
# X_train is the vectorized training tweets
# Y_train is the target (sentiment label) for the training tweets

# **Model Evaluation**

AccuracyScore : we are going to use AccuracyScore for Evaluating the accuracy of the model

In [None]:
# Predict labels for the data the model was trained on ...
X_train_prediction = model.predict(X_train)
# ... and compare those predictions with the true training labels.
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

In [None]:
# Report the accuracy measured on the training data.
print("Accuracy score on the training data :", training_data_accuracy)

# Testing the accuracy on the test data

In [None]:
# Predict labels for the held-out test data and compare them with the true test labels.
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [None]:
# Report the accuracy measured on the held-out test data.
print("Accuracy score on the test data :", test_data_accuracy)
# The model is considered to have performed well when the training accuracy and
# the test accuracy are very close to each other.

## Overfitting occurs when the accuracy of the training data is very high while accuracy of test data is very low

Model Accuracy = 77.6 %

# Exporting the Model

In [None]:
import pickle

In [None]:
filename = 'trained_model.sav'
# The context manager flushes and closes the file even if pickling fails;
# a bare open() passed to pickle.dump leaves the handle unclosed.
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)
# NOTE(review): only the classifier is saved. The fitted TfidfVectorizer is not persisted,
# so this file alone cannot score new raw tweet text - confirm whether it should be saved too.

# Using the Saved Model for future Predictions

In [None]:
# loading the saved model
# Uses the same relative path the export cell writes to, and closes the file after reading.
# pickle.load can execute arbitrary code, so only load files this notebook produced.
with open('trained_model.sav', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [None]:
# Take one vectorized test tweet (index 200) as the sample to score,
# and print its true label for comparison with the prediction.
X_new = X_test[200]
print(Y_test[200])

In [None]:
# Score the sample with the reloaded model and show the raw prediction.
prediction = loaded_model.predict(X_new)
print(prediction)

# Translate the predicted label into words: 0 is negative, anything else is positive.
print("Negative Tweet" if prediction[0] == 0 else 'Positive Tweet')