In [1]:
# Importing relevant libraries
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer


In [2]:
# Configuruing path to kaggle json file
import json
import os

# Ensuring the folder for Kaggle exists
os.makedirs(os.path.expanduser('~/.kaggle'), exist_ok=True)

# Moving the kaggle.json file to the ~/.kaggle/ directory
with open('kaggle.json', 'r') as f:
    creds = json.load(f)

# Creating the kaggle.json in the ~/.kaggle directory
with open(os.path.expanduser('~/.kaggle/kaggle.json'), 'w') as f:
    json.dump(creds, f)

# Setting the appropriate file permissions
os.chmod(os.path.expanduser('~/.kaggle/kaggle.json'), 0o600)


In [3]:
# Using API to fetch dataset from Kaggle
! kaggle datasets download -d kazanova/sentiment140

Dataset URL: https://www.kaggle.com/datasets/kazanova/sentiment140
License(s): other
sentiment140.zip: Skipping, found more recently modified local copy (use --force to force download)


In [5]:
# Extracting the compressed dataset
import zipfile

with zipfile.ZipFile('sentiment140.zip', 'r') as zip_ref:
    zip_ref.extractall()
    print("The dataset is extracted")

The dataset is extracted


In [6]:
import nltk
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kukre\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [10]:
# Printing the stopwords in English
print(stopwords.words("English"))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

# Data processing

In [14]:
# Loading the data
data = pd.read_csv(r"C:\Users\kukre\OneDrive\Documents\Season Of AI\training.1600000.processed.noemoticon.csv", encoding = "ISO-8859-1")


In [15]:
data.head()

Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [20]:
# Checking number of rows and columns
data.shape

(1599999, 6)

In [23]:
# First datapoint is taken as column names, so creating new list for original column names
data.columns=["target","ids","date","flag", "user","text"]

# Loading data again
data = pd.read_csv(r"C:\Users\kukre\OneDrive\Documents\Season Of AI\training.1600000.processed.noemoticon.csv",names=data.columns, encoding = "ISO-8859-1")


In [25]:
data.head()

Unnamed: 0,target,ids,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [26]:
data.shape

(1600000, 6)

In [27]:
# Finding missing values
data.isna().sum()

target    0
ids       0
date      0
flag      0
user      0
text      0
dtype: int64

In [28]:
# No missing values

In [30]:
# Checking distributuion of target column
data.target.value_counts()

target
0    800000
4    800000
Name: count, dtype: int64

In [31]:
# Changing label "4" to "1"
data['target'].replace(4, 1, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['target'].replace(4, 1, inplace=True)


In [33]:
data.target.value_counts()

target
0    800000
1    800000
Name: count, dtype: int64

# Stemming

In [34]:
port_stem=PorterStemmer()

In [35]:
# Defining a function to reduce each word to it's root word
def stemming(content):
    stemmed_content=re.sub("[^a-zA-Z]", " ", content)
    stemmed_content = stemmed_content.lower()
    stemmed_content = stemmed_content.split()
    stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stopwords.words("english")]
    stemmed_content = " ".join(stemmed_content)
    return stemmed_content
                        

In [None]:
# Applying the function to the text column of the dataset
data["stemmed_text"]=data["text"].apply(stemming)

In [None]:
data.head()

In [None]:
print(data.stemmed_text)

# Training the model

In [None]:
# Seperating features and label
features = data["stemmed_text"].values
label = data["target"].values

In [None]:
# Splitting the data for training
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test = train_test_split(features,label,test_size=0.2, random_state = 2, stratify = label)

In [None]:
# Converting the textual data to numerical data for computation by the model
vectorizer=TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)


In [None]:
print(X_train)

In [None]:
print(X_test)

In [None]:
# Since this is a binary classification problem, use LogisticRegression
from sklearn.linear_model import LogisticRegression()

model = LogisticRegression(max_iter = 1000)
model.fit(X_train, Y_train)

# Evaluation of the model

In [None]:
# Accuracy score for training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train,X_train_prediction)
print("Accuracy score of training data:", training_data_accuracy)

In [None]:
# Accuracy score for testing data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(Y_test,X_test_prediction)
print("Accuracy score of testing data:", test_data_accuracy)

In [None]:
# Since model is overfitted, try bagging to achieve a generalised model

# Bagging

In [None]:
from sklearn.ensemble import BaggingClassifier
from sklearn.linear_model import LogisticRegression

algo = LogisticRegression()

ensembleModel = BaggingClassifier(n_estimators=42, base_estimator=algo)

ensembleModel.fit(X_train,y_train)

In [None]:
# Accuracy score for training data
X_train_prediction_bagging = ensembleModel.predict(X_train)
training_data_accuracy_bagging = accuracy_score(Y_train,X_train_prediction_bagging)
print("Accuracy score of training data:", training_data_accuracy_bagging)

In [None]:
# Accuracy score for testing data
X_test_prediction_bagging = ensebleModel.predict(X_test)
test_data_accuracy_bagging = accuracy_score(Y_test,X_test_prediction_bagging)
print("Accuracy score of testing data:", test_data_accuracy_bagging)

# Saving the trained model

In [None]:
import pickle

In [None]:
filename= "twitter_sentiment_analysis_model.sav"
pickle.dump(model,open(filename,"wb"))

# Using the saved model for predictions

In [None]:
# loading the saved model
loaded_model = pickle.load(open("twitter_sentiment_analysis_model.sav", "rb"))

In [None]:
X_new = X_test[200]