### Importing all the packages

In [1]:
import os
import shutil
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import pickle
from zipfile import ZipFile

## Copying the kaggle.json file into the ~/.kaggle directory

In [2]:
# Where the downloaded kaggle.json currently lives, and the `.kaggle` folder
# inside the user's home directory that it should be copied into
kaggle_json_path ="C:\\Users\\abish\\Downloads\\kaggle.json"
kaggle_dir = os.path.join(os.path.expanduser("~"), ".kaggle")

# Make sure the target folder exists (no error if it is already there),
# then place a copy of the file inside it
os.makedirs(kaggle_dir, exist_ok=True)
shutil.copy(kaggle_json_path, kaggle_dir)

# Restrict the copied file to owner read/write (kept even though it is not necessary on Windows)
os.chmod(os.path.join(kaggle_dir, "kaggle.json"), 0o600)


In [3]:
# Unzipping the dataset
dataset = "D:\\Data Science\\Project7\\sentiment140.zip"
# The archive handle is deliberately not called `zip`: binding that name in a
# notebook cell would hide the builtin zip() from every cell that runs afterwards.
with ZipFile(dataset, 'r') as archive:
    # No destination is passed, so the files are extracted into the current working directory
    archive.extractall()
    print('The data is extracted')

The data is extracted


## Stop words

In [4]:
# Download NLTK stopwords if not available
nltk.download('stopwords')

# Print stopwords
# These are the English words that stemming() later filters out of every tweet
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\abish\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Data preprocessing

In [5]:
# Read the raw tweet CSV; column names are supplied explicitly via `names`
column_names = ['target', 'id', 'date', 'flag', 'user', 'text']
twitter_data = pd.read_csv(
    "D:\\Data Science\\Project7\\training.1600000.processed.noemoticon.csv",
    names=column_names,
    encoding='ISO-8859-1',
)

In [32]:
# Display the loaded frame.
# NOTE(review): this cell's execution count (32) is out of sequence, and its recorded
# output contains 'Unnamed: 0' and 'stemmed_content' columns that do not exist yet at
# this point of a top-to-bottom run — re-run the notebook from a fresh kernel to refresh it.
twitter_data

Unnamed: 0,target,id,date,flag,user,text,stemmed_content
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",switchfoot http twitpic com zl awww bummer sho...
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,upset updat facebook text might cri result sch...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,kenichan dive mani time ball manag save rest g...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,whole bodi feel itchi like fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",nationwideclass behav mad see
...,...,...,...,...,...,...,...
1599995,1,2193601966,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,AmandaMarie1028,Just woke up. Having no school is the best fee...,woke school best feel ever
1599996,1,2193601969,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,TheWDBoards,TheWDB.com - Very cool to hear old Walt interv...,thewdb com cool hear old walt interview http b...
1599997,1,2193601991,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,bpbabe,Are you ready for your MoJo Makeover? Ask me f...,readi mojo makeov ask detail
1599998,1,2193602064,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,tinydiamondz,Happy 38th Birthday to my boo of alll time!!! ...,happi th birthday boo alll time tupac amaru sh...


## Converting the target label 4 to 1

In [6]:
# Relabel 4 as 1 in the 'target' column so the labels are 0 and 1; other columns are untouched
twitter_data['target'] = twitter_data['target'].replace({4: 1})

## Stemming

In [7]:
# Porter Stemmer
# Created once here; stemming() reuses this single instance for every word it stems
port_stem = PorterStemmer()

In [8]:
# Build the stopword lookup once instead of calling stopwords.words('english')
# for every single token; a frozenset also makes each membership test O(1).
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
# Anything that is not an ASCII letter is treated as a separator
_NON_LETTERS = re.compile('[^a-zA-Z]')


def stemming(content):
    """Normalize one tweet into a space-separated string of stemmed words.

    Steps: replace every non-letter character with a space, lowercase,
    split on whitespace, drop English stopwords, and stem what remains
    with the shared ``port_stem`` instance.

    Parameters
    ----------
    content : str
        Raw tweet text.

    Returns
    -------
    str
        The stemmed, stopword-free words joined by single spaces
        (an empty string when nothing survives the filtering).
    """
    words = _NON_LETTERS.sub(' ', content).lower().split()
    return ' '.join(
        port_stem.stem(word) for word in words if word not in _ENGLISH_STOPWORDS
    )


In [9]:
# Run every tweet in 'text' through stemming() and keep the result in a new column
twitter_data['stemmed_content'] = twitter_data['text'].map(stemming)

In [10]:
# Pull the model inputs (stemmed text) and the labels out of the frame as NumPy arrays
x = twitter_data['stemmed_content'].to_numpy()
y = twitter_data['target'].to_numpy()

In [11]:
# Peek at the stemmed tweets (NumPy abbreviates the middle of the array with '...')
print(x)

['switchfoot http twitpic com zl awww bummer shoulda got david carr third day'
 'upset updat facebook text might cri result school today also blah'
 'kenichan dive mani time ball manag save rest go bound' ...
 'readi mojo makeov ask detail'
 'happi th birthday boo alll time tupac amaru shakur'
 'happi charitytuesday thenspcc sparkschar speakinguph h']


In [12]:
# Peek at the labels, which are 0/1 after the 4 -> 1 replacement
print(y)

[0 0 0 ... 1 1 1]


In [13]:
# Hold out 10% of the tweets for testing; stratifying on y keeps the label
# proportions the same in both parts, and the fixed seed makes the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.10,
    stratify=y,
    random_state=2,
)

In [14]:
# Sanity-check the split sizes: full set, training part, test part
print(x.shape,x_train.shape,x_test.shape)

(1600000,) (1440000,) (160000,)


In [15]:
# Training texts, still plain stemmed strings at this point (before vectorization)
print(x_train)

['feel like work becuz im effin horni'
 'fatman got littl jew fro shave damn'
 'freemissryd come home alreadi life bore without around' ...
 'fenner cool quot add quot okay know quot quot refer account name quot quot still sound funni come'
 'six pack juli ab beer even like beer punish'
 'love wake folger bad voic deeper']


In [16]:
# Test texts, still plain stemmed strings at this point (before vectorization)
print(x_test)

['alexvonvaupel share deet photo renku project soon interest prob worth explor broad scale'
 'orthodontist appt go boo hoo done brace serious wait killin em smile haha'
 'n quad' ... 'hmmm need review previou word aim write anoth end day'
 'divebunni poor great select wine socal'
 'musiclikemerci glad came watch last five year anoth night board meet monday night']


In [17]:
# Converting text data to numerical data using TF-IDF
vectorizer = TfidfVectorizer()
# Fit on the training texts only, then reuse that fitted vectorizer for the test texts
x_train = vectorizer.fit_transform(x_train)
x_test = vectorizer.transform(x_test)
# NOTE: x_train / x_test are rebound here from arrays of strings to the vectorizer's
# output, so re-run the split cell before running this cell a second time.

In [18]:
# x_train now holds the vectorizer output, so this prints (row, column) weight entries instead of text
print(x_train)

  (0, 182979)	0.5043398323876811
  (0, 124178)	0.5009098608163001
  (0, 191949)	0.23658323758073335
  (0, 39517)	0.5442402674140161
  (0, 480730)	0.20791176343014625
  (0, 253306)	0.20989916743831274
  (0, 140414)	0.23510973174492172
  (1, 96496)	0.24756740179431355
  (1, 391527)	0.37884912323076886
  (1, 150159)	0.43166246378558804
  (1, 211616)	0.4650971855507986
  (1, 256084)	0.2324974076628597
  (1, 163657)	0.1790916056140654
  (1, 139329)	0.5535404239764236
  (2, 25322)	0.29831434367036275
  (2, 478718)	0.30928207079919917
  (2, 52422)	0.2744074166366401
  (2, 252766)	0.2691390741081691
  (2, 13488)	0.2746918607123101
  (2, 181827)	0.22667234804878209
  (2, 84950)	0.228293337690162
  (2, 149200)	0.6990451071142899
  (3, 443848)	0.4086851938350052
  (3, 105553)	0.912675414558782
  (4, 258806)	0.20025436326440765
  :	:
  (1439997, 306114)	0.17019026786338842
  (1439997, 406947)	0.15350608814882138
  (1439997, 4141)	0.18002122839570336
  (1439997, 324019)	0.1662086339046958
  (143999

In [19]:
# x_test now holds the vectorizer output, so this prints (row, column) weight entries instead of text
print(x_test)

  (0, 481168)	0.23330852861538307
  (0, 405860)	0.17677631588426657
  (0, 390959)	0.22842923564835296
  (0, 383145)	0.3203346684648424
  (0, 350126)	0.23543097744137334
  (0, 349704)	0.24634564528631475
  (0, 340148)	0.2146426760569468
  (0, 195530)	0.21918922765221482
  (0, 136027)	0.29348269692148443
  (0, 103849)	0.3697984462037016
  (0, 57292)	0.34923184414720254
  (0, 10986)	0.45215911572223216
  (1, 469696)	0.18042512989832024
  (1, 402402)	0.25442628643620585
  (1, 388260)	0.25279780564794735
  (1, 328292)	0.40325543221890114
  (1, 234424)	0.3614926380681113
  (1, 182386)	0.304642448001017
  (1, 169850)	0.1819845802403651
  (1, 161559)	0.12878616634004
  (1, 127433)	0.2673950821601698
  (1, 114644)	0.20813228546942744
  (1, 53782)	0.33723657814570324
  (1, 51593)	0.24730132256781706
  (1, 23344)	0.33659845294991875
  :	:
  (159997, 365799)	0.3677131152675787
  (159997, 348787)	0.42327488408005537
  (159997, 309208)	0.21462407458127927
  (159997, 180530)	0.3521220681437496
  (159

## Using ML Model

### Logistic Regression

In [20]:
# Logistic Regression Model
# NOTE(review): max_iter=1000 is presumably there to give the solver room to converge on this data — confirm
model = LogisticRegression(max_iter=1000)
# Train on the vectorized training texts and their 0/1 labels
model.fit(x_train, y_train)

## Model Evaluation

In [21]:
# How well does the model reproduce the labels of the data it was fitted on?
x_train_prediction = model.predict(x_train)
training_data_accuracy = accuracy_score(y_true=y_train, y_pred=x_train_prediction)
print('Accuracy score on the training data:', training_data_accuracy)

Accuracy score on the training data: 0.8037861111111111


In [22]:
# Accuracy on the held-out split, i.e. on tweets the model was not fitted on
x_test_prediction = model.predict(x_test)
testing_data_accuracy = accuracy_score(y_true=y_test, y_pred=x_test_prediction)
print('Accuracy score for the testing data:', testing_data_accuracy)

Accuracy score for the testing data: 0.77889375


In [23]:
# Save the trained model
name = "trained_model.pkl"
# The context manager closes (and flushes) the file even if pickling fails
with open(name, 'wb') as model_file:
    pickle.dump(model, model_file)
# NOTE(review): only `model` is written; the fitted `vectorizer` is not saved, so a
# fresh session could not turn new text into features for this model.

## Using the saved model for future prediction

In [24]:
# Load the saved model and make predictions
# The context manager closes the file handle once the model has been read.
# Only unpickle files you created yourself: loading a pickle can execute arbitrary code.
with open("trained_model.pkl", 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [30]:
# Position of the held-out example to classify, counted within the test split
tweet_index = 100

# x_test was already vectorized above, so row `tweet_index` can be passed to the
# model directly, and y_test[tweet_index] is the label of that very same row.
# (Positions in the test split do not correspond to rows of twitter_data, and raw
# text would have to go through stemming() before the vectorizer could be applied.)
x_new_transformed = x_test[tweet_index]
print("Actual label:", y_test[tweet_index])

prediction = loaded_model.predict(x_new_transformed)

print("Prediction:", prediction)

# The labels used for training are 0 and 1; the final branch is only a fallback
if prediction[0] == 0:
    print("negative tweet")
elif prediction[0] == 1:
    print("positive tweet")
else:
    print("neutral tweet")


Actual label: 1
Prediction: [0]
negative tweet
