**Adding the kaggle.json file to fetch the dataset using the Kaggle API**

In [1]:
# Create the Kaggle config directory; -p makes this a no-op if it already exists
!mkdir -p ~/.kaggle

# Copy kaggle.json there (the file must be present in the current working directory)
!cp kaggle.json ~/.kaggle/

# Restrict the file to owner read/write only (mode 600)
!chmod 600 ~/.kaggle/kaggle.json

In [2]:
!pip install kaggle


Defaulting to user installation because normal site-packages is not writeable


In [None]:
#Download the Dataset using API

In [3]:
!kaggle datasets download -d kazanova/sentiment140


Dataset URL: https://www.kaggle.com/datasets/kazanova/sentiment140
License(s): other
sentiment140.zip: Skipping, found more recently modified local copy (use --force to force download)


In [None]:
#Unzipping the Dataset

In [4]:
from zipfile import ZipFile

# Archive fetched by the Kaggle download cell.
dataset = 'sentiment140.zip'

# Bind the open archive to `archive`, not `zip`: notebook cells share one global
# namespace, so a variable called `zip` would shadow the built-in zip() for
# every later cell in this kernel session.
with ZipFile(dataset, 'r') as archive:
    archive.extractall()  # no path given, so files land in the current working directory
    print("dataset is extracted")

dataset is extracted


**Importing dependencies**

In [5]:
import numpy as np
import pandas as pd 

In [6]:
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split


In [7]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/ajay/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

**Preprocessing on the Dataset**

In [9]:
# First look at the raw CSV. Neither `names=` nor `header=None` is passed here,
# so pandas consumes the first tweet as the header row -- visible in the head()
# output below and in the 1,599,999-row shape. The file is re-read with
# explicit column names further down.
twitter_data=pd.read_csv("training.1600000.processed.noemoticon.csv",encoding='ISO-8859-1')

In [10]:
twitter_data.shape

(1599999, 6)

In [11]:
twitter_data.head(5)

Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [None]:
# The dataset has no header row, so supply the column names explicitly

In [12]:
# The file ships without a header row: name the six fields explicitly, which
# also keeps the first tweet as data instead of treating it as column labels.
column_names = ['target', 'id', 'date', 'flag', 'user', 'text']
twitter_data = pd.read_csv(
    "training.1600000.processed.noemoticon.csv",
    names=column_names,
    encoding='ISO-8859-1',
)

In [13]:
twitter_data.head(2)

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...


In [14]:
twitter_data.isnull().sum()

target    0
id        0
date      0
flag      0
user      0
text      0
dtype: int64

In [15]:
twitter_data['target'].value_counts()

target
0    800000
4    800000
Name: count, dtype: int64

In [None]:
# Replace the target value of positive tweets from 4 to 1

In [16]:
# Relabel the positive class 4 -> 1 so the target column is binary (0 / 1).
twitter_data['target'] = twitter_data['target'].replace(4, 1)

In [17]:
twitter_data['target'].value_counts()

target
0    800000
1    800000
Name: count, dtype: int64

In [18]:
# Stemming
# Stemming is the process of reducing a word to its root form (e.g. "running" -> "run")

In [19]:
port_stem=PorterStemmer()

In [20]:
# Build the stopword lookup once. stopwords.words('english') returns a fresh
# list on every call, so the previous per-token call rebuilt that list and
# scanned it linearly for every single word of every tweet.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def stemming(content):
    """Normalise one tweet into a space-joined string of stemmed tokens.

    Steps: replace every non-letter character with a space, lower-case,
    split on whitespace, drop English stopwords, Porter-stem the rest.

    Parameters
    ----------
    content : str
        Raw tweet text.

    Returns
    -------
    str
        Stemmed tokens joined by single spaces ('' if no token survives).
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', content)
    tokens = letters_only.lower().split()
    return ' '.join(
        port_stem.stem(token)
        for token in tokens
        if token not in _ENGLISH_STOPWORDS  # O(1) set lookup
    )

In [21]:
twitter_data['stemmed_content']=twitter_data['text'].apply(stemming)

In [22]:
twitter_data.head(5)

Unnamed: 0,target,id,date,flag,user,text,stemmed_content
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",switchfoot http twitpic com zl awww bummer sho...
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,upset updat facebook text might cri result sch...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,kenichan dive mani time ball manag save rest g...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,whole bodi feel itchi like fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",nationwideclass behav mad see


In [23]:
print(twitter_data['stemmed_content'])

0          switchfoot http twitpic com zl awww bummer sho...
1          upset updat facebook text might cri result sch...
2          kenichan dive mani time ball manag save rest g...
3                            whole bodi feel itchi like fire
4                              nationwideclass behav mad see
                                 ...                        
1599995                           woke school best feel ever
1599996    thewdb com cool hear old walt interview http b...
1599997                         readi mojo makeov ask detail
1599998    happi th birthday boo alll time tupac amaru sh...
1599999    happi charitytuesday thenspcc sparkschar speak...
Name: stemmed_content, Length: 1600000, dtype: object


In [24]:
print(twitter_data['target'])

0          0
1          0
2          0
3          0
4          0
          ..
1599995    1
1599996    1
1599997    1
1599998    1
1599999    1
Name: target, Length: 1600000, dtype: int64


**Preparing for training and testing**

In [25]:
X=twitter_data['stemmed_content'].values
Y=twitter_data['target'].values

In [26]:
print(X)

['switchfoot http twitpic com zl awww bummer shoulda got david carr third day'
 'upset updat facebook text might cri result school today also blah'
 'kenichan dive mani time ball manag save rest go bound' ...
 'readi mojo makeov ask detail'
 'happi th birthday boo alll time tupac amaru shakur'
 'happi charitytuesday thenspcc sparkschar speakinguph h']


In [27]:
print(Y)

[0 0 0 ... 1 1 1]


In [None]:
# Splitting the dataset into train and test sets

In [28]:
# Hold out 20% of the rows for testing. Stratifying on Y keeps the class
# proportions equal in both splits; the fixed random_state makes the split
# reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

In [29]:
print(X.shape,X_train.shape,X_test.shape)

(1600000,) (1280000,) (320000,)


In [30]:
#converting textual data to numerical data

In [31]:
vectorizer=TfidfVectorizer()

# Fit the vectorizer on the training split only, then reuse the fitted
# vectorizer on the test split so no test-set statistics leak into the features.
# NOTE: X_train / X_test are rebound here from arrays of strings to sparse
# matrices, so this cell cannot be re-run on its own -- re-run the
# train_test_split cell first.
X_train=vectorizer.fit_transform(X_train)
X_test=vectorizer.transform(X_test)

In [32]:
print(X_train)

  (0, 436713)	0.27259876264838384
  (0, 354543)	0.3588091611460021
  (0, 185193)	0.5277679060576009
  (0, 109306)	0.3753708587402299
  (0, 235045)	0.41996827700291095
  (0, 443066)	0.4484755317023172
  (1, 160636)	1.0
  (2, 109306)	0.4591176413728317
  (2, 124484)	0.1892155960801415
  (2, 407301)	0.18709338684973031
  (2, 129411)	0.29074192727957143
  (2, 406399)	0.32105459490875526
  (2, 433560)	0.3296595898028565
  (2, 77929)	0.31284080750346344
  (2, 443430)	0.3348599670252845
  (2, 266729)	0.24123230668976975
  (2, 409143)	0.15169282335109835
  (2, 178061)	0.1619010109445149
  (2, 150715)	0.18803850583207948
  (2, 132311)	0.2028971570399794
  (2, 288470)	0.16786949597862733
  (3, 406399)	0.29029991238662284
  (3, 158711)	0.4456939372299574
  (3, 151770)	0.278559647704793
  (3, 56476)	0.5200465453608686
  :	:
  (1279996, 318303)	0.21254698865277744
  (1279996, 434014)	0.27189450523324465
  (1279996, 390130)	0.2206474219107611
  (1279996, 373144)	0.35212500999832036
  (1279996, 23807

In [33]:
#training the model

In [34]:
model =LogisticRegression()

In [None]:
#fitting the model

In [35]:
model.fit(X_train,Y_train)

import tkinter as tk
from tkinter import messagebox
import joblib  # Assuming you used joblib to save your model
import re
from sklearn.feature_extraction.text import CountVectorizer
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import pickle

# Load the pickled classifier. Opening the file in a `with` block closes the
# handle as soon as loading finishes instead of leaving it to the garbage
# collector.
# SECURITY: pickle.load can execute arbitrary code embedded in the file -- only
# load artifacts you produced yourself.
# NOTE(review): absolute, machine-specific path -- consider making it configurable.
with open('/home/ajay/Desktop/mini_project_gui/trained_model.sav', 'rb') as model_file:
    model = pickle.load(model_file)

# Load the fitted text vectorizer that accompanies the model (presumably the
# TfidfVectorizer pickled as 'vectorizer.sav' by the training notebook -- confirm).
with open('/home/ajay/Desktop/mini_project_gui/vectorizer.sav', 'rb') as vectorizer_file:
    vectorizer = pickle.load(vectorizer_file)

import nltk
nltk.download('stopwords')

port_stem = PorterStemmer()

# Build the stopword lookup once. stopwords.words('english') returns a fresh
# list on every call, so the previous per-token call rebuilt that list and
# scanned it linearly for every word of the tweet.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def preprocess_tweet(tweet):
    """Clean a raw tweet into a space-joined string of stemmed tokens.

    Steps: replace every non-letter character with a space, lower-case,
    split on whitespace, drop English stopwords, Porter-stem the rest.

    Parameters
    ----------
    tweet : str
        Raw text typed by the user.

    Returns
    -------
    str
        Stemmed tokens joined by single spaces ('' if no token survives).
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', tweet)
    tokens = letters_only.lower().split()
    return ' '.join(
        port_stem.stem(token)
        for token in tokens
        if token not in _ENGLISH_STOPWORDS  # O(1) set lookup
    )

# Callback for the "Predict Sentiment" button
def predict_sentiment():
    """Classify the text in the input box and show the verdict in the result label.

    Shows a warning dialog and returns early when the box holds only
    whitespace. Otherwise the text is preprocessed, vectorized and passed to
    the model; a prediction of 1 is shown as positive (green), anything else
    as negative (red).
    """
    raw_text = entry.get("1.0", "end-1c")  # "end-1c" leaves out the Text widget's trailing newline
    if not raw_text.strip():
        messagebox.showwarning("Input Error", "Please enter a tweet.")
        return

    features = vectorizer.transform([preprocess_tweet(raw_text)])
    label = model.predict(features)[0]

    if label == 1:
        message, colour = "☺ Positive Tweet", "green"
    else:
        message, colour = "☹️ Negative Tweet", "red"
    result.set(message)
    result_label.config(fg=colour)

# Callback for the "Clear" button
def clear_text():
    """Reset the UI: blank the result text and empty the tweet input box."""
    result.set("")
    entry.delete('1.0', tk.END)



# Build the main window. Each widget below is created and then pack()ed in
# sequence, so the statement order here determines the top-to-bottom layout.
root = tk.Tk()
root.title("Twitter Sentiment Analysis")
root.geometry('500x400')  # Initial window size, 'WIDTHxHEIGHT'
root.configure(bg='#C4D7FF')  # Window background; the labels below pass the same bg value

# Add a title label
title_label = tk.Label(root, text="Twitter Sentiment Analysis", font=("Helvetica", 18, "bold"), bg='#C4D7FF', fg='#333')
title_label.pack(pady=20)

# Add a label for the tweet input
input_label = tk.Label(root, text="Enter a Tweet:", font=("Helvetica", 12), bg='#C4D7FF')
input_label.pack()

# Multi-line textbox for the tweet; read by predict_sentiment() and emptied by clear_text()
entry = tk.Text(root, width=50, height=5, font=("Helvetica", 12), bd=2)
entry.pack(pady=10)

# Button to predict sentiment (runs predict_sentiment on click)
predict_button = tk.Button(root, text="Predict Sentiment", font=("Helvetica", 12), command=predict_sentiment, bg='#007BFF', fg='white', bd=0, padx=20, pady=5)
predict_button.pack(pady=15)

# Button to clear input and output (runs clear_text on click)
clear_button = tk.Button(root, text="Clear", font=("Helvetica", 12), command=clear_text, bg='#6c757d', fg='white', bd=0, padx=20, pady=5)
clear_button.pack(pady=5)

# Label to display the prediction result. It is bound to the `result`
# StringVar, so the callbacks update the text via result.set(...) and the
# colour via result_label.config(fg=...).
result = tk.StringVar()
result_label = tk.Label(root, textvariable=result, font=("Helvetica", 16, "bold"), bg='#C4D7FF')
result_label.pack(pady=20)

# Start the Tk event loop; this call blocks until the window is closed
root.mainloop()


In [36]:
# Model evaluation

In [37]:
# Score the model on the training features. This is an optimistic baseline,
# to be compared against the held-out test accuracy computed below.
X_train_prediction=model.predict(X_train)
training_data_accuracy=accuracy_score(Y_train,X_train_prediction)

In [38]:
print('Accuracy score on the training data: ',training_data_accuracy)

Accuracy score on the training data:  0.79871953125


In [39]:
# Score on the 20% hold-out split, which was not used to fit the vectorizer
# or the model.
X_test_prediction=model.predict(X_test)
testing_data_accuracy=accuracy_score(Y_test,X_test_prediction)

In [40]:
print('Accuracy score on the testing data: ',testing_data_accuracy)

Accuracy score on the testing data:  0.77668125


In [41]:
#saving the model and vectorizer using pickle library

In [42]:
import pickle

In [43]:
# Serialise the fitted vectorizer so inference code can rebuild the same
# feature space that was used for training.
with open('vectorizer.sav', mode='wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

In [44]:
# Persist the trained classifier. The file is opened in a `with` block so it is
# flushed and closed as soon as the dump finishes; the previous bare open()
# left the handle open until garbage collection.
filename = 'trained_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)