## **Libraries**


In [None]:
#For Cleaning
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
#For Models
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
#Downloads
nltk.download('stopwords')

## **Data Cleaning**

In [None]:
# Load the raw sentiment dataset
data = pd.read_csv('/content/sentimentdataset.csv')

# Discard the metadata columns in a single call; the cleaning and modelling
# cells below do not use them
unused_columns = [
    'Timestamp', 'ID', 'User', 'Topic', 'Source', 'Country',
    'Year', 'Month', 'Day', 'Hour', 'Retweets', 'Likes',
]
data = data.drop(columns=unused_columns)


In [None]:
# Replace every non-alphabetical character in the Text column with a space,
# then lower-case what is left
data['Text'] = data['Text'].map(lambda line: re.sub('[^a-zA-Z]', ' ', line).lower())
data.head()

Unnamed: 0,Text,Sentiment (Label)
0,enjoying a beautiful day at the park ...,Positive
1,traffic was terrible this morning ...,Negative
2,just finished an amazing workout ...,Positive
3,excited about the upcoming weekend getaway ...,Positive
4,trying out a new recipe for dinner tonight ...,Neutral


In [None]:
# Show NLTK's English stop-word list (these words are filtered out of Text in the next cell)
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [None]:
# Remove English stop words from every row of the Text column.
# The stop-word list is fetched once and kept as a set; calling
# stopwords.words('english') for every single word rebuilt the list each time
# and made every membership test a linear scan.
stop_words = set(stopwords.words('english'))

# Assign the whole column at once instead of writing row by row with
# data.at[idx, ...], which is only correct while row position == index label.
data['Text'] = [
    ' '.join(word for word in line.split() if word.lower() not in stop_words)
    for line in data['Text']
]
data.head()

Unnamed: 0,Text,Sentiment (Label)
0,enjoying beautiful day park,Positive
1,traffic terrible morning,Negative
2,finished amazing workout,Positive
3,excited upcoming weekend getaway,Positive
4,trying new recipe dinner tonight,Neutral


In [None]:
# Checking for null values: info() lists the non-null count and dtype of every column
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 732 entries, 0 to 731
Data columns (total 2 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Text               732 non-null    object
 1   Sentiment (Label)  732 non-null    object
dtypes: object(2)
memory usage: 11.6+ KB


In [None]:
# Checking for duplicate rows: duplicated() flags each row that repeats an earlier one,
# so the sum is the number of rows that would be removed by drop_duplicates()
duplicates_count = data.duplicated().sum()
print(duplicates_count)

24


In [None]:
# Dropping duplicates in place (pandas keeps the first occurrence of each row by default).
# The index is not reset here, so index labels keep gaps after this step.
data.drop_duplicates(inplace=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 708 entries, 0 to 731
Data columns (total 2 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Text               708 non-null    object
 1   Sentiment (Label)  708 non-null    object
dtypes: object(2)
memory usage: 16.6+ KB


In [None]:
# Tokenise: turn each Text string into a list of whitespace-separated words
data['Text'] = data['Text'].map(lambda line: line.split())
data

Unnamed: 0,Text,Sentiment (Label)
0,"[enjoying, beautiful, day, park]",Positive
1,"[traffic, terrible, morning]",Negative
2,"[finished, amazing, workout]",Positive
3,"[excited, upcoming, weekend, getaway]",Positive
4,"[trying, new, recipe, dinner, tonight]",Neutral
...,...,...
727,"[collaborating, science, project, received, re...",Happy
728,"[attending, surprise, birthday, party, organiz...",Happy
729,"[successfully, fundraising, school, charity, i...",Happy
730,"[participating, multicultural, festival, celeb...",Happy


In [None]:
# Use the Porter stemmer to reduce words to their stems; the stemmed tokens are stored in a new column
ps = PorterStemmer()

def stem_words(word_list):
    """Return the stem of every word in ``word_list``.

    Anything that is not a ``list`` (a plain string, ``None``, a tuple, ...)
    yields an empty list rather than raising.
    """
    if not isinstance(word_list, list):
        return []
    return [ps.stem(token) for token in word_list]

# Stem every token list in Text and keep the result in Stemmed_Text (Text itself is left unchanged)
data['Stemmed_Text'] = data['Text'].apply(stem_words)
data.head(10)

Unnamed: 0,Text,Sentiment (Label),Stemmed_Text
0,"[enjoying, beautiful, day, park]",Positive,"[enjoy, beauti, day, park]"
1,"[traffic, terrible, morning]",Negative,"[traffic, terribl, morn]"
2,"[finished, amazing, workout]",Positive,"[finish, amaz, workout]"
3,"[excited, upcoming, weekend, getaway]",Positive,"[excit, upcom, weekend, getaway]"
4,"[trying, new, recipe, dinner, tonight]",Neutral,"[tri, new, recip, dinner, tonight]"
5,"[feeling, grateful, little, things, life]",Positive,"[feel, grate, littl, thing, life]"
6,"[rainy, days, call, cozy, blankets, hot, cocoa]",Positive,"[raini, day, call, cozi, blanket, hot, cocoa]"
7,"[new, movie, release, must, watch]",Positive,"[new, movi, releas, must, watch]"
8,"[political, discussions, heating, timeline]",Negative,"[polit, discuss, heat, timelin]"
9,"[missing, summer, vibes, beach, days]",Neutral,"[miss, summer, vibe, beach, day]"


In [None]:
# Normalise the Sentiment (Label) column: replace non-letters with spaces,
# strip the leading/trailing padding and lower-case the result.
# Without the strip, the same label with different amounts of padding
# (e.g. ' confusion    ' and ' confusion ') shows up as distinct values.
data['Sentiment (Label)'] = [
    re.sub('[^a-zA-Z]', ' ', line).strip().lower()
    for line in data['Sentiment (Label)']
]
data.head()

Unnamed: 0,Text,Sentiment (Label),Stemmed_Text
0,"[enjoying, beautiful, day, park]",positive,"[enjoy, beauti, day, park]"
1,"[traffic, terrible, morning]",negative,"[traffic, terribl, morn]"
2,"[finished, amazing, workout]",positive,"[finish, amaz, workout]"
3,"[excited, upcoming, weekend, getaway]",positive,"[excit, upcom, weekend, getaway]"
4,"[trying, new, recipe, dinner, tonight]",neutral,"[tri, new, recip, dinner, tonight]"


In [None]:
# Unique values in the Sentiment (Label) column, used to decide which labels
# belong in the positive / negative keyword sets below
print(data['Sentiment (Label)'].unique())

[' positive  ' ' negative  ' ' neutral   ' ' anger        '
 ' fear         ' ' sadness      ' ' disgust      ' ' happiness    '
 ' joy          ' ' love         ' ' amusement    ' ' enjoyment    '
 ' admiration   ' ' affection    ' ' awe          ' ' disappointed '
 ' surprise     ' ' acceptance   ' ' adoration    ' ' anticipation '
 ' bitter       ' ' calmness     ' ' confusion    ' ' excitement   '
 ' kind         ' ' pride        ' ' shame        ' ' confusion '
 ' excitement ' ' shame ' ' elation       ' ' euphoria      '
 ' contentment   ' ' serenity      ' ' gratitude     ' ' hope          '
 ' empowerment   ' ' compassion    ' ' tenderness    ' ' arousal       '
 ' enthusiasm    ' ' fulfillment  ' ' reverence     ' ' compassion'
 ' fulfillment   ' ' reverence ' ' elation   ' ' despair         '
 ' grief           ' ' loneliness      ' ' jealousy        '
 ' resentment      ' ' frustration     ' ' boredom         '
 ' anxiety         ' ' intimidation    ' ' helplessness    '
 ' 

In [None]:

# Categorize into POSITIVE or NEGATIVE or NEUTRAL

# Labels (lower-case) that count as positive. Entries containing a space are
# matched against the whole normalised label, not against single words.
positive_keywords = {
    'positive', 'happiness', 'joy', 'love', 'amusement', 'enjoyment', 'admiration',
    'excitement', 'kind', 'pride', 'gratitude', 'hope',
    'empowerment', 'arousal', 'enthusiasm', 'hopeful', 'proud', 'grateful', 'free', 'inspired',
    'overjoyed', 'inspiration', 'motivation', 'joyfulreunion', 'satisfaction', 'blessed', 'optimism',
    'enchantment', 'playfuljoy', 'dreamchaser', 'thrill', 'creativity',
    'adventure', 'euphoria', 'festivejoy', 'freedom', 'artisticburst', 'marvel', 'positivity',
    'kindness', 'friendship', 'success', 'amazement', 'celebration', 'charm', 'ecstasy',
    'iconic', 'engagement', 'touched', 'heartwarming',
    'renewed effort', 'thrilling journey', 'celestial wonder', 'creative inspiration',
    'runway creativity', 'relief', 'happy', 'elation', 'contentment', 'reverence', 'dazzle',
}

# Labels (lower-case) that count as negative. 'disappointed' and 'bitter' are
# listed next to 'disappointment' and 'bitterness' because the dataset uses both forms.
# NOTE(review): 'awe' is treated as negative here - confirm this is intended.
negative_keywords = {
    'negative', 'anger', 'fear', 'sadness', 'disgust', 'awe',
    'disappointment', 'disappointed', 'bitterness', 'bitter',
    'shame', 'despair', 'grief', 'loneliness', 'jealousy',
    'resentment', 'frustration', 'boredom', 'anxiety', 'intimidation', 'helplessness',
    'envy', 'regret', 'melancholy', 'exhaustion', 'sorrow', 'darkness',
    'desperation', 'desolation', 'heartbreak', 'overwhelmed', 'devastated', 'betrayal',
    'suffering', 'isolation', 'suspense',
}


def classify_sentiment(label):
    """Map a raw sentiment label to 'positive', 'negative' or 'neutral'.

    Args:
        label: Lower-cased label string; surrounding or repeated whitespace
            is ignored.

    Returns:
        'positive' if any word of the label, or the whole label with its
        whitespace collapsed, is in ``positive_keywords``; otherwise
        'negative' on the same test against ``negative_keywords``;
        otherwise 'neutral'. Positive keywords take precedence.
    """
    tokens = label.split()

    # Candidates are the individual words plus the full phrase, so multi-word
    # keywords such as 'renewed effort' can match as well.
    candidates = set(tokens)
    candidates.add(' '.join(tokens))

    if candidates & positive_keywords:
        return 'positive'

    if candidates & negative_keywords:
        return 'negative'

    return 'neutral'



# Derive the three-way class of every row from its 'Sentiment (Label)' value
data['Sentiment_Class'] = data['Sentiment (Label)'].map(classify_sentiment)

data

Unnamed: 0,Text,Sentiment (Label),Stemmed_Text,Sentiment_Class
0,"[enjoying, beautiful, day, park]",positive,"[enjoy, beauti, day, park]",positive
1,"[traffic, terrible, morning]",negative,"[traffic, terribl, morn]",negative
2,"[finished, amazing, workout]",positive,"[finish, amaz, workout]",positive
3,"[excited, upcoming, weekend, getaway]",positive,"[excit, upcom, weekend, getaway]",positive
4,"[trying, new, recipe, dinner, tonight]",neutral,"[tri, new, recip, dinner, tonight]",neutral
...,...,...,...,...
727,"[collaborating, science, project, received, re...",happy,"[collabor, scienc, project, receiv, recognit, ...",positive
728,"[attending, surprise, birthday, party, organiz...",happy,"[attend, surpris, birthday, parti, organ, frie...",positive
729,"[successfully, fundraising, school, charity, i...",happy,"[success, fundrais, school, chariti, initi, jo...",positive
730,"[participating, multicultural, festival, celeb...",happy,"[particip, multicultur, festiv, celebr, divers...",positive


In [None]:

X = data['Stemmed_Text']  # Feature (text): one list of stemmed tokens per row
y = data['Sentiment_Class']  # Target labels (positive, negative, neutral)

# Split the data into train and test sets: 15% of the rows are held out for testing,
# and the fixed random_state makes the shuffle reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=25)


In [None]:
# Each document is a list of stemmed tokens; join the tokens back into one
# space-separated string per document
X_train = list(map(" ".join, X_train))
X_test = list(map(" ".join, X_test))

# TF-IDF vectorizer capped at 1700 features
tfidf_vectorizer = TfidfVectorizer(max_features=1700)

# Fit on the training documents and vectorize them in the same call
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

# Vectorize the test documents with the already-fitted vectorizer
X_test_tfidf = tfidf_vectorizer.transform(X_test)



## **Models**

# SVM

In [None]:
# Build a linear-kernel SVM and fit it on the TF-IDF training matrix
svm_classifier = SVC(kernel='linear').fit(X_train_tfidf, y_train)

# Predictions for the training and test matrices
y_pred_Train = svm_classifier.predict(X_train_tfidf)
y_pred_Test = svm_classifier.predict(X_test_tfidf)

# Held-out (test) metrics
print(classification_report(y_test, y_pred_Test))
print("Test Accuracy:", accuracy_score(y_test, y_pred_Test))

# Training metrics, for comparison with the test metrics above
print(classification_report(y_train, y_pred_Train))
print("Train Accuracy:", accuracy_score(y_train, y_pred_Train))

# Naive Bayes

In [None]:

# Define the Naive Bayes pipeline: TF-IDF features feeding a Multinomial Naive Bayes classifier.
# The pipeline gets its own vectorizer (same settings as `tfidf_vectorizer`), matching the
# Logistic Regression pipeline. Passing the shared, already-fitted `tfidf_vectorizer` would
# refit that object when the pipeline is fitted, although the SVM features were built with it.
nb_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=1700)),  # TF-IDF vectorizer
    ('nb', MultinomialNB())  # Multinomial Naive Bayes classifier
])

# Train the Naive Bayes model on the raw training documents
nb_pipeline.fit(X_train, y_train)

# Predict on the test and training documents
y_pred_nb = nb_pipeline.predict(X_test)
y_pred_nb_train = nb_pipeline.predict(X_train)

# Calculate accuracy on both splits
accuracy_nb = accuracy_score(y_test, y_pred_nb)
accuracy_nb_train = accuracy_score(y_train, y_pred_nb_train)

print("Naive Bayes Test Accuracy:", accuracy_nb)
print("Naive Bayes Train Accuracy:", accuracy_nb_train)


Naive Bayes Test Accuracy: 0.719626168224299
Naive Bayes Train Accuracy: 0.9068219633943427


# Logistic Regression

In [None]:
# Logistic Regression pipeline: TF-IDF features (at most 1700) followed by the
# classifier, built and fitted on the training documents in one expression
logreg_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=1700)),
    ('logreg', LogisticRegression(max_iter=1000)),
]).fit(X_train, y_train)

# Predictions for the training and test documents
y_pred_logreg_Train = logreg_pipeline.predict(X_train)
y_pred_logreg = logreg_pipeline.predict(X_test)

# Accuracy on each split
accuracy_logreg_Train = accuracy_score(y_train, y_pred_logreg_Train)
accuracy_logreg = accuracy_score(y_test, y_pred_logreg)

print("Logistic Regression Test Accuracy:", accuracy_logreg)
print("Logistic Regression train Accuracy:", accuracy_logreg_Train)


Logistic Regression Test Accuracy: 0.7663551401869159
Logistic Regression train Accuracy: 0.9550748752079867


## **GUI**

In [None]:
%%writefile app.py

Writing app.py


In [None]:
pip install streamlit


Collecting streamlit
  Downloading streamlit-1.35.0-py2.py3-none-any.whl (8.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.6/8.6 MB[0m [31m23.5 MB/s[0m eta [36m0:00:00[0m
Collecting gitpython!=3.1.19,<4,>=3.0.7 (from streamlit)
  Downloading GitPython-3.1.43-py3-none-any.whl (207 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.3/207.3 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m45.5 MB/s[0m eta [36m0:00:00[0m
Collecting watchdog>=2.1.5 (from streamlit)
  Downloading watchdog-4.0.1-py3-none-manylinux2014_x86_64.whl (83 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m83.0/83.0 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
Collecting gitdb<5,>=4.0.1 (from gitpython!=3.1.19,<4,>=3.0.7->streamlit)
  Downloading gitdb-4.0

In [None]:
# Print this machine's public IPv4 address.
# -O - writes the response body to stdout (lower-case -o would select the log file instead).
! wget -q -O - ipv4.icanhazip.com


In [None]:
! wget -q -O - https://loca.lt/mytunnelpassword


34.86.183.101

In [None]:
! streamlit run app.py & npx localtunnel --port 8501


Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.86.183.101:8501[0m
[0m
[K[?25hnpx: installed 22 in 3.86s
your url is: https://cool-keys-drum.loca.lt
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
Accuracy: 0.7663551401869159
Accuracy: 0.9434276206322796
              precision    recall  f1-score   support

    negative       0.92      0.60      0.73        20
     neutral       0.71      0.74      0.73        43
    positive       0.78      0.86      0.82        44

    accuracy                           0.77       107
   macro avg       0.80      0.74      0.76       107
weighted avg       0.78      0.77      0.76       107

[nltk_data] Downloading package