In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import os
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
from tensorflow.keras.callbacks import EarlyStopping



In [None]:

# Load the labelled tweets; the relative path is resolved against the
# notebook's current working directory.
csv_path = 'Sentiment.csv'
df = pd.read_csv(csv_path)

In [None]:
# Display the DataFrame exactly as loaded from the CSV, before any text
# preprocessing is applied to the `text` column.
df

Unnamed: 0,id,candidate,candidate_confidence,relevant_yn,relevant_yn_confidence,sentiment,sentiment_confidence,subject_matter,subject_matter_confidence,candidate_gold,...,relevant_yn_gold,retweet_count,sentiment_gold,subject_matter_gold,text,tweet_coord,tweet_created,tweet_id,tweet_location,user_timezone
0,1,No candidate mentioned,1.0000,yes,1.0000,Neutral,0.6578,None of the above,1.0000,,...,,5,,,RT @NancyLeeGrahn: How did everyone feel about...,,2015-08-07 09:54:46 -0700,629697200650592256,,Quito
1,2,Scott Walker,1.0000,yes,1.0000,Positive,0.6333,None of the above,1.0000,,...,,26,,,RT @ScottWalker: Didn't catch the full #GOPdeb...,,2015-08-07 09:54:46 -0700,629697199560069120,,
2,3,No candidate mentioned,1.0000,yes,1.0000,Neutral,0.6629,None of the above,0.6629,,...,,27,,,RT @TJMShow: No mention of Tamir Rice and the ...,,2015-08-07 09:54:46 -0700,629697199312482304,,
3,4,No candidate mentioned,1.0000,yes,1.0000,Positive,1.0000,None of the above,0.7039,,...,,138,,,RT @RobGeorge: That Carly Fiorina is trending ...,,2015-08-07 09:54:45 -0700,629697197118861312,Texas,Central Time (US & Canada)
4,5,Donald Trump,1.0000,yes,1.0000,Positive,0.7045,None of the above,1.0000,,...,,156,,,RT @DanScavino: #GOPDebate w/ @realDonaldTrump...,,2015-08-07 09:54:45 -0700,629697196967903232,,Arizona
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13866,13867,No candidate mentioned,1.0000,yes,1.0000,Negative,0.7991,Abortion,0.6014,No candidate mentioned,...,yes,7,Negative,Abortion\nWomen's Issues (not abortion though),RT @cappy_yarbrough: Love to see men who will ...,,2015-08-07 09:29:43 -0700,629690895479250944,Como,
13867,13868,Mike Huckabee,0.9611,yes,1.0000,Positive,0.7302,None of the above,0.9229,Mike Huckabee,...,yes,1,,,RT @georgehenryw: Who thought Huckabee exceede...,,2015-08-07 09:25:02 -0700,629689719056568320,USA,
13868,13869,Ted Cruz,1.0000,yes,1.0000,Positive,0.8051,None of the above,0.9647,Ted Cruz,...,yes,67,Positive\nNeutral,,"RT @Lrihendry: #TedCruz As President, I will a...",,2015-08-07 07:19:18 -0700,629658075784282112,,
13869,13870,Donald Trump,1.0000,yes,1.0000,Negative,1.0000,Women's Issues (not abortion though),0.9202,Donald Trump,...,yes,149,,Women's Issues (not abortion though),RT @JRehling: #GOPDebate Donald Trump says tha...,,2015-08-07 09:54:04 -0700,629697023663546368,,


In [None]:
# Summarise column names, non-null counts and dtypes to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13871 entries, 0 to 13870
Data columns (total 21 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   id                         13871 non-null  int64  
 1   candidate                  13775 non-null  object 
 2   candidate_confidence       13871 non-null  float64
 3   relevant_yn                13871 non-null  object 
 4   relevant_yn_confidence     13871 non-null  float64
 5   sentiment                  13871 non-null  object 
 6   sentiment_confidence       13871 non-null  float64
 7   subject_matter             13545 non-null  object 
 8   subject_matter_confidence  13871 non-null  float64
 9   candidate_gold             28 non-null     object 
 10  name                       13871 non-null  object 
 11  relevant_yn_gold           32 non-null     object 
 12  retweet_count              13871 non-null  int64  
 13  sentiment_gold             15 non-null     obj

In [None]:

# Shared across calls: building these once instead of once per word (or per
# tweet) removes the dominant cost of preprocessing the whole column.
_PORTER_STEMMER = PorterStemmer()
_ENGLISH_STOPWORDS = None  # filled lazily, see _english_stopwords()


def _english_stopwords():
    """Return the NLTK English stop-word list as a cached frozenset.

    The corpus is read on first use rather than at definition time because
    the stop-word data is only downloaded in a later cell.
    """
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def preprocess_text(text):
    """Normalise one piece of raw text for tokenisation.

    Steps, in order: replace every non-ASCII-letter character with a space,
    lower-case, split on whitespace, drop English stop words, Porter-stem the
    remaining words and join them back with single spaces.

    Parameters
    ----------
    text : str
        Raw text. Missing values (None / NaN) are treated as empty text and
        other non-string values are converted with ``str`` so that a gap in
        the data does not abort a whole ``Series.apply``.

    Returns
    -------
    str
        Space-separated stemmed tokens; may be the empty string.
    """
    if not isinstance(text, str):
        text = '' if pd.isna(text) else str(text)

    stop_words = _english_stopwords()
    letters_only = re.sub('[^a-zA-Z]', ' ', text)
    words = letters_only.lower().split()
    # Stop words are filtered on the lower-cased surface form, before stemming.
    stems = [_PORTER_STEMMER.stem(word) for word in words if word not in stop_words]
    return ' '.join(stems)


In [None]:
# `nltk` is already imported by the first cell, so it has to be installed
# before this point; the `!pip install nltk` that used to sit here was
# redundant and has been dropped.
# Download the stop-word corpus through the kernel's own nltk so the data is
# fetched for the interpreter that actually runs preprocess_text.
nltk.download('stopwords')

# Clean every tweet. NOTE: this overwrites the raw `text` column, so re-running
# the cell preprocesses already-preprocessed text; reload the CSV first if the
# raw text is needed again.
df['text'] = df['text'].apply(preprocess_text)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:

# Features are the cleaned tweets; targets are the sentiment labels.
X = df['text']

# Map the string labels to integer ids, then expand the ids into one-hot rows
# (one column per class) for the softmax output layer.
encoder = LabelEncoder()
Y = pd.get_dummies(encoder.fit_transform(df['sentiment'])).values

# Hold out 20% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)



In [None]:

# Vocabulary cap handed to the tokenizer (and reused as the embedding size).
max_features = 5000
tokenizer = Tokenizer(num_words=max_features, split=' ')

# The vocabulary is learned from the training split only.
tokenizer.fit_on_texts(X_train)

# Encode both splits as lists of integer word ids with the same tokenizer.
X_train, X_test = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)


In [None]:


# Bring every id sequence to a common length of `maxlen` so the splits can be
# fed to the network as rectangular arrays.
maxlen = 100
X_train, X_test = (
    pad_sequences(sequences, maxlen=maxlen) for sequences in (X_train, X_test)
)


In [None]:

# Assemble the classifier layer by layer:
# word ids -> 128-d embeddings -> spatial dropout -> LSTM(128) -> 3-way softmax.
model = Sequential()
model.add(Embedding(max_features, 128, input_length=maxlen))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(3, activation='softmax'))


In [None]:

# Categorical cross-entropy matches the one-hot targets and softmax output.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)


In [None]:

# Callback configured to watch validation loss with a patience of 3 epochs
# and to restore the best weights.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)


In [None]:

# Training hyper-parameters.
batch_size = 64
epochs = 15

# Train with 10% of the training data set aside for validation; the
# early-stopping callback may end the run before `epochs` is reached.
history = model.fit(
    X_train,
    Y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.1,
    callbacks=[early_stopping],
)


Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15


In [None]:

# Evaluate on the held-out split; `score` holds the loss followed by the
# compiled metric (accuracy).
score = model.evaluate(X_test, Y_test, verbose=0)
test_loss, test_accuracy = score[0], score[1]
print("Test Loss:", test_loss)
print("Test Accuracy:", test_accuracy)

Test Loss: 0.7344328165054321
Test Accuracy: 0.6727927923202515


In [None]:

# Per-class probabilities for each test row, then the most likely class id.
Y_pred = model.predict(X_test)
Y_pred_cls = Y_pred.argmax(axis=1)



In [None]:

# Recover integer class ids from the one-hot test targets.
Y_test_cls = np.argmax(Y_test, axis=1)

# Class ids were assigned by `encoder` (LabelEncoder), which numbers the
# classes in sorted order. Derive the display names from encoder.classes_ so
# that index i always names class i. The previous hand-written list
# ['negative', 'positive', 'neutral'] did not follow that order and swapped
# the 'neutral' and 'positive' rows of the report.
sentiments = [str(label).lower() for label in encoder.classes_]

Y_pred_labels = [sentiments[pred] for pred in Y_pred_cls]
Y_test_labels = [sentiments[true] for true in Y_test_cls]
print(classification_report(Y_test_labels, Y_pred_labels))

              precision    recall  f1-score   support

    negative       0.76      0.82      0.79      1722
     neutral       0.54      0.49      0.52       441
    positive       0.47      0.39      0.42       612

    accuracy                           0.67      2775
   macro avg       0.59      0.57      0.58      2775
weighted avg       0.66      0.67      0.66      2775



In [None]:

def analyze_user_text(model, tokenizer, maxlen, sentiments, user_text=None):
    """Predict and print the sentiment of a single piece of text.

    Parameters
    ----------
    model
        Trained classifier whose ``predict`` returns one score per class.
    tokenizer
        Fitted tokenizer used to turn the cleaned text into word ids.
    maxlen : int
        Length the id sequence is padded to before prediction.
    sentiments : sequence of str
        Class names indexed by predicted class id, so the order must match
        the class ids the model was trained with.
    user_text : str, optional
        Text to classify. When omitted the user is prompted with ``input``,
        which is the original interactive behaviour.

    Returns
    -------
    str or None
        The predicted class name, or None when the cleaned text contains no
        word known to the tokenizer (no prediction is made in that case).
    """
    if user_text is None:
        user_text = input("Enter text for sentiment analysis: ")

    # Apply the same cleaning that was used on the training data.
    cleaned_text = preprocess_text(user_text)
    sequences = tokenizer.texts_to_sequences([cleaned_text])

    # With no known word the input would be nothing but padding, and any
    # prediction made from it would be meaningless.
    if not sequences[0]:
        print("No known vocabulary words found in the text; cannot predict sentiment.")
        return None

    padded = pad_sequences(sequences, maxlen=maxlen)
    prediction = model.predict(padded)

    pred_class = np.argmax(prediction, axis=1)[0]
    sentiment_label = sentiments[pred_class]
    print("Predicted sentiment:", sentiment_label)
    return sentiment_label


# Interactive demo; the result is bound to a name so the notebook does not
# echo the return value in addition to the printed line.
predicted_sentiment = analyze_user_text(model, tokenizer, maxlen, sentiments)

Enter text for sentiment analysis: i am not sure i enjoyed the movie
Predicted sentiment: negative


In [None]:
# Persist the trained network to 'my_model.h5' in the current working directory.
# NOTE(review): the warning emitted by saving_api.save_model below is truncated
# here; presumably it concerns the legacy HDF5 format — confirm before changing
# the file extension, since other code may load this exact filename.
# NOTE(review): the visible cells never save `tokenizer`, which is needed to
# encode new text the same way at inference time — confirm it is persisted
# elsewhere.
model.save('my_model.h5')

  saving_api.save_model(
