In [1]:
# Mount Google Drive at /content/drive so the dataset stored there can be read.
# This import only exists inside a Google Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import numpy as np
import pandas as pd
import re
from sklearn.metrics import accuracy_score
import nltk
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [3]:
# Read the latin1-encoded CSV from Drive, then discard exact duplicate rows and
# any row containing a missing value, all in one chained expression.
df = (
    pd.read_csv('/content/drive/MyDrive/Data.csv', encoding='latin1')
    .drop_duplicates()
    .dropna()
)
df.shape

(27480, 10)

In [4]:
# Preview the first two rows to check which columns were loaded.
df.head(2)

Unnamed: 0,textID,text,selected_text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²)
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,morning,0-20,Afghanistan,38928346,652860.0,60
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,noon,21-30,Albania,2877797,27400.0,105


In [5]:
# Column holding the text that is fed to the models.
message_column = 'selected_text'

# Normalise the column in a single pass: force every value to str, lowercase
# it, then delete every character that is not a letter, digit or whitespace.
df[message_column] = (
    df[message_column]
    .astype(str)
    .str.lower()
    .str.replace(r'[^a-zA-Z0-9\s]', '', regex=True)
)

# Fetch the NLTK stopword corpus used by remove_stopwords below.
nltk.download('stopwords')

_ENGLISH_STOPWORDS = None  # filled on first use so the NLTK corpus is read only once


def remove_stopwords(text, stopwords_list=None):
    """Return ``text`` with every stopword token removed.

    The text is split on whitespace, tokens found in the stopword collection
    are dropped, and the survivors are re-joined with single spaces.

    Args:
        text: String to filter. Matching is exact, so callers that want
            case-insensitive filtering must lowercase the text beforehand.
        stopwords_list: Optional iterable of tokens to drop. When omitted, the
            NLTK English stopword list is used; it is loaded once and cached
            as a frozenset instead of being re-read on every call.

    Returns:
        The filtered text; an empty string when no token survives.
    """
    global _ENGLISH_STOPWORDS
    if stopwords_list is None:
        if _ENGLISH_STOPWORDS is None:
            # Loading the corpus is the expensive part; do it a single time.
            _ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))
        stopwords_list = _ENGLISH_STOPWORDS
    elif not isinstance(stopwords_list, (set, frozenset)):
        # Set membership is O(1); a list would be scanned for every token.
        stopwords_list = set(stopwords_list)
    return " ".join(token for token in text.split() if token not in stopwords_list)

# Strip stopwords from every entry of the column named by message_column.
df[message_column] = df[message_column].map(remove_stopwords)

# Show the cleaned column; pandas abbreviates the display of a long Series.
print("\nData After Preprocessing:")
df[message_column]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.



Data After Preprocessing:


0                        id responded going
1                                  sooo sad
2                                  bullying
3                               leave alone
4                                      sons
                        ...                
27476                                  lost
27477                            dont force
27478                              yay good
27479                                 worth
27480    flirting going atg smiles yay hugs
Name: selected_text, Length: 27480, dtype: object

In [6]:
# List the distinct sentiment labels present in the data.
df['sentiment'].unique()

array(['neutral', 'negative', 'positive'], dtype=object)

In [7]:
def _rows_with_sentiment(frame, label):
    """Return the rows of ``frame`` whose 'sentiment' column equals ``label``."""
    return frame.loc[frame['sentiment'] == label]


# Separate the data by class and report how many rows each class holds.
a = _rows_with_sentiment(df, 'positive')
df1 = pd.DataFrame(a)
print('Positive samples: ', a.shape)

b = _rows_with_sentiment(df, 'negative')
df2 = pd.DataFrame(b)
print('Negative samples: ', b.shape)

c = _rows_with_sentiment(df, 'neutral')
df3 = pd.DataFrame(c)
print('Neutral samples: ', c.shape)

Positive samples:  (8582, 10)
Negative samples:  (7781, 10)
Neutral samples:  (11117, 10)


In [8]:
# Balance the classes by keeping only the first 7780 rows of each one, then
# stack them back into a single frame (positive, negative, neutral order).
df1, df2, df3 = (frame.head(7780) for frame in (df1, df2, df3))
df = pd.concat([df1, df2, df3])
df.shape

(23340, 10)

In [9]:
# Initialize the LabelEncoder that maps sentiment strings to integer class ids
label_encoder = LabelEncoder()

# Encode the target variable y. The learned string<->id mapping is kept in
# label_encoder.classes_ and is reused later to name the classes in the report.
y_encoded = label_encoder.fit_transform(df['sentiment'])
# Display the encoded labels (numpy abbreviates the long array)
print("\nEncoded Target Variable (y):")
y_encoded


Encoded Target Variable (y):


array([2, 2, 2, ..., 1, 1, 1])

In [10]:
# Tokenize the text data. num_words=10000 caps the vocabulary the tokenizer
# will emit, so any Embedding layer fed with these ids needs input_dim >= 10000.
# NOTE(review): the tokenizer is fitted on the full dataset before the
# train/test split, so test-set vocabulary influences the ids — confirm this
# is acceptable for the evaluation being reported.
tokenizer = Tokenizer(num_words=10000)  # Adjust num_words as needed
tokenizer.fit_on_texts(df[message_column])
# Convert each text to a list of integer token ids
X_sequences = tokenizer.texts_to_sequences(df[message_column])
# Display the first few id sequences
print("\nDisplay few values of sequences:")
print(X_sequences[:5])
# Pad every sequence to a fixed length of 100; the printed array shows the
# zeros being added on the left, ahead of the real token ids.
X_padded = pad_sequences(X_sequences, maxlen=100)  # maxlen is the maximum sequence length
# Display the padded matrix (numpy abbreviates the long array)
print("\nDisplay few values of pad sequences:")
X_padded



Display few values of sequences:
[[26], [128, 21, 2339, 2340], [7], [321], [300, 5731, 1136, 541, 26, 1, 1522]]

Display few values of pad sequences:


array([[   0,    0,    0, ...,    0,    0,   26],
       [   0,    0,    0, ...,   21, 2339, 2340],
       [   0,    0,    0, ...,    0,    0,    7],
       ...,
       [   0,    0,    0, ...,    0,    0,  157],
       [   0,    0,    0, ...,    0,    0,   52],
       [   0,    0,    0, ...,    1,  609, 4211]], dtype=int32)

In [11]:
# Sanity check: one row per sample, one column per padded token position.
X_padded.shape

(23340, 100)

In [12]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_padded, y_encoded, test_size=0.2, random_state=42)
# Display a few values of X_train
print("\nFew Values Of X_train :")
print('train X shape',X_train.shape)
# Display a few values of X_test
print("\nFew Values Of X_test :")
print('Test X shape',X_test.shape)
# Display a few values of y_train
print("\nFew Values Of y_train :")
print('Train Y shape',y_train.shape)
# Display a few values of y_test
print("\nFew Values Of y_test:")
print('Test Y shape',y_test.shape)


Few Values Of X_train :
train X shape (18672, 100)

Few Values Of X_test :
Test X shape (4668, 100)

Few Values Of y_train :
Train Y shape (18672,)

Few Values Of y_test:
Test Y shape (4668,)


In [13]:
# Inspect the padded training matrix (numpy abbreviates the display).
X_train

array([[   0,    0,    0, ...,    0,    0,  675],
       [   0,    0,    0, ...,   36,  311, 2579],
       [   0,    0,    0, ...,    0,    0, 1617],
       ...,
       [   0,    0,    0, ...,    0,    0,   24],
       [   0,    0,    0, ...,   32,    8,   57],
       [   0,    0,    0, ...,    0, 2276, 4810]], dtype=int32)

In [74]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense

# The embedding table needs one row per token id the tokenizer can emit, i.e.
# the vocabulary size. The previous value (18672) was the number of training
# rows, which is unrelated to the vocabulary and only inflated the layer.
# Fall back to the full word index (+1 for the padding id 0) if the tokenizer
# was created without a num_words cap.
vocab_size = tokenizer.num_words or len(tokenizer.word_index) + 1

# Build the RNN model: embedding -> single recurrent layer -> softmax classifier
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=100, input_length=X_train.shape[1]))
model.add(SimpleRNN(units=128))  # Adjust units as needed
model.add(Dense(units=3, activation='softmax'))  # one output per sentiment class

# Compile the model. The labels are integer class ids (not one-hot), hence the
# sparse variant of categorical cross-entropy.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model, holding back 20% of the training data for validation
model.fit(X_train, y_train, epochs=10, batch_size=64, validation_split=0.2)  # Adjust epochs and batch_size


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7f99c55a3550>

In [75]:
# Import everything this cell needs from tensorflow.keras so it does not mix
# standalone `keras` layers with `tensorflow.keras` ones and does not depend on
# names imported by another cell.
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# The embedding table needs one row per token id the tokenizer can emit, i.e.
# the vocabulary size. The previous value (23340) was the number of dataset
# rows, which is unrelated to the vocabulary and only inflated the layer.
# Fall back to the full word index (+1 for the padding id 0) if the tokenizer
# was created without a num_words cap.
vocab_size = tokenizer.num_words or len(tokenizer.word_index) + 1

# Stacked LSTM classifier: the first LSTM returns the full sequence so the
# second LSTM can consume it; the final Dense layer emits one probability per
# sentiment class.
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=100, input_length=X_train.shape[1]))
model.add(LSTM(128,return_sequences=True))
model.add(LSTM(8))
model.add(Dense(3, activation='softmax'))
# Integer class ids as labels -> sparse categorical cross-entropy.
model.compile(optimizer='rmsprop',loss='sparse_categorical_crossentropy',metrics=['acc'])
# Keep the History object so the training curves can be inspected afterwards.
history = model.fit(X_train, y_train,epochs=10,batch_size=64,validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Evaluate the model on the held-out test set. `model` is whichever network was
# built most recently; it was compiled with a single metric, so evaluate()
# yields the loss followed by that accuracy value.
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Loss: {loss}')
print(f'Accuracy: {accuracy}')


In [None]:
from sklearn.metrics import classification_report

# Predict class probabilities for the held-out samples, then pick the most
# probable class for each row.
y_pred = model.predict(X_test)
y_pred_labels = y_pred.argmax(axis=1)

# Per-class metrics, labelled with the original sentiment names recovered from
# the fitted label encoder.
report = classification_report(
    y_true=y_test,
    y_pred=y_pred_labels,
    target_names=label_encoder.classes_,
)
print("Classification Report:")
print(report)


In [None]:
# Persist the current `model` object to disk with pickle.
# NOTE(review): Keras documents model.save(...) as its supported serialisation
# path; pickling a Keras model may fail or be non-portable across versions —
# confirm before relying on this file. Only unpickle files from trusted sources.
# NOTE(review): the fitted tokenizer and label_encoder are not saved here, yet
# they appear to be needed to preprocess new text and decode predictions — verify.
import pickle
with open('model_Sentiment_LSTM.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

In [83]:
# Import everything this cell needs from tensorflow.keras so it does not mix
# standalone `keras` layers with `tensorflow.keras` ones and does not depend on
# names imported by another cell.
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense

# The embedding table needs one row per token id the tokenizer can emit, i.e.
# the vocabulary size. The previous value (18672) was the number of training
# rows, which is unrelated to the vocabulary and only inflated the layer.
# Fall back to the full word index (+1 for the padding id 0) if the tokenizer
# was created without a num_words cap.
vocab_size = tokenizer.num_words or len(tokenizer.word_index) + 1

# Bidirectional variant: the first LSTM reads the sequence in both directions
# and returns the full sequence for the second LSTM; the final Dense layer
# emits one probability per sentiment class.
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=100, input_length=X_train.shape[1]))
model.add(Bidirectional(LSTM(128,return_sequences=True)))
model.add(LSTM(8))
model.add(Dense(3, activation='softmax'))
# Integer class ids as labels -> sparse categorical cross-entropy.
model.compile(optimizer='rmsprop',loss='sparse_categorical_crossentropy',metrics=['acc'])
# Keep the History object so the training curves can be inspected afterwards.
history = model.fit(X_train, y_train,epochs=10,batch_size=64,validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
