In [1]:

# Data Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Embedding, Dropout, Conv1D, MaxPooling1D
from tensorflow.keras.utils import to_categorical
import matplotlib.pyplot as plt
import seaborn as sns
import gensim.downloader as api
import plotly.graph_objs as go
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')




[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:

file_path = '/content/Sentiment140.csv'
# Load the CSV file
df_full = pd.read_csv(file_path, encoding='latin1', header=None)

In [None]:

# Download the 'punkt' resource
nltk.download('punkt')
def preprocess_text(text):
    # Tokenization
    tokens = word_tokenize(text.lower())
    # Remove stopwords and punctuation
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token.isalnum() and token not in stop_words]
    return ' '.join(tokens)



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:

# Rename the columns to appropriate names
df_full.columns = ['sentiment', 'id', 'date', 'query', 'user', 'text']

# Convert the 'text' column to string type before preprocessing
df_full['text'] = df_full['text'].astype(str)


# Now apply the preprocessing function to the 'text' column
df_full['processed_text'] = df_full['text'].apply(preprocess_text)

# Display the first few rows to ensure the preprocessing worked
df_full.head()


Unnamed: 0,sentiment,id,date,query,user,text,processed_text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",switchfoot http awww bummer shoulda got david ...
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,upset ca update facebook texting might cry res...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,kenichan dived many times ball managed save 50...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,whole body feels itchy like fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",nationwideclass behaving mad ca see


In [None]:
# Model Building (LSTM-RNN example)
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences



In [None]:

#Randomize Sentiment column
df_full = df_full.sample(frac=1, random_state=42)

In [None]:
#Change 4 to 1
y = (df_full['sentiment'] == 4).astype(int)
print(y.value_counts())

sentiment
0    800000
1    800000
Name: count, dtype: int64


In [None]:
# Tokenize text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df_full['processed_text'])
sequences = tokenizer.texts_to_sequences(df_full['processed_text'])
X = pad_sequences(sequences)



In [None]:
# Build model
model = Sequential([
    Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=128),
    LSTM(64, return_sequences=True),
    LSTM(32),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])



In [None]:
from sklearn.model_selection import train_test_split
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [None]:
from keras.callbacks import EarlyStopping
callback = EarlyStopping(monitor='loss', patience=3)

In [None]:

# Train model
model.fit(X_train,y_train, callbacks=[callback], epochs=10, batch_size=32, validation_data=(X_test, y_test))



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x78778c176170>

In [None]:
model.save('./FirstRun.keras')

In [None]:
#model = keras.models.load_model('path/to/location.keras')

In [20]:

# Evaluation
from sklearn.metrics import classification_report, confusion_matrix

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred > 0.5))



              precision    recall  f1-score   support

           0       0.75      0.74      0.74    159879
           1       0.74      0.75      0.75    160121

    accuracy                           0.74    320000
   macro avg       0.74      0.74      0.74    320000
weighted avg       0.74      0.74      0.74    320000



In [21]:
# Application (simple example)
def predict_sentiment(text):
    processed = preprocess_text(text)
    sequence = tokenizer.texts_to_sequences([processed])
    padded = pad_sequences(sequence, maxlen=X.shape[1])
    prediction = model.predict(padded)[0][0]
    return "Most likely negative" if prediction > 0.5 else "Most likely positive"



In [23]:
# Test the applicationnow what to do."
test_text = "I hate this movie."
print(predict_sentiment(test_text))

Most likely positive


In [None]:
from google.colab import drive
drive.mount('/content/drive')