# **Random Forest Model**

In [7]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, classification_report

# Dataset: Using the publicly available "sentiment140" dataset published by Stanford University for sentiment analysis.

# Load the dataset
df = pd.read_csv('Sentiment140.tenPercent.sample.tweets.tsv', sep='\t', header=None)
df.columns = ['sentiment_label', 'tweet_text']

# Filtering out the row with 'sentiment_label'
df_cleaned = df[df['sentiment_label'] != 'sentiment_label']

# Splitting the cleaned data into train and test sets
X_train_clean, X_test_clean, y_train_clean, y_test_clean = train_test_split(df_cleaned['tweet_text'], 
                                                                            df_cleaned['sentiment_label'], 
                                                                            test_size=0.2, 
                                                                            random_state=42)

# Vectorize the cleaned data
vectorizer = CountVectorizer()
X_train_vectorized_clean = vectorizer.fit_transform(X_train_clean)
X_test_vectorized_clean = vectorizer.transform(X_test_clean)

# Random Forest Model:
# Ensemble learning method that fits a number of decision tree classifiers on various sub-samples of the dataset.
# Uses averaging to improve the predictive accuracy and control overfitting.
rf_model_cleaned = RandomForestClassifier(n_estimators=50, max_depth=20, min_samples_leaf=10, random_state=42)
rf_model_cleaned.fit(X_train_vectorized_clean, y_train_clean)

# Predicting on the cleaned test data
y_pred_clean = rf_model_cleaned.predict(X_test_vectorized_clean)

# Evaluating the model using the cleaned data
accuracy_clean = accuracy_score(y_test_clean, y_pred_clean)
report_clean = classification_report(y_test_clean, y_pred_clean, target_names=['Negative', 'Positive'], output_dict=True)

recall_clean = report_clean['macro avg']['recall']
precision_clean = report_clean['macro avg']['precision']
f1_clean = report_clean['macro avg']['f1-score']

print(f"Accuracy: {accuracy_clean*100:.2f}%")
print(f"Recall (macro avg): {recall_clean*100:.2f}%")
print(f"Precision (macro avg): {precision_clean*100:.2f}%")
print(f"F1 Score (macro avg): {f1_clean*100:.2f}%")

Accuracy: 71.28%
Recall (macro avg): 71.28%
Precision (macro avg): 71.60%
F1 Score (macro avg): 71.18%


# **Multi-layer Perceptron (MLP) model**

In [14]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import Recall, Precision
from tensorflow.keras.utils import to_categorical

# Dataset: Load and preprocess
df = pd.read_csv('Sentiment140.tenPercent.sample.tweets.tsv', sep='\t', header=None)
df.columns = ['sentiment_label', 'tweet_text']
df_cleaned = df[df['sentiment_label'] != 'sentiment_label']

# Splitting data
X_train_clean, X_test_clean, y_train_clean, y_test_clean = train_test_split(df_cleaned['tweet_text'], 
                                                                            df_cleaned['sentiment_label'], 
                                                                            test_size=0.2, 
                                                                            random_state=42)

# Tokenizing and padding
VOCAB_SIZE = 10000
EMBEDDING_DIM = 32
MAX_LENGTH = 100
TRUNC_TYPE = 'post'
PADDING_TYPE = 'post'
OOV_TOKEN = '<OOV>'
tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token=OOV_TOKEN)
tokenizer.fit_on_texts(X_train_clean)
train_sequences = tokenizer.texts_to_sequences(X_train_clean)
train_padded = pad_sequences(train_sequences, padding=PADDING_TYPE, truncating=TRUNC_TYPE, maxlen=MAX_LENGTH)
test_sequences = tokenizer.texts_to_sequences(X_test_clean)
test_padded = pad_sequences(test_sequences, padding=PADDING_TYPE, truncating=TRUNC_TYPE, maxlen=MAX_LENGTH)

# Ensure labels are of integer type and then replace 4 with 1
y_train_clean = y_train_clean.astype(int).replace({0:0, 4:1})
y_test_clean = y_test_clean.astype(int).replace({0:0, 4:1})

# Convert labels to one-hot encoded vectors
y_train_encoded = to_categorical(y_train_clean, num_classes=2)
y_test_encoded = to_categorical(y_test_clean, num_classes=2)

# Multi-layer Perceptron (MLP) Model:
mlp_model = Sequential([
    Embedding(VOCAB_SIZE, EMBEDDING_DIM, input_length=MAX_LENGTH),
    Flatten(),
    Dense(24, activation='relu'),
    Dropout(0.5),
    Dense(12, activation='relu'),
    Dropout(0.5),
    Dense(2, activation='softmax')
])

# Compiling with adjusted learning rate
optimizer = Adam(learning_rate=0.0001)
mlp_model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy', Recall(name='recall'), Precision(name='precision')])

# Training with early stopping
early_stop = EarlyStopping(monitor='val_loss', patience=5)
history_mlp = mlp_model.fit(train_padded, y_train_encoded, epochs=20, validation_data=(test_padded, y_test_encoded), callbacks=[early_stop], verbose=2)

# Evaluating the model
mlp_scores = mlp_model.evaluate(test_padded, y_test_encoded, verbose=0)
print(f"Accuracy: {mlp_scores[1]*100:.2f}%")
print(f"Recall: {mlp_scores[2]*100:.2f}%")
print(f"Precision: {mlp_scores[3]*100:.2f}%")
print(f"F1 Score: {2*(mlp_scores[2]*mlp_scores[3])/(mlp_scores[2]+mlp_scores[3])*100:.2f}%")

Epoch 1/20
4000/4000 - 157s - loss: 0.6491 - accuracy: 0.6097 - recall: 0.6097 - precision: 0.6097 - val_loss: 0.5354 - val_accuracy: 0.7565 - val_recall: 0.7565 - val_precision: 0.7565 - 157s/epoch - 39ms/step
Epoch 2/20
4000/4000 - 149s - loss: 0.5286 - accuracy: 0.7571 - recall: 0.7571 - precision: 0.7571 - val_loss: 0.4742 - val_accuracy: 0.7824 - val_recall: 0.7824 - val_precision: 0.7824 - 149s/epoch - 37ms/step
Epoch 3/20
4000/4000 - 166s - loss: 0.4857 - accuracy: 0.7842 - recall: 0.7842 - precision: 0.7842 - val_loss: 0.4588 - val_accuracy: 0.7864 - val_recall: 0.7864 - val_precision: 0.7864 - 166s/epoch - 41ms/step
Epoch 4/20
4000/4000 - 133s - loss: 0.4616 - accuracy: 0.8022 - recall: 0.8022 - precision: 0.8022 - val_loss: 0.4548 - val_accuracy: 0.7863 - val_recall: 0.7863 - val_precision: 0.7863 - 133s/epoch - 33ms/step
Epoch 5/20
4000/4000 - 162s - loss: 0.4428 - accuracy: 0.8137 - recall: 0.8137 - precision: 0.8137 - val_loss: 0.4564 - val_accuracy: 0.7896 - val_recall: 0

# **Logistic Regression Model** 

In [4]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, recall_score, f1_score

# Load the dataset
df = pd.read_csv('Sentiment140.tenPercent.sample.tweets.tsv', sep='\t', header=None)
df.columns = ['sentiment_label', 'tweet_text']

# Filtering out the row with 'sentiment_label'
df_cleaned = df[df['sentiment_label'] != 'sentiment_label']

# Splitting the cleaned data into train and test sets
X_train_clean, X_test_clean, y_train_clean, y_test_clean = train_test_split(df_cleaned['tweet_text'], 
                                                                            df_cleaned['sentiment_label'], 
                                                                            test_size=0.2, 
                                                                            random_state=42)

# Vectorize the cleaned data
vectorizer = CountVectorizer()
X_train_vectorized_clean = vectorizer.fit_transform(X_train_clean)
X_test_vectorized_clean = vectorizer.transform(X_test_clean)

# Implementing the Logistic Regression model
logistic_model = LogisticRegression(max_iter=1000, random_state=42)
logistic_model.fit(X_train_vectorized_clean, y_train_clean)

# Predictions
y_pred_logistic = logistic_model.predict(X_test_vectorized_clean)

# Evaluation
accuracy_logistic = accuracy_score(y_test_clean, y_pred_logistic)
recall_logistic = recall_score(y_test_clean.astype(int).replace({0:0, 4:1}), (y_pred_logistic == '4').astype(int), average='macro')
f1_logistic = f1_score(y_test_clean.astype(int).replace({0:0, 4:1}), (y_pred_logistic == '4').astype(int), average='macro')

print("Logistic Regression Model Metrics (Retrained):")
print(f"Accuracy: {accuracy_logistic*100:.2f}%")
print(f"Recall: {recall_logistic*100:.2f}%")
print(f"F1 Score: {f1_logistic*100:.2f}%")

Logistic Regression Model Metrics (Retrained):
Accuracy: 77.99%
Recall: 77.99%
F1 Score: 77.99%


# **Support Vector Machine (SVM) model** 

In [5]:
from sklearn.svm import LinearSVC

# Implementing the SVM model
svm_model = LinearSVC(max_iter=10000, random_state=42)
svm_model.fit(X_train_vectorized_clean, y_train_clean)

# Predictions
y_pred_svm = svm_model.predict(X_test_vectorized_clean)

# Evaluation
accuracy_svm = accuracy_score(y_test_clean, y_pred_svm)
recall_svm = recall_score(y_test_clean.astype(int).replace({0:0, 4:1}), (y_pred_svm == '4').astype(int), average='macro')
f1_svm = f1_score(y_test_clean.astype(int).replace({0:0, 4:1}), (y_pred_svm == '4').astype(int), average='macro')

print("SVM Model Metrics:")
print(f"Accuracy: {accuracy_svm*100:.2f}%")
print(f"Recall: {recall_svm*100:.2f}%")
print(f"F1 Score: {f1_svm*100:.2f}%")

SVM Model Metrics:
Accuracy: 76.08%
Recall: 76.08%
F1 Score: 76.08%


# **Multinomial Naive Bayes model**

In [6]:
from sklearn.naive_bayes import MultinomialNB

# Implementing the Multinomial Naive Bayes model
mnb_model = MultinomialNB()
mnb_model.fit(X_train_vectorized_clean, y_train_clean)

# Predictions
y_pred_mnb = mnb_model.predict(X_test_vectorized_clean)

# Evaluation
accuracy_mnb = accuracy_score(y_test_clean, y_pred_mnb)
recall_mnb = recall_score(y_test_clean.astype(int).replace({0:0, 4:1}), (y_pred_mnb == '4').astype(int), average='macro')
f1_mnb = f1_score(y_test_clean.astype(int).replace({0:0, 4:1}), (y_pred_mnb == '4').astype(int), average='macro')

print("Multinomial Naive Bayes Model Metrics:")
print(f"Accuracy: {accuracy_mnb*100:.2f}%")
print(f"Recall: {recall_mnb*100:.2f}%")
print(f"F1 Score: {f1_mnb*100:.2f}%")

Multinomial Naive Bayes Model Metrics:
Accuracy: 76.62%
Recall: 76.62%
F1 Score: 76.56%
