In [1]:
import pandas as pd
import numpy as np
import re
import string

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV


import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding, GlobalAveragePooling1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load the dataset
data = pd.read_csv("tweet-sentiment-extraction/train.csv")

# Display the first few rows of the dataset
print(data.head())
print(data.info())
print(data["sentiment"].value_counts())


# Data Preprocessing
def preprocess_text(text):
    '''Make text lowercase, remove text in square brackets,remove links,remove punctuation
    and remove words containing numbers.'''
    text = str(text).lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text


data["Cleaned_Text"] = data["text"].apply(lambda x: preprocess_text(x))
# data["text"] = data["text"].apply(preprocess_text)

# Encode the sentiment labels
label_mapping = {"negative": 0, "neutral": 1, "positive": 2}
data["sentiment"] = data["sentiment"].map(label_mapping)

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(
    data["Cleaned_Text"], data["sentiment"], test_size=0.2, random_state=42
)

# Feature Extraction using TF-IDF
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)


# Machine Learning Models
def train_evaluate_model(model, param_grid, X_train, X_test, y_train, y_test):
    grid_search = GridSearchCV(model, param_grid, cv=3, scoring="accuracy")
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)
    print(f"Best Parameters: {grid_search.best_params_}")
    print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
    print(classification_report(y_test, y_pred, target_names=label_mapping.keys()))


# Logistic Regression
print("Logistic Regression")
log_reg = LogisticRegression()
log_reg_params = {"C": [0.1, 1, 10], "solver": ["newton-cg", "lbfgs", "liblinear"]}
train_evaluate_model(
    log_reg, log_reg_params, X_train_tfidf, X_test_tfidf, y_train, y_test
)

# Support Vector Machine
print("Support Vector Machine")
svm = SVC()
svm_params = {"C": [0.1, 1, 10], "kernel": ["linear", "rbf"]}
train_evaluate_model(svm, svm_params, X_train_tfidf, X_test_tfidf, y_train, y_test)

# Random Forest
print("Random Forest")
rf = RandomForestClassifier()
rf_params = {"n_estimators": [100, 200, 300], "max_depth": [None, 10, 20]}
train_evaluate_model(rf, rf_params, X_train_tfidf, X_test_tfidf, y_train, y_test)

# Deep Learning Models
# Tokenization and Padding for Deep Learning
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=100)
X_test_pad = pad_sequences(X_test_seq, maxlen=100)

# Simple Neural Network
print("Simple Neural Network")
model_nn = Sequential(
    [
        Embedding(input_dim=5000, output_dim=64, input_length=100),
        GlobalAveragePooling1D(),
        Dense(64, activation="relu"),
        Dense(3, activation="softmax"),
    ]
)

model_nn.compile(
    optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"]
)
model_nn.fit(
    X_train_pad, y_train, epochs=5, batch_size=32, validation_data=(X_test_pad, y_test)
)

loss, accuracy = model_nn.evaluate(X_test_pad, y_test)
print(f"Accuracy: {accuracy}")

# LSTM Model
print("LSTM Model")
model_lstm = Sequential(
    [
        Embedding(input_dim=5000, output_dim=64, input_length=100),
        LSTM(64),
        Dense(3, activation="softmax"),
    ]
)

model_lstm.compile(
    optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"]
)
model_lstm.fit(
    X_train_pad, y_train, epochs=5, batch_size=32, validation_data=(X_test_pad, y_test)
)

loss, accuracy = model_lstm.evaluate(X_test_pad, y_test)
print(f"Accuracy: {accuracy}")

  from pandas.core import (


       textID                                               text  \
0  cb774db0d1                I`d have responded, if I were going   
1  549e992a42      Sooo SAD I will miss you here in San Diego!!!   
2  088c60f138                          my boss is bullying me...   
3  9642c003ef                     what interview! leave me alone   
4  358bd9e861   Sons of ****, why couldn`t they put them on t...   

                         selected_text sentiment  
0  I`d have responded, if I were going   neutral  
1                             Sooo SAD  negative  
2                          bullying me  negative  
3                       leave me alone  negative  
4                        Sons of ****,  negative  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27481 entries, 0 to 27480
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   textID         27481 non-null  object
 1   text           27480 non-null  object
 2   

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Best Parameters: {'C': 1, 'solver': 'lbfgs'}
Accuracy: 0.694742586865563
              precision    recall  f1-score   support

    negative       0.73      0.61      0.67      1562
     neutral       0.63      0.75      0.68      2230
    positive       0.77      0.70      0.74      1705

    accuracy                           0.69      5497
   macro avg       0.71      0.69      0.70      5497
weighted avg       0.70      0.69      0.70      5497

Support Vector Machine
Best Parameters: {'C': 1, 'kernel': 'linear'}
Accuracy: 0.7020192832454066
              precision    recall  f1-score   support

    negative       0.73      0.61      0.67      1562
     neutral       0.63      0.77      0.70      2230
    positive       0.80      0.70      0.74      1705

    accuracy                           0.70      5497
   macro avg       0.72      0.69      0.70      5497
weighted avg       0.71      0.70      0.70      5497

Random Forest
Best Parameters: {'max_depth': None, 'n_estimators': 

2024-07-19 18:47:23.962446: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M2
2024-07-19 18:47:23.962573: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 8.00 GB
2024-07-19 18:47:23.962584: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 2.67 GB
2024-07-19 18:47:23.963143: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-07-19 18:47:23.963622: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


Epoch 1/5
Cause: Unable to locate the source code of <function Model.make_train_function.<locals>.train_function at 0x1484c2980>. Note that functions defined in certain environments, like the interactive Python shell, do not expose their source code. If that is the case, you should define them in a .py source file. If you are certain the code is graph-compatible, wrap the call using @tf.autograph.experimental.do_not_convert. Original error: could not get source code
Cause: Unable to locate the source code of <function Model.make_train_function.<locals>.train_function at 0x1484c2980>. Note that functions defined in certain environments, like the interactive Python shell, do not expose their source code. If that is the case, you should define them in a .py source file. If you are certain the code is graph-compatible, wrap the call using @tf.autograph.experimental.do_not_convert. Original error: could not get source code


: 