In [1]:
import pandas as pd
import numpy as np
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras_tuner import RandomSearch
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from keras.layers import MaxPooling1D
from keras.models import save_model, load_model

In [2]:
# Load the datasets
train_df = pd.read_csv('../Resources/train_2.csv', encoding='ISO-8859-1')
test_df = pd.read_csv('../Resources/test_2.csv', encoding='ISO-8859-1')


In [3]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27481 entries, 0 to 27480
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   text              27480 non-null  object 
 1   selected_text     27480 non-null  object 
 2   Time of Tweet     27481 non-null  object 
 3   Age of User       27481 non-null  object 
 4   Country           27481 non-null  object 
 5   Population -2020  27481 non-null  int64  
 6   Land Area (Km²)   27481 non-null  float64
 7   Density (P/Km²)   27481 non-null  int64  
 8   sentiment         27481 non-null  object 
dtypes: float64(1), int64(2), object(6)
memory usage: 1.9+ MB


In [4]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3534 entries, 0 to 3533
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   text              3534 non-null   object 
 1   Time of Tweet     3534 non-null   object 
 2   Age of User       3534 non-null   object 
 3   Country           3534 non-null   object 
 4   Population -2020  3534 non-null   int64  
 5   Land Area (Km²)   3534 non-null   float64
 6   Density (P/Km²)   3534 non-null   int64  
 7   sentiment         3534 non-null   object 
dtypes: float64(1), int64(2), object(5)
memory usage: 221.0+ KB


In [5]:
# Assuming 'text' is the feature column and 'sentiment' is the target
X_train, y_train = train_df['text'].fillna(''), train_df['sentiment']
X_test, y_test = test_df['text'].fillna(''), test_df['sentiment']

In [6]:
# Instantiate the encoder
label_encoder = LabelEncoder()

In [7]:
# Fit the encoder and transform the target columns to numerical labels
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

In [8]:
# Preprocess and Vectorize text data for the logistic regression model
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train).toarray()
X_test_tfidf = tfidf_vectorizer.transform(X_test).toarray()

In [9]:
# Scale the TF-IDF features
scaler = StandardScaler(with_mean=False)  # Set with_mean=False for sparse data compatibility
X_train_tfidf_scaled = scaler.fit_transform(X_train_tfidf)
X_test_tfidf_scaled = scaler.transform(X_test_tfidf)

In [10]:
# Train a Logistic Regression model with the scaled data
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train_tfidf_scaled, y_train_encoded) 
lr_predictions = lr_model.predict(X_test_tfidf_scaled)
print("Logistic Regression Accuracy:", accuracy_score(y_test_encoded, lr_predictions))

Logistic Regression Accuracy: 0.6581777023203169


In [11]:
# confusuion matrix for the logistic regression model
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test_encoded, lr_predictions)


array([[650, 288,  63],
       [271, 930, 229],
       [ 75, 282, 746]], dtype=int64)

In [12]:
# Precision, Recall, and F1-Score for the logistic regression model
from sklearn.metrics import classification_report
print(classification_report(y_test_encoded, lr_predictions))


              precision    recall  f1-score   support

           0       0.65      0.65      0.65      1001
           1       0.62      0.65      0.63      1430
           2       0.72      0.68      0.70      1103

    accuracy                           0.66      3534
   macro avg       0.66      0.66      0.66      3534
weighted avg       0.66      0.66      0.66      3534



In [13]:
# Save the trained Logistic Regression model
#lr_model.save('logistic_regression_model.keras')
joblib.dump(lr_model, 'logistic_regression_model.joblib')

['logistic_regression_model.joblib']

In [14]:
# Train a Random Forest model with the scaled data
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(n_estimators=100)
rf_model.fit(X_train_tfidf_scaled, y_train_encoded)
rf_predictions = rf_model.predict(X_test_tfidf_scaled)
print("Random Forest Accuracy:", accuracy_score(y_test_encoded, rf_predictions))

 

Random Forest Accuracy: 0.6870401810979061


In [15]:
# confusion matrix for the random forest model
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test_encoded, rf_predictions)


array([[ 561,  395,   45],
       [ 168, 1092,  170],
       [  42,  286,  775]], dtype=int64)

In [16]:
# Precision, Recall, and F1-Score for the random forest model
from sklearn.metrics import classification_report
print(classification_report(y_test_encoded, rf_predictions))

              precision    recall  f1-score   support

           0       0.73      0.56      0.63      1001
           1       0.62      0.76      0.68      1430
           2       0.78      0.70      0.74      1103

    accuracy                           0.69      3534
   macro avg       0.71      0.68      0.69      3534
weighted avg       0.70      0.69      0.69      3534



In [17]:
# Save the trained Random Forest model
#lr_model.save('random_forest_model.keras')
joblib.dump(rf_model, 'random_forest_model.joblib')

['random_forest_model.joblib']

In [18]:
# Train a Naive Bayes model with the scaled data
from sklearn.naive_bayes import MultinomialNB
nb_model = MultinomialNB()
nb_model.fit(X_train_tfidf_scaled, y_train_encoded)
nb_predictions = nb_model.predict(X_test_tfidf_scaled)
print("Naive Bayes Accuracy:", accuracy_score(y_test_encoded, nb_predictions))

Naive Bayes Accuracy: 0.5721561969439728


In [25]:
# confusion matrix for the Navie Bayes model
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test_encoded, nb_predictions)

array([[617, 267, 117],
       [366, 705, 359],
       [125, 278, 700]], dtype=int64)

In [26]:
# Precision, Recall, and F1-Score for the Navie Bayes model
from sklearn.metrics import classification_report
print(classification_report(y_test_encoded, rf_predictions))

              precision    recall  f1-score   support

           0       0.73      0.56      0.63      1001
           1       0.62      0.76      0.68      1430
           2       0.78      0.70      0.74      1103

    accuracy                           0.69      3534
   macro avg       0.71      0.68      0.69      3534
weighted avg       0.70      0.69      0.69      3534



In [19]:
# Save the trained Naive Bayes model
#lr_model.save('naive_bayes_model.keras')
joblib.dump(nb_model, 'naive_bayes_model.joblib')

['naive_bayes_model.joblib']

In [27]:
# Prepare data for CNN
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)
maxlen = 100
X_train_pad = pad_sequences(X_train_seq, maxlen=maxlen)
X_test_pad = pad_sequences(X_test_seq, maxlen=maxlen)

In [28]:
# Define a model-building function for the tuner
def build_model(hp):
    model = Sequential([
    Embedding(input_dim=5000, output_dim=50),
    Conv1D(
        filters=hp.Int('filters', min_value=32, max_value=128, step=32),
        kernel_size=hp.Choice('kernel_size', values=[3, 5, 7]),
        activation='relu'),
    MaxPooling1D(pool_size=2),
    Conv1D(
        filters=64,
        kernel_size=3,
        activation='relu'
    ),
    MaxPooling1D(pool_size=2),
    Conv1D(
        filters=128,
        kernel_size=3,
        activation='relu'
    ),
    GlobalMaxPooling1D(),
    Dense(units=hp.Int('dense_units', min_value=10, max_value=100, step=10), activation='relu'),
    Dense(1, activation='sigmoid')
])

    
    model.compile(optimizer=hp.Choice('optimizer', ['adam']),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model

In [29]:
# Initialize the tuner
tuner = RandomSearch(
    build_model,
    objective='val_accuracy',
    max_trials=5,
    executions_per_trial=1,
    directory='my_dir',
    project_name='sentiment_analysis'
)

Reloading Tuner from my_dir\sentiment_analysis\tuner0.json


In [30]:
# Execute the search with encoded labels
tuner.search(X_train_pad, y_train_encoded, epochs=5, validation_split=0.1)

In [31]:
# Get the best model
best_model = tuner.get_best_models(num_models=1)[0]

NotFoundError: NewRandomAccessFile failed to Create/Open: my_dir\sentiment_analysis\trial_3\build_config.json : The system cannot find the file specified.
; No such file or directory

In [None]:
# Save the best CNN model
best_model.save('best_cnn_model.keras')

In [None]:
# Evaluate the best model with encoded labels
_, accuracy = best_model.evaluate(X_test_pad, y_test_encoded)
print("CNN with Tuner Accuracy:", accuracy)

CNN with Tuner Accuracy: 0.40464064478874207
