Importing the Libraries of all the Classifier Models

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score, f1_score

Loading the Merged Dataset

In [2]:
dataset = pd.read_csv("merged_sentiment_dataset.csv")

Converting the Sentiment Labels to Numeric Values

In [3]:
dataset['sentiment'] = dataset['sentiment'].map({'negative': 0, 'positive': 1})

Splitting the Dataset into Training Set and Test Set

In [4]:
train_data = dataset[dataset['dataset'] == 'train']
test_data = dataset[dataset['dataset'] == 'test']
X_train, y_train = train_data['text'], train_data['sentiment']
X_test, y_test = test_data['text'], test_data['sentiment']

Converting Text to Numerical Features using TF-IDF Vectorization

In [5]:
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

Defining the Classification Models

In [6]:
models = {
    "Logistic Regression": LogisticRegression(),
    "K-NN": KNeighborsClassifier(n_neighbors=5),
    "SVM": SVC(kernel='linear'),
    "Kernel SVM": SVC(kernel='rbf'),
    "Naive Bayes": GaussianNB(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(n_estimators=100)
}

Handling NaN values in y_train before model training

In [41]:
y_train = y_train.fillna(-1)
y_train = y_train.astype(int)

Check the unique values in the 'dataset' column

In [42]:
print(dataset['dataset'].unique())

['train' 'test']


Modify the filtering condition to ensure it captures 'train' values correctly

In [43]:
train_data = dataset[dataset['dataset'].str.contains('train', case=False, na=False)]

Check if train_data is empty and provide feedback

In [44]:
if train_data.empty:
    print("Warning: train_data is empty. Check your filtering condition.")
    print("Unique values in 'dataset' column:", dataset['dataset'].unique())
else:
    X_train, y_train = train_data['text'], train_data['sentiment']
    X_train_tfidf = vectorizer.transform(X_train)
    num_nan_before = y_train.isna().sum()
    if num_nan_before > 0:
        print(f"Warning: y_train contains {num_nan_before} NaN values. Consider imputation or a different handling strategy.")
        # Example: Imputation with the most frequent value
        # y_train = y_train.fillna(y_train.mode()[0])

    # Drop NaN values if necessary and cast to int
    y_train = y_train.dropna().astype(int)

    # Check if y_train is empty after dropping NaN
    if y_train.empty:
        print("Error: y_train is empty after dropping NaN values. Cannot proceed with training.")
    else:

Dropping Rows with NaN values in y_train

In [45]:
y_train = y_train.dropna()
y_train = y_train.astype(int)

Traning the Classifier Models and Evaluating the Accuracy, F1 Score, and Recall for all the Classifier Models

In [46]:
for name, model in models.items():
    if name == "Naive Bayes":
        X_train_tfidf_dense = X_train_tfidf.toarray()
        X_test_tfidf_dense = X_test_tfidf.toarray()
        model.fit(X_train_tfidf_dense, y_train)
        y_pred = model.predict(X_test_tfidf_dense)
    else:
        model.fit(X_train_tfidf, y_train)
        y_pred = model.predict(X_test_tfidf)

    accuracy = accuracy_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    print(f"{name} Results:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}\n")

ValueError: Found input variables with inconsistent numbers of samples: [732, 0]