In [3]:
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score


In [4]:
# Step 1: Load datasets
# Both CSV files are decoded as Latin-1 and read from the working directory.
train_df, test_df = (
    pd.read_csv(csv_path, encoding='latin1')
    for csv_path in ('train.csv', 'test.csv')
)

In [5]:
# Step 2: Preprocessing
# Replace missing entries in the 'text' column of both splits with an empty
# string so that later string operations never receive NaN.
train_df['text'], test_df['text'] = (
    train_df['text'].fillna(''),
    test_df['text'].fillna(''),
)


In [6]:
# Encode target variable
# Sentiment strings become integer codes; any value that is missing or not a
# key of the mapping becomes NaN in 'sentiment_encoded'.
label_to_code = {'negative': -1, 'neutral': 0, 'positive': 1}
train_df['sentiment_encoded'] = train_df['sentiment'].map(label_to_code)
test_df['sentiment_encoded'] = test_df['sentiment'].map(label_to_code)


In [7]:
# Simple text cleaning function
def preprocess_text(text):
    """Normalise one raw text value to lowercase, letters-only, single-spaced form.

    Args:
        text: The raw text. ``None`` and float NaN (the marker pandas uses for
            missing values) are treated as empty text; any other non-string
            value is converted with ``str()`` before cleaning.

    Returns:
        str: The cleaned text. It contains only the characters ``a``-``z`` and
        single spaces, with no leading or trailing whitespace, and may be empty.
    """
    # Missing values have no ``.lower()``; map them to empty text instead of
    # crashing, so the function does not depend on a prior ``fillna('')``.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)
    # Lowercase text
    text = text.lower()
    # Delete every character that is not a-z or whitespace (digits,
    # punctuation, accented letters); nothing is inserted in their place.
    text = re.sub(r'[^a-z\s]', '', text)
    # Collapse runs of whitespace into a single space and trim both ends
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [8]:
# Clean the 'text' column of both splits with preprocess_text, overwriting
# the raw text in place.
train_df['text'], test_df['text'] = (
    train_df['text'].apply(preprocess_text),
    test_df['text'].apply(preprocess_text),
)

In [9]:
# Step 3: Feature transformation using TF-IDF
# Targets are the integer-encoded sentiment columns of each split.
y_train, y_test = train_df['sentiment_encoded'], test_df['sentiment_encoded']

# The vectorizer (limited to 5000 features) is fitted on the training text
# only; the test text is transformed with that already-fitted vectorizer.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vec = vectorizer.fit_transform(train_df['text'])
X_test_vec = vectorizer.transform(test_df['text'])

In [16]:
# Check for missing values in the target column
# The mask is computed once as a plain NumPy boolean array so it can index
# both the sparse feature matrix and the label Series without relying on
# implicit pandas-to-array conversion.
missing_mask = y_test.isna().to_numpy()
n_missing = int(missing_mask.sum())
print("Missing values in y_test:", n_missing)

# Drop rows with missing target values
if n_missing > 0:
    print("Dropping rows with missing target values...")
    valid_indices = ~missing_mask
    X_test_vec = X_test_vec[valid_indices]
    # NaN forces a float column; once the NaN rows are gone, cast back to
    # integers so the test labels read -1/0/1 rather than -1.0/0.0/1.0.
    y_test = y_test[valid_indices].astype('int64')

# Ensure no NaN values remain
assert not y_test.isna().any(), "y_test still contains NaN values!"
# Features and labels must still line up row for row after filtering
assert X_test_vec.shape[0] == len(y_test), "X_test_vec and y_test are misaligned!"


Missing values in y_test: 1281
Dropping rows with missing target values...


In [17]:
# Step 4: Hyperparameter tuning for Logistic Regression
# Search space: five values of C crossed with two solvers, L2 penalty only
# (10 candidates).
param_grid_lr = dict(
    C=[0.001, 0.01, 0.1, 1, 10],
    solver=['liblinear', 'lbfgs'],
    penalty=['l2'],
)

# 5-fold cross-validation scored on accuracy, run on all available cores.
grid_search_lr = GridSearchCV(
    LogisticRegression(max_iter=1000),
    param_grid_lr,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
)

grid_search_lr.fit(X_train_vec, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [18]:
# Best model from Logistic Regression
# Take the best estimator found by the grid search and predict on both splits.
best_lr_model = grid_search_lr.best_estimator_
y_train_pred_lr, y_test_pred_lr = (
    best_lr_model.predict(features) for features in (X_train_vec, X_test_vec)
)

# Accuracy on the data the model was fitted on vs. the held-out test split.
train_acc_lr, test_acc_lr = (
    accuracy_score(y_train, y_train_pred_lr),
    accuracy_score(y_test, y_test_pred_lr),
)

print("Logistic Regression Best Parameters:", grid_search_lr.best_params_)
print("Logistic Regression Training Accuracy:", train_acc_lr)
print("Logistic Regression Test Accuracy:", test_acc_lr)


Logistic Regression Best Parameters: {'C': 1, 'penalty': 'l2', 'solver': 'liblinear'}
Logistic Regression Training Accuracy: 0.7555401914049708
Logistic Regression Test Accuracy: 0.6997736276174307


In [19]:
# Step 5: Hyperparameter tuning for Random Forest
# Search space: 2 x 3 x 2 x 2 = 24 candidates, i.e. 120 fits with 5-fold CV.
# This cell is expensive; the evaluation cell that follows needs it to run
# to completion.
param_grid_rf = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

grid_search_rf = GridSearchCV(
    # A fixed seed makes the forests, and therefore the selected parameters
    # and reported scores, reproducible from run to run.
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid_rf,
    cv=5,
    scoring='accuracy',
    verbose=1,
    n_jobs=-1
)

grid_search_rf.fit(X_train_vec, y_train)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


KeyboardInterrupt: 

In [20]:



# Best model from Random Forest
# NOTE: this cell requires `grid_search_rf.fit(...)` to have finished;
# `best_estimator_` and the other result attributes only exist on a fitted
# search, so an interrupted fit makes the next line raise AttributeError.
best_rf_model = grid_search_rf.best_estimator_
y_train_pred_rf = best_rf_model.predict(X_train_vec)
y_test_pred_rf = best_rf_model.predict(X_test_vec)

train_acc_rf = accuracy_score(y_train, y_train_pred_rf)
test_acc_rf = accuracy_score(y_test, y_test_pred_rf)

print("Random Forest Best Parameters:", grid_search_rf.best_params_)
print("Random Forest Training Accuracy:", train_acc_rf)
print("Random Forest Test Accuracy:", test_acc_rf)

# Step 6: Cross-validation for the best Random Forest model
# The grid search already evaluated the winning parameters with 5-fold
# accuracy on this same training data, so read those per-fold scores back
# instead of fitting five more forests.
best_idx = grid_search_rf.best_index_
cross_val_scores_rf = np.array([
    grid_search_rf.cv_results_[f'split{fold}_test_score'][best_idx]
    for fold in range(grid_search_rf.n_splits_)
])
print("Cross-Validation Accuracy for Random Forest:", cross_val_scores_rf.mean())

# Step 7: Visualize Feature Importance (Random Forest)
# Pair every TF-IDF vocabulary term with its importance and keep the top 20.
feature_importances = pd.DataFrame({
    'Feature': vectorizer.get_feature_names_out(),
    'Importance': best_rf_model.feature_importances_
}).sort_values(by='Importance', ascending=False).head(20)

# NOTE(review): recent seaborn releases may warn when `palette` is passed
# without `hue` -- confirm against the installed seaborn version.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=feature_importances, palette='viridis', ax=ax)
ax.set_title("Top 20 Features by Importance (Random Forest)")
ax.set_xlabel("Importance")
ax.set_ylabel("Feature")
plt.show()

# Final Results
print("\nLogistic Regression Classification Report (Test Data):")
print(classification_report(y_test, y_test_pred_lr))

print("\nRandom Forest Classification Report (Test Data):")
print(classification_report(y_test, y_test_pred_rf))


AttributeError: 'GridSearchCV' object has no attribute 'best_estimator_'