In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV, ParameterGrid
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.svm import SVC
from tqdm import tqdm
from tqdm_joblib import tqdm_joblib

# =======================
# Data Preparation
# =======================

# Upper bound on the number of reviews drawn from the CSV, and the share of the
# cleaned rows used for training (the remainder is the held-out test set).
SAMPLE_SIZE = 20000
TRAIN_FRACTION = 0.9

data_df = pd.read_csv('clean_data.csv')

# Cap the request at the number of available rows: DataFrame.sample raises
# ValueError when asked for more rows than exist (sampling without replacement).
data_df = data_df.sample(
    n=min(SAMPLE_SIZE, len(data_df)), random_state=1
).reset_index(drop=True)

# Ratings that cannot be parsed as numbers become NaN and are dropped; the
# remaining ratings are stored as integers.
data_df['Rating'] = pd.to_numeric(data_df['Rating'], errors='coerce')
data_df = data_df.dropna(subset=['Rating'])
data_df['Rating'] = data_df['Rating'].astype(int)

# Split between train and test by position. The boundary is computed from the
# number of rows left after cleaning rather than a fixed row index, so the test
# set keeps its share even when rows were dropped or the CSV is small. With a
# full 20,000-row sample this yields 18,000 training and 2,000 test rows.
n_train = round(len(data_df) * TRAIN_FRACTION)
train_df = data_df.iloc[:n_train].reset_index(drop=True)
test_df = data_df.iloc[n_train:].reset_index(drop=True)

# Replace missing text with empty strings and force both text columns to str.
for col in ['Title', 'Review']:
    train_df[col] = train_df[col].fillna('').astype(str)
    test_df[col] = test_df[col].fillna('').astype(str)

X_train = train_df[['Title', 'Review']]
y_train = train_df['Rating']
X_test = test_df[['Title', 'Review']]
y_test = test_df['Rating']

# =======================
# Pipeline Construction
# =======================

def _column_selector(column):
    """Return a transformer that indexes its input with ``column``."""
    return FunctionTransformer(lambda frame: frame[column], validate=False)


def _text_branch(selector):
    """Chain a column selector with a TF-IDF vectorizer.

    The vectorizer removes English stop words and discards terms that occur
    in more than 95% of the documents or in fewer than 5 documents.
    """
    vectorizer = TfidfVectorizer(stop_words='english', max_df=0.95, min_df=5)
    return Pipeline(steps=[('selector', selector), ('tfidf', vectorizer)])


# One selector and one TF-IDF branch per text column.
get_title = _column_selector('Title')
get_review = _column_selector('Review')
title_pipeline = _text_branch(get_title)
review_pipeline = _text_branch(get_review)

# Concatenate both TF-IDF blocks; the review block is weighted four times as
# heavily as the title block.
branch_weights = {'title': 0.2, 'review': 0.8}
combined_features = FeatureUnion(
    transformer_list=[('title', title_pipeline), ('review', review_pipeline)],
    transformer_weights=branch_weights,
)

# Full model: weighted text features feeding a support vector classifier.
pipeline = Pipeline(steps=[('features', combined_features), ('svc', SVC())])

# =======================
# Hyperparameter Tuning
# =======================
# Hyperparameter grid for the SVM, written as a list of sub-grids so that
# `gamma` is only varied for the RBF kernel. `gamma` is a kernel coefficient
# that the linear kernel does not use, so crossing it with the linear kernel
# would fit every linear candidate twice and produce identical scores.
param_grid = [
    {
        'svc__kernel': ['linear'],
        'svc__C': [0.1, 1, 10],
    },
    {
        'svc__kernel': ['rbf'],
        'svc__C': [0.1, 1, 10],
        'svc__gamma': ['scale', 'auto'],
    },
]

# Number of cross-validation folds; shared by the search and the progress bar
# so the two cannot drift apart.
cv_folds = 3

# Set up GridSearchCV with k-fold cross-validation on all available cores.
grid_search = GridSearchCV(pipeline, param_grid, cv=cv_folds, verbose=2, n_jobs=-1)

# Total number of fits for tracking progress. ParameterGrid enumerates exactly
# the candidates GridSearchCV will try, so the count stays correct whenever the
# grid above is edited.
n_candidates = len(ParameterGrid(param_grid))
total_fits = n_candidates * cv_folds  # number of candidates * number of folds

# Use tqdm_joblib to track progress during grid search.
with tqdm_joblib(tqdm(total=total_fits, desc="GridSearchCV Progress")):
    grid_search.fit(X_train, y_train)

# =======================
# Evaluation on Test Set
# =======================
# Report the mean cross-validated score of every candidate in the search.
print("Cross-validation Accuracy for each model:")
cv_results = grid_search.cv_results_
candidate_scores = zip(cv_results['params'], cv_results['mean_test_score'])
for candidate, score in candidate_scores:
    print(f"Parameters: {candidate} - Accuracy: {score:.4f}")

# Show which hyperparameter combination the search selected.
print("\nBest parameters:", grid_search.best_params_)

# Score the held-out test set with the fitted search and print overall
# accuracy followed by the per-class report.
y_pred = grid_search.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print("\nTest Accuracy:", test_accuracy)
print(classification_report(y_test, y_pred))


  from tqdm.autonotebook import tqdm
GridSearchCV Progress:   0%|          | 0/36 [00:00<?, ?it/s]

  0%|          | 0/36 [00:00<?, ?it/s]

Fitting 3 folds for each of 12 candidates, totalling 36 fits
Cross-validation Accuracy for each model:
Parameters: {'svc__C': 0.1, 'svc__gamma': 'scale', 'svc__kernel': 'linear'} - Accuracy: 0.8164
Parameters: {'svc__C': 0.1, 'svc__gamma': 'scale', 'svc__kernel': 'rbf'} - Accuracy: 0.7739
Parameters: {'svc__C': 0.1, 'svc__gamma': 'auto', 'svc__kernel': 'linear'} - Accuracy: 0.8164
Parameters: {'svc__C': 0.1, 'svc__gamma': 'auto', 'svc__kernel': 'rbf'} - Accuracy: 0.5106
Parameters: {'svc__C': 1, 'svc__gamma': 'scale', 'svc__kernel': 'linear'} - Accuracy: 0.8381
Parameters: {'svc__C': 1, 'svc__gamma': 'scale', 'svc__kernel': 'rbf'} - Accuracy: 0.8395
Parameters: {'svc__C': 1, 'svc__gamma': 'auto', 'svc__kernel': 'linear'} - Accuracy: 0.8381
Parameters: {'svc__C': 1, 'svc__gamma': 'auto', 'svc__kernel': 'rbf'} - Accuracy: 0.5106
Parameters: {'svc__C': 10, 'svc__gamma': 'scale', 'svc__kernel': 'linear'} - Accuracy: 0.8094
Parameters: {'svc__C': 10, 'svc__gamma': 'scale', 'svc__kernel': 'r