In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import GridSearchCV
from tqdm import tqdm
from tqdm_joblib import tqdm_joblib

# =======================
# Data Preparation
# =======================

data_df = pd.read_csv('clean_data.csv')
data_df = data_df.sample(n=20000, random_state=1).reset_index(drop=True)
data_df['Rating'] = pd.to_numeric(data_df['Rating'], errors='coerce')
data_df = data_df.dropna(subset=['Rating'])
data_df['Rating'] = data_df['Rating'].astype(int)

# Split between test and train.
train_df = data_df.iloc[:18000].reset_index(drop=True)
test_df = data_df.iloc[18000:].reset_index(drop=True)
for col in ['Title', 'Review']:
    train_df[col] = train_df[col].fillna('').astype(str)
    test_df[col] = test_df[col].fillna('').astype(str)
    
X_train = train_df[['Title', 'Review']]
y_train = train_df['Rating']
X_test = test_df[['Title', 'Review']]
y_test = test_df['Rating']

# =======================
# Pipeline Construction
# =======================

get_title = FunctionTransformer(lambda x: x['Title'], validate=False)
get_review = FunctionTransformer(lambda x: x['Review'], validate=False)

title_pipeline = Pipeline([
    ('selector', get_title),
    ('tfidf', TfidfVectorizer(stop_words='english', max_df=0.95, min_df=5))
])

review_pipeline = Pipeline([
    ('selector', get_review),
    ('tfidf', TfidfVectorizer(stop_words='english', max_df=0.95, min_df=5))
])

# Create the FeatureUnion with default transformer weights (which will be overwritten by grid search)
combined_features = FeatureUnion(
    transformer_list=[
        ('title', title_pipeline),
        ('review', review_pipeline)
    ],
    transformer_weights={'title': 0.2, 'review': 0.8}
)

pipeline = Pipeline([
    ('features', combined_features),
    ('svc', SVC())
])

# =======================
# Hyperparameter Tuning
# =======================

param_grid = {
    'svc__C': [0.1, 1, 10],
    'svc__kernel': ['linear', 'rbf'],
    'svc__gamma': ['scale', 'auto'],
    'features__transformer_weights': [
        {'title': 0.1, 'review': 0.9},
        {'title': 0.3, 'review': 0.7},
        {'title': 0.5, 'review': 0.5},
        {'title': 0.7, 'review': 0.3},
        {'title': 0.9, 'review': 0.1}
    ]
}

n_candidates = (
    len(param_grid['svc__C']) *
    len(param_grid['svc__kernel']) *
    len(param_grid['svc__gamma']) *
    len(param_grid['features__transformer_weights'])
)

total_fits = n_candidates * 3  
#GridSearchCV with 3-fold cross-validation.
grid_search = GridSearchCV(pipeline, param_grid, cv=3, verbose=2, n_jobs=-1)
with tqdm_joblib(tqdm(total=total_fits, desc="GridSearchCV Progress")):
    grid_search.fit(X_train, y_train)

# =======================
# Evaluation on Test Set
# =======================
# Print cross-validation results for each candidate model.
print("Cross-validation Accuracy for each model:")
cv_results = grid_search.cv_results_
for i, params in enumerate(cv_results['params']):
    mean_accuracy = cv_results['mean_test_score'][i]
    print(f"Parameters: {params} - Accuracy: {mean_accuracy:.4f}")
print("\nBest parameters:", grid_search.best_params_)


y_pred = grid_search.predict(X_test) #Evaluation metrics
print("\nTest Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


GridSearchCV Progress:   0%|          | 0/180 [00:00<?, ?it/s]

  0%|          | 0/180 [00:00<?, ?it/s]

Fitting 3 folds for each of 60 candidates, totalling 180 fits
Cross-validation Accuracy for each model:
Parameters: {'features__transformer_weights': {'title': 0.1, 'review': 0.9}, 'svc__C': 0.1, 'svc__gamma': 'scale', 'svc__kernel': 'linear'} - Accuracy: 0.8136
Parameters: {'features__transformer_weights': {'title': 0.1, 'review': 0.9}, 'svc__C': 0.1, 'svc__gamma': 'scale', 'svc__kernel': 'rbf'} - Accuracy: 0.7624
Parameters: {'features__transformer_weights': {'title': 0.1, 'review': 0.9}, 'svc__C': 0.1, 'svc__gamma': 'auto', 'svc__kernel': 'linear'} - Accuracy: 0.8136
Parameters: {'features__transformer_weights': {'title': 0.1, 'review': 0.9}, 'svc__C': 0.1, 'svc__gamma': 'auto', 'svc__kernel': 'rbf'} - Accuracy: 0.5106
Parameters: {'features__transformer_weights': {'title': 0.1, 'review': 0.9}, 'svc__C': 1, 'svc__gamma': 'scale', 'svc__kernel': 'linear'} - Accuracy: 0.8241
Parameters: {'features__transformer_weights': {'title': 0.1, 'review': 0.9}, 'svc__C': 1, 'svc__gamma': 'scale'