In [27]:
from operator import itemgetter

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer
from sklearn.svm import SVC
from tqdm import tqdm
from tqdm_joblib import tqdm_joblib

# Fixed seed so the sampled subsets -- and therefore the scores reported
# further down -- are the same on every run.
RANDOM_STATE = 42
TEXT_COLUMNS = ['Title', 'Review']


def _load_sample(path, n_rows, random_state=RANDOM_STATE):
    """Read a review CSV and return a cleaned random sample of its rows.

    The first line of the file (its header) is skipped and the columns are
    named Rating, Title and Review.

    Args:
        path: Path of the CSV file to read.
        n_rows: Number of rows to sample; capped at the number of rows
            actually present in the file.
        random_state: Seed passed to ``DataFrame.sample``.

    Returns:
        A DataFrame with a fresh 0..n-1 index whose Title and Review columns
        are strings with missing values replaced by ''.
    """
    frame = pd.read_csv(path, skiprows=1, names=['Rating', 'Title', 'Review'])
    # Sampling without replacement raises ValueError when more rows are
    # requested than exist, so never ask for more than the file holds.
    n_rows = min(n_rows, len(frame))
    frame = frame.sample(n=n_rows, random_state=random_state).reset_index(drop=True)
    # The vectorizers need plain strings; NaN (empty cells) becomes ''.
    for text_col in TEXT_COLUMNS:
        frame[text_col] = frame[text_col].fillna('').astype(str)
    return frame


# 500 rows for training, 1000 rows for testing.
train_df = _load_sample('clean_train.csv', 500)
test_df = _load_sample('clean_test.csv', 1000)

# Features are the two text columns (kept as a DataFrame so each column can be
# picked out later); the target is the Rating column cast to int.
feature_columns = ['Title', 'Review']

X_train, X_test = train_df[feature_columns], test_df[feature_columns]

y_train = train_df['Rating'].astype(int)
y_test = test_df['Rating'].astype(int)

# Column selectors.  ``itemgetter`` replaces the former lambdas: scikit-learn
# documents that a FunctionTransformer built from a lambda is not pickleable,
# which would prevent saving the fitted model with pickle / joblib.dump.
get_title = FunctionTransformer(itemgetter('Title'), validate=False)
get_review = FunctionTransformer(itemgetter('Review'), validate=False)


def _text_branch(selector):
    """Return a pipeline that selects one text column and TF-IDF encodes it.

    Args:
        selector: Transformer that extracts a single text column from the
            input DataFrame.

    Returns:
        A Pipeline with steps 'selector' and 'tfidf'.  The vectorizer drops
        English stop words, terms present in more than 95% of documents and
        terms present in fewer than 5 documents.
    """
    return Pipeline([
        ('selector', selector),
        ('tfidf', TfidfVectorizer(stop_words='english', max_df=0.95, min_df=5)),
    ])


# Separate, identically configured branches for Title and Review.
title_pipeline = _text_branch(get_title)
review_pipeline = _text_branch(get_review)

# Concatenate both TF-IDF matrices, scaling each branch by its weight.
combined_features = FeatureUnion(
    transformer_list=[
        ('title', title_pipeline),
        ('review', review_pipeline),
    ],
    transformer_weights={'title': 0.2, 'review': 0.8},  # adjust weights as needed
)

# Final model: weighted text features fed into an SVM classifier.  The step
# name 'svc' is what the 'svc__*' keys of the search grid refer to.
pipeline = Pipeline([
    ('features', combined_features),
    ('svc', SVC()),
])

# Hyperparameter grid for the SVM step of the pipeline.
# NOTE(review): per the SVC documentation gamma is a coefficient of the
# 'rbf'/'poly'/'sigmoid' kernels, so the linear-kernel candidates are fitted
# once per gamma value with presumably identical results -- consider a list of
# per-kernel grids if the search time matters.
param_grid = {
    'svc__C': [0.1, 1, 10],
    'svc__kernel': ['linear', 'rbf'],
    'svc__gamma': ['scale', 'auto']
}

# Single source of truth for the fold count: used both by the search itself
# and by the progress-bar total below.
CV_FOLDS = 3

# Exhaustive search over the grid, using all available cores.
grid_search = GridSearchCV(pipeline, param_grid, cv=CV_FOLDS, verbose=2, n_jobs=-1)

# Total number of cross-validation fits for the progress bar:
# (number of candidate parameter combinations * number of folds).
n_candidates = 1
for candidate_values in param_grid.values():
    n_candidates *= len(candidate_values)
total_fits = n_candidates * CV_FOLDS

# tqdm_joblib builds its own tqdm bar from the arguments it receives, so the
# bar options are passed directly.  Handing it an already-built tqdm object
# makes it wrap that bar in a second one, and the bar carrying the
# description never advances.
with tqdm_joblib(total=total_fits, desc="GridSearchCV Progress"):
    grid_search.fit(X_train, y_train)

# Report the winning hyperparameter combination.
best_params = grid_search.best_params_
print("Best parameters:", best_params)

# Score the refitted best model on the held-out test sample: overall accuracy
# first, then the classification report.
y_pred = grid_search.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", test_accuracy)
report = classification_report(y_test, y_pred)
print(report)











[A[A[A[A[A[A[A[A[A

  0%|          | 0/36 [00:00<?, ?it/s]

Fitting 3 folds for each of 12 candidates, totalling 36 fits


GridSearchCV Progress:   0%|          | 0/36 [34:33<?, ?it/s]

Best parameters: {'svc__C': 1, 'svc__gamma': 'scale', 'svc__kernel': 'rbf'}
Test Accuracy: 0.732
              precision    recall  f1-score   support

           0       0.75      0.74      0.74       528
           1       0.71      0.72      0.72       472

    accuracy                           0.73      1000
   macro avg       0.73      0.73      0.73      1000
weighted avg       0.73      0.73      0.73      1000




