In [None]:
import pandas as pd
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import FunctionTransformer

# Load the training and test CSV files. The first line of each file is
# skipped (skiprows=1) and the columns are given fixed names instead.
COLUMN_NAMES = ['Rating', 'Title', 'Review']
SAMPLE_SIZE = 500   # maximum number of rows drawn from each split
RANDOM_STATE = 42   # fixed seed so the same rows are drawn on every run

train_df = pd.read_csv('clean_train.csv', skiprows=1, names=COLUMN_NAMES)
test_df = pd.read_csv('clean_test.csv', skiprows=1, names=COLUMN_NAMES)

# Randomly select up to SAMPLE_SIZE rows from each split.
# - n is capped at the frame length because DataFrame.sample raises
#   ValueError when asked for more rows than exist (without replacement).
# - random_state is fixed so the sampled rows, and therefore the reported
#   scores, are reproducible between runs.
train_df = train_df.sample(
    n=min(SAMPLE_SIZE, len(train_df)), random_state=RANDOM_STATE
).reset_index(drop=True)
test_df = test_df.sample(
    n=min(SAMPLE_SIZE, len(test_df)), random_state=RANDOM_STATE
).reset_index(drop=True)

# Ensure that the Title and Review columns are strings; missing values
# become empty strings so the text vectorizers always receive str input.
for col in ['Title', 'Review']:
    train_df[col] = train_df[col].fillna('').astype(str)
    test_df[col] = test_df[col].fillna('').astype(str)

# X is a DataFrame with two text columns (Title and Review); y holds the
# Rating column cast to int so the labels are numeric.
X_train = train_df[['Title', 'Review']]
y_train = train_df['Rating'].astype(int)

X_test = test_df[['Title', 'Review']]
y_test = test_df['Rating'].astype(int)

# Column selectors, wrapped in FunctionTransformer so each one can be the
# first step of a Pipeline. Named functions are used instead of lambdas
# because a FunctionTransformer built from a lambda cannot be pickled,
# which would prevent saving the fitted pipeline.
def _select_title(frame):
    """Return the 'Title' column of *frame*."""
    return frame['Title']


def _select_review(frame):
    """Return the 'Review' column of *frame*."""
    return frame['Review']


get_title = FunctionTransformer(_select_title, validate=False)
get_review = FunctionTransformer(_select_review, validate=False)

# Build separate pipelines for processing Title and Review: each selects
# its column and then applies its own TF-IDF vectorizer.
title_pipeline = Pipeline([
    ('selector', get_title),
    ('tfidf', TfidfVectorizer(stop_words='english', max_df=0.95, min_df=5))
])
review_pipeline = Pipeline([
    ('selector', get_review),
    ('tfidf', TfidfVectorizer(stop_words='english', max_df=0.95, min_df=5))
])

# Combine the two pipelines with FeatureUnion. transformer_weights scales
# the title features by 0.2 and the review features by 0.8.
combined_features = FeatureUnion(
    transformer_list=[
        ('title', title_pipeline),
        ('review', review_pipeline)
    ],
    transformer_weights={'title': 0.2, 'review': 0.8}  # adjust weights as needed
)

# Create the final pipeline: combined text features feeding an SVM classifier.
pipeline = Pipeline([
    ('features', combined_features),
    ('svc', SVC())  # Using default hyperparameters
])

# Fit the pipeline on the training split, then use the fitted model to
# predict labels for the test split (Pipeline.fit returns the pipeline itself).
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

# Report overall accuracy first, then the per-class classification report.
test_accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", test_accuracy)
report_text = classification_report(y_test, y_pred)
print(report_text)


Test Accuracy: 0.746
              precision    recall  f1-score   support

           0       0.76      0.72      0.74       507
           1       0.73      0.77      0.75       493

    accuracy                           0.75      1000
   macro avg       0.75      0.75      0.75      1000
weighted avg       0.75      0.75      0.75      1000

