<a href="https://colab.research.google.com/github/AshvinVignesh/Final_year/blob/main/final_year_SVM_and_sampling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install deap

Collecting deap
  Downloading deap-1.4.2-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading deap-1.4.2-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (135 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/135.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m135.4/135.4 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: deap
Successfully installed deap-1.4.2


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import numpy as np
from sklearn.model_selection import cross_val_score, KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report , f1_score
from sklearn.model_selection import StratifiedKFold , train_test_split ,GridSearchCV,cross_validate
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline

In [None]:
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
# from imblearn.pipeline import Pipeline
#from deap import base, creator, tools, algorithms
import random

In [None]:
# Load the labelled dataset; later cells read its `clean_text` column and the
# per-aspect label columns from this frame.
df = pd.read_csv("fyp_data.csv")

In [None]:
# Features: the `clean_text` column, as a Series.
X = df["clean_text"]
# Targets: one label column per aspect plus `ovr_sent` (presumably the overall
# sentiment — confirm against the dataset description).
y = df[['Acting', 'direction', 'Music',  'Genre', 'excitement','ovr_sent']] # Use a list of column names to select multiple columns

In [None]:
# Discrete search space for the SVC hyper-parameters. Each gene of a
# genetic-algorithm individual is drawn from exactly one of these lists
# (gene order: C, kernel, gamma, degree).
C_RANGE = [0.1, 1, 10, 100]
KERNEL_RANGE = ['linear', 'poly', 'rbf', 'sigmoid']
GAMMA_RANGE = ['scale', 'auto', 0.1, 1, 10]
DEGREE_RANGE = [2, 3, 4, 5]  # Only used when the kernel gene is 'poly'

In [None]:
def custom_mutation(individual, indpb):
    """Mutate an SVC hyper-parameter individual in place.

    Genes are positional: 0 = C, 1 = kernel, 2 = gamma, 3 = degree. Every
    position draws one random number; when it is below ``indpb`` the gene is
    resampled from its own range. The degree gene is only resampled while the
    kernel gene (possibly mutated a moment earlier in the same call) is
    'poly'. Positions past index 3 are never modified.

    Args:
        individual: Mutable sequence of genes, modified in place.
        indpb: Independent per-gene mutation probability.

    Returns:
        A one-element tuple holding the same ``individual`` object.
    """
    for position, _ in enumerate(individual):
        # One draw per gene, whether or not the gene ends up being changed
        if not random.random() < indpb:
            continue

        if position == 3:
            # Degree is only meaningful for the polynomial kernel
            if individual[1] == 'poly':
                individual[position] = random.choice(DEGREE_RANGE)
        elif position == 0:
            individual[position] = random.choice(C_RANGE)
        elif position == 1:
            individual[position] = random.choice(KERNEL_RANGE)
        elif position == 2:
            individual[position] = random.choice(GAMMA_RANGE)

    return (individual,)

In [None]:
def prepare_data(X, y_df):
    """Vectorise the text and label-encode every target column.

    Args:
        X: Series of documents; missing entries are treated as empty strings.
        y_df: DataFrame with one column per target.

    Returns:
        Tuple ``(X_tfidf, y_encoded, label_encoders)``: the TF-IDF matrix
        (at most 1000 features), a DataFrame of integer-encoded targets with
        the same column names as ``y_df``, and a dict mapping each column name
        to the ``LabelEncoder`` fitted on it.
    """
    # The vectoriser cannot handle NaN documents, so blank them out first
    documents = X.fillna('')
    tfidf = TfidfVectorizer(max_features=1000)
    X_tfidf = tfidf.fit_transform(documents)

    label_encoders = {}
    y_encoded = pd.DataFrame()
    for target_name in y_df.columns:
        encoder = LabelEncoder()
        # Labels are cast to str before encoding, so classes are string values
        as_text = y_df[target_name].astype(str)
        y_encoded[target_name] = encoder.fit_transform(as_text)
        label_encoders[target_name] = encoder

    return X_tfidf, y_encoded, label_encoders

def custom_f1_score(y_true, y_pred):
    """Return the micro-averaged F1 score of ``y_pred`` against ``y_true``."""
    micro_f1 = f1_score(y_true, y_pred, average='micro')
    return micro_f1

In [None]:
# DEAP names used by this cell and by the genetic-algorithm cells that follow.
# (The only other deap import in this notebook is commented out, so without
# this line `creator`, `base`, `tools` and `algorithms` are undefined.)
from deap import algorithms, base, creator, tools

# Single-objective fitness that is maximised, and a list-based individual that
# carries it. Guarded so that re-running this cell does not re-create (and
# overwrite) classes that already exist in `deap.creator`.
if not hasattr(creator, "FitnessMax"):
    creator.create("FitnessMax", base.Fitness, weights=(1.0,))
if not hasattr(creator, "Individual"):
    creator.create("Individual", list, fitness=creator.FitnessMax)

In [None]:
def create_toolbox(X, y_column):
    """Build the DEAP toolbox used to search SVC hyper-parameters.

    Individuals are 4-gene lists ``[C, kernel, gamma, degree]`` sampled from
    the module-level ``*_RANGE`` lists. Fitness is the mean weighted F1 of a
    3-fold cross-validation of an ``SVC`` on ``(X, y_column)``.

    Args:
        X: Feature matrix passed to ``cross_val_score``.
        y_column: Target labels for a single output.

    Returns:
        A ``base.Toolbox`` with ``individual``, ``population``, ``evaluate``,
        ``mate``, ``mutate`` and ``select`` registered.
    """

    def evaluate(individual):
        """Score one individual; any failure is reported and scored 0.0."""
        try:
            kernel = individual[1]
            svc_kwargs = {
                'C': individual[0],
                'kernel': kernel,
                'gamma': individual[2],
                # Fall back to degree 3 whenever the kernel is not 'poly'
                'degree': individual[3] if kernel == 'poly' else 3,
            }
            classifier = SVC(random_state=42, **svc_kwargs)
            fold_scores = cross_val_score(classifier, X, y_column, cv=3, scoring='f1_weighted')
            return (fold_scores.mean(),)
        except Exception as e:
            print(f"Error with parameters: {individual}")
            print(f"Error message: {str(e)}")
            return (0.0,)

    toolbox = base.Toolbox()

    # One random generator per gene
    gene_spaces = (
        ("C", C_RANGE),
        ("kernel", KERNEL_RANGE),
        ("gamma", GAMMA_RANGE),
        ("degree", DEGREE_RANGE),
    )
    for gene_name, gene_values in gene_spaces:
        toolbox.register(gene_name, random.choice, gene_values)

    # An individual is one cycle through the four gene generators
    gene_generators = (toolbox.C, toolbox.kernel, toolbox.gamma, toolbox.degree)
    toolbox.register("individual", tools.initCycle, creator.Individual,
                     gene_generators, n=1)
    toolbox.register("population", tools.initRepeat, list, toolbox.individual)

    toolbox.register("evaluate", evaluate)

    # Variation and selection operators
    toolbox.register("mate", tools.cxTwoPoint)
    toolbox.register("mutate", custom_mutation, indpb=0.2)
    toolbox.register("select", tools.selTournament, tournsize=3)

    return toolbox

In [None]:
def run_genetic_optimization(X, y_column, population_size=50, generations=50):
    """Run the genetic search for one target and return the best SVC parameters.

    Args:
        X: Feature matrix handed to ``create_toolbox``.
        y_column: Target labels for a single output.
        population_size: Number of individuals in the population.
        generations: Number of generations to evolve.

    Returns:
        Tuple ``(best_params, logbook)``. ``best_params`` is a dict with keys
        ``'C'``, ``'kernel'``, ``'gamma'`` and ``'degree'`` (degree is 3 unless
        the kernel is 'poly'); ``logbook`` is the DEAP logbook with the
        per-generation avg/std/min/max fitness.
    """
    toolbox = create_toolbox(X, y_column)

    # Create initial population
    population = toolbox.population(n=population_size)

    # Statistics setup
    stats = tools.Statistics(lambda ind: ind.fitness.values)
    stats.register("avg", np.mean)
    stats.register("std", np.std)
    stats.register("min", np.min)
    stats.register("max", np.max)

    # eaSimple replaces the whole population every generation (no elitism), so
    # the best individual ever evaluated may be absent from the final
    # population. The hall of fame keeps it regardless.
    hall_of_fame = tools.HallOfFame(1)

    # Run the algorithm
    _final_pop, logbook = algorithms.eaSimple(population, toolbox,
                                              cxpb=0.7,
                                              mutpb=0.2,
                                              ngen=generations,
                                              stats=stats,
                                              halloffame=hall_of_fame,
                                              verbose=True)

    # Best solution seen across all generations
    best_solution = hall_of_fame[0]

    # Convert to dictionary of parameters
    best_params = {
        'C': best_solution[0],
        'kernel': best_solution[1],
        'gamma': best_solution[2],
        'degree': best_solution[3] if best_solution[1] == 'poly' else 3
    }

    return best_params, logbook

In [None]:
def optimize_all_targets(X, y_df, population_size=50, generations=5):
    """Tune and fit one SVC per target column.

    For every column of ``y_df`` the genetic search is run, an ``SVC`` is
    fitted on the full ``X`` with the winning parameters, and the parameters
    are printed.

    Args:
        X: Feature matrix shared by all targets.
        y_df: DataFrame with one encoded target per column.
        population_size: Population size forwarded to the genetic search.
        generations: Number of generations forwarded to the genetic search.

    Returns:
        Tuple ``(best_params_per_target, models)``, both dicts keyed by column
        name.
    """
    tuned_params = {}
    fitted_models = {}

    for target in y_df.columns:
        print(f"\nOptimizing for target: {target}")
        labels = y_df[target]
        params, _history = run_genetic_optimization(
            X, labels,
            population_size=population_size,
            generations=generations,
        )
        tuned_params[target] = params

        # Final model for this target, trained on all rows
        classifier = SVC(**params, random_state=42)
        classifier.fit(X, labels)
        fitted_models[target] = classifier

        print(f"\nBest parameters for {target}:")
        for name in params:
            print(f"{name}: {params[name]}")

    return tuned_params, fitted_models

In [None]:
# Vectorise the text and label-encode every target column.
# NOTE: this call matches the two-argument `prepare_data(X, y_df)` defined
# above. A later cell redefines `prepare_data(df, aspect_column)` with a
# different signature and return value, so this cell must run before that one.
X_processed, y_encoded, label_encoders = prepare_data(
    df["clean_text"],
    y
)

In [None]:
# Run the genetic search for every target with the function defaults
# (population_size=50, generations=5) and fit one SVC per target.
best_params_per_target, trained_models = optimize_all_targets(X_processed, y_encoded)


Optimizing for target: Acting
gen	nevals	avg     	std      	min     	max     
0  	50    	0.698794	0.0316926	0.646345	0.734983
1  	42    	0.7164  	0.0238985	0.646345	0.734983
2  	40    	0.726797	0.014168 	0.659229	0.734983
3  	35    	0.7289  	0.00965129	0.698962	0.740314
4  	40    	0.730089	0.0182152 	0.659229	0.740314
5  	40    	0.733448	0.00831028	0.698962	0.740314

Best parameters for Acting:
C: 100
kernel: rbf
gamma: 0.1
degree: 3

Optimizing for target: direction
gen	nevals	avg     	std      	min     	max     
0  	50    	0.735588	0.0263466	0.708418	0.775701
1  	41    	0.757355	0.0172131	0.708418	0.775701
2  	35    	0.7646  	0.0140745	0.708418	0.775701
3  	36    	0.764472	0.014822 	0.708658	0.775701
4  	39    	0.762424	0.0155674	0.710915	0.775701
5  	33    	0.768676	0.0113324	0.710915	0.775701

Best parameters for direction:
C: 100
kernel: rbf
gamma: 1
degree: 3

Optimizing for target: Music
gen	nevals	avg     	std      	min     	max     
0  	50    	0.878722	0.0459482	0.817211	0.93

In [None]:
def prepare_data(df, aspect_column):
    """Return the text features and the labels for one aspect.

    Args:
        df: DataFrame containing a ``clean_text`` column and ``aspect_column``.
        aspect_column: Name of the label column to extract.

    Returns:
        Tuple ``(X, y)``: ``X`` is a single-column DataFrame (not a Series)
        holding ``clean_text``; ``y`` is the ``aspect_column`` Series.
    """
    # Double brackets keep the features as a DataFrame
    return df[['clean_text']], df[aspect_column]

In [None]:
def train_evaluate_model(X, y, aspect, sampling_strategy='none'):
    """Cross-validate the aspect-specific SVC with an optional resampling step.

    Runs a 5-fold stratified cross-validation. Inside every fold the TF-IDF
    vectoriser is fitted on the training part only (so no vocabulary or IDF
    statistics leak from the validation part), the training part is optionally
    resampled, and the SVC configured for ``aspect`` is trained and scored.

    Args:
        X: DataFrame with a ``clean_text`` column; missing texts are treated
            as empty strings.
        y: Labels for the aspect, aligned with ``X``.
        aspect: Lower-case aspect name selecting the SVC hyper-parameters; one
            of 'acting', 'direction', 'music', 'genre', 'excitement',
            'ovr_sent'.
        sampling_strategy: 'oversample' applies SMOTE and 'undersample' applies
            RandomUnderSampler to the training part of each fold; any other
            value leaves the training data unchanged.

    Returns:
        Tuple ``(mean_accuracy, std_accuracy)`` over the five folds.

    Raises:
        ValueError: If ``aspect`` is not one of the known aspects. Raised
            before any data is processed.
    """
    # Hyper-parameters per aspect (keys are lower-case aspect names)
    svc_params_by_aspect = {
        "acting": dict(C=100, kernel="rbf", gamma=0.1, degree=3),
        "direction": dict(C=10, kernel="rbf", gamma=0.1, degree=3),
        "music": dict(C=1, kernel="linear", gamma=0.1, degree=3),
        "genre": dict(C=1, kernel="sigmoid", gamma="scale", degree=3),
        "excitement": dict(C=10, kernel="rbf", gamma=0.1, degree=3),
        "ovr_sent": dict(C=10, kernel="rbf", gamma="scale", degree=3),
    }
    # Fail fast instead of discovering a bad aspect inside the first fold
    if aspect not in svc_params_by_aspect:
        raise ValueError(f"Unknown aspect: {aspect}")
    svc_params = svc_params_by_aspect[aspect]

    # NaN documents would make the vectoriser raise; treat them as empty text
    texts = X['clean_text'].fillna('').to_numpy()

    # Encode target labels as consecutive integers
    le = LabelEncoder()
    y_encoded = le.fit_transform(y)

    # Print the mapping between original and encoded labels
    print("Label Mapping:")
    for encoded_label, original_label in enumerate(le.classes_):
        print(f"{original_label} -> {encoded_label}")

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scores = []

    for fold, (train_idx, val_idx) in enumerate(skf.split(texts, y_encoded)):
        # Fit TF-IDF on the training part only, then project the validation part
        vectorizer = TfidfVectorizer(max_features=5000,
                                     ngram_range=(1, 2),
                                     min_df=2,
                                     max_df=0.95)
        X_train = vectorizer.fit_transform(texts[train_idx])
        X_val = vectorizer.transform(texts[val_idx])
        y_train, y_val = y_encoded[train_idx], y_encoded[val_idx]

        # Resample the training part only; validation data is left untouched
        if sampling_strategy == 'oversample':
            sampler = SMOTE(random_state=42)
            X_train, y_train = sampler.fit_resample(X_train, y_train)
        elif sampling_strategy == 'undersample':
            sampler = RandomUnderSampler(random_state=42)
            X_train, y_train = sampler.fit_resample(X_train, y_train)

        model = SVC(random_state=42, **svc_params)
        model.fit(X_train, y_train)

        # Evaluate on the held-out part (encoded labels)
        y_pred = model.predict(X_val)
        scores.append(accuracy_score(y_val, y_pred))

        print(f"\nFold {fold + 1} Results:")
        print(classification_report(y_val, y_pred))

    return np.mean(scores), np.std(scores)

In [None]:
def analyze_all_aspects_undersample(df, aspects):
    """Cross-validate every aspect using under-sampling of the training folds.

    Args:
        df: DataFrame holding the text and one label column per aspect.
        aspects: Iterable of aspect column names. Each name is lower-cased
            before being passed to ``train_evaluate_model``.

    Returns:
        Dict ``{aspect: {'undersample': {'mean_accuracy': ..., 'std_accuracy': ...}}}``.
    """
    techniques = ['undersample']
    summary = {}

    for aspect in aspects:
        print(f"\nAnalyzing aspect: {aspect}")
        features, labels = prepare_data(df, aspect)

        per_technique = {}
        for technique in techniques:
            print(f"\nUsing {technique} sampling:")
            mean_acc, std_acc = train_evaluate_model(
                features, labels, aspect.lower(), technique
            )
            per_technique[technique] = {
                'mean_accuracy': mean_acc,
                'std_accuracy': std_acc
            }
            print(f"Mean Accuracy: {mean_acc:.4f} (±{std_acc:.4f})")

        summary[aspect] = per_technique

    return summary

In [None]:
# Evaluate all six aspects with random under-sampling applied to the training
# part of each cross-validation fold.
aspects = ['Acting', 'direction', 'Music',  'Genre', 'excitement','ovr_sent']
results = analyze_all_aspects_undersample(df, aspects)

NameError: name 'analyze_all_aspects_undersample' is not defined

In [None]:
def analyze_all_aspects_oversample(df, aspects):
    """Cross-validate every aspect using SMOTE over-sampling of the training folds.

    Args:
        df: DataFrame holding the text and one label column per aspect.
        aspects: Iterable of aspect column names. Each name is lower-cased
            before being passed to ``train_evaluate_model``.

    Returns:
        Dict ``{aspect: {'oversample': {'mean_accuracy': ..., 'std_accuracy': ...}}}``.
    """
    techniques = ['oversample']
    summary = {}

    for aspect in aspects:
        print(f"\nAnalyzing aspect: {aspect}")
        features, labels = prepare_data(df, aspect)

        per_technique = {}
        for technique in techniques:
            print(f"\nUsing {technique} sampling:")
            mean_acc, std_acc = train_evaluate_model(
                features, labels, aspect.lower(), technique
            )
            per_technique[technique] = {
                'mean_accuracy': mean_acc,
                'std_accuracy': std_acc
            }
            print(f"Mean Accuracy: {mean_acc:.4f} (±{std_acc:.4f})")

        summary[aspect] = per_technique

    return summary

In [None]:
# Evaluate all six aspects with SMOTE over-sampling applied to the training
# part of each cross-validation fold.
# NOTE: this rebinds `results`, replacing the under-sampling results.
aspects = ['Acting', 'direction', 'Music',  'Genre', 'excitement','ovr_sent']
results = analyze_all_aspects_oversample(df, aspects)


Analyzing aspect: Acting

Using oversample sampling:
Label Mapping:
-1 -> 0
0 -> 1
1 -> 2

Fold 1 Results:
              precision    recall  f1-score   support

           0       0.19      0.15      0.17        34
           1       0.88      0.92      0.90       770
           2       0.68      0.61      0.64       221

    accuracy                           0.82      1025
   macro avg       0.58      0.56      0.57      1025
weighted avg       0.81      0.82      0.82      1025


Fold 2 Results:
              precision    recall  f1-score   support

           0       0.14      0.12      0.13        33
           1       0.88      0.90      0.89       769
           2       0.65      0.60      0.62       222

    accuracy                           0.81      1024
   macro avg       0.55      0.54      0.55      1024
weighted avg       0.80      0.81      0.81      1024


Fold 3 Results:
              precision    recall  f1-score   support

           0       0.19      0.21      0.

In [None]:
# From here on only four aspects are modelled; 'Genre' and 'excitement' are
# no longer in the list.
aspects = ['Acting', 'direction', 'Music','ovr_sent']

In [None]:
def train_final_models_grid_search(df, aspects):
    """Grid-search a TF-IDF + SVC pipeline for every aspect.

    Each aspect gets a stratified 80/20 train/test split. The pipeline is tuned
    on the training part with 5-fold stratified cross-validation (accuracy),
    and the refitted best pipeline is reported on both parts.

    Args:
        df: DataFrame with a ``clean_text`` column and one label column per
            aspect.
        aspects: Iterable of aspect column names.

    Returns:
        Tuple ``(best_models, train_reports, test_reports)``, all dicts keyed
        by the lower-cased aspect name. ``best_models[k]`` is
        ``{"model": fitted pipeline, "label_encoder": LabelEncoder}``; the
        report dicts hold the classification-report strings.
    """
    # Search space for the 'clf' step of the pipeline
    search_space = {
        'clf__kernel': ['linear', 'rbf', 'sigmoid'],
        'clf__C': [1,10],
        'clf__gamma': [0.01, 0.1, 1]
    }

    best_models, train_reports, test_reports = {}, {}, {}

    for aspect in aspects:
        print(f"\n=== Processing aspect: {aspect} ===")
        features, labels = prepare_data(df, aspect)

        # Integer-encode the labels; the encoder is returned for decoding
        le = LabelEncoder()
        labels_encoded = le.fit_transform(labels)

        # Stratified 80/20 hold-out split
        X_train, X_test, y_train, y_test = train_test_split(
            features, labels_encoded,
            test_size=0.2, random_state=42, stratify=labels_encoded,
        )

        # Raw text goes in; vectorisation happens inside the pipeline
        text_clf = Pipeline([
            ('tfidf', TfidfVectorizer(max_features=5000,
                                      ngram_range=(1, 2),
                                      min_df=2,
                                      max_df=0.95)),
            ('clf', SVC(random_state=42))
        ])

        folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        search = GridSearchCV(text_clf, search_space, cv=folds,
                              scoring='accuracy', n_jobs=-1)
        search.fit(X_train['clean_text'], y_train)

        print(f"Best parameters for aspect '{aspect}': {search.best_params_}")

        best_estimator = search.best_estimator_

        # Performance on the data the model was trained on
        train_predictions = best_estimator.predict(X_train['clean_text'])
        print("Training Accuracy: {:.4f}".format(accuracy_score(y_train, train_predictions)))
        train_report = classification_report(y_train, train_predictions)
        print("Training Classification Report:\n", train_report)

        # Performance on the held-out data
        test_predictions = best_estimator.predict(X_test['clean_text'])
        print("Test Accuracy: {:.4f}".format(accuracy_score(y_test, test_predictions)))
        test_report = classification_report(y_test, test_predictions)
        print("Test Classification Report:\n", test_report)

        key = aspect.lower()
        best_models[key] = {
            "model": best_estimator,
            "label_encoder": le
        }
        train_reports[key] = train_report
        test_reports[key] = test_report

    return best_models, train_reports, test_reports

In [None]:
def predict_all_aspects(sentence, final_models):
    """Predict the label of ``sentence`` for every aspect.

    Args:
        sentence: Raw text; it is passed to each model as a one-element list.
        final_models: Dict mapping aspect name to
            ``{"model": ..., "label_encoder": ...}``.

    Returns:
        Dict mapping each aspect name to the decoded (original) label.
    """

    def _decode_prediction(entry):
        model, encoder = entry['model'], entry['label_encoder']
        encoded = model.predict([sentence])[0]
        # Map the integer class back to the original label value
        return encoder.inverse_transform([encoded])[0]

    return {
        aspect: _decode_prediction(entry)
        for aspect, entry in final_models.items()
    }

In [None]:
# Tune and evaluate one pipeline per aspect on `df` (80/20 split inside the
# function); `best_models` is what `predict_all_aspects` consumes.
best_models, train_reports, test_reports = train_final_models_grid_search(df, aspects)


=== Processing aspect: Acting ===
Best parameters for aspect 'Acting': {'clf__C': 1, 'clf__gamma': 0.01, 'clf__kernel': 'linear'}
Training Accuracy: 0.9158
Training Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.21      0.35       132
           1       0.91      0.99      0.95      2966
           2       0.93      0.76      0.84       843

    accuracy                           0.92      3941
   macro avg       0.95      0.66      0.71      3941
weighted avg       0.92      0.92      0.91      3941

Test Accuracy: 0.8438
Test Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.03      0.06        33
           1       0.86      0.95      0.91       742
           2       0.74      0.59      0.66       211

    accuracy                           0.84       986
   macro avg       0.87      0.52      0.54       986
weighted avg       0.84      0.84      0.82       98

In [None]:
import pickle

# # Save the models to a pickle file
# with open('sentiment_models_SVM.pkl', 'wb') as f:
#     pickle.dump(best_models, f)

# Load the models from the pickle file
# NOTE(review): the dump above is commented out, so this load only succeeds if
# 'sentiment_models_SVM.pkl' already exists in the working directory.
# SECURITY: pickle.load can execute arbitrary code from the file; only load
# pickles that you created yourself.
with open('sentiment_models_SVM.pkl', 'rb') as f:
    loaded_models = pickle.load(f)

# Function to predict all aspects using the loaded models
def predict_all_aspects_loaded(sentence, loaded_models):
    """Predict the label of ``sentence`` for every aspect using loaded models.

    Args:
        sentence: Raw text; it is passed to each model as a one-element list.
        loaded_models: Dict mapping aspect name to
            ``{"model": ..., "label_encoder": ...}``.

    Returns:
        Dict mapping each aspect name to the decoded (original) label.
    """
    decoded_by_aspect = {}
    for aspect, bundle in loaded_models.items():
        classifier, encoder = bundle['model'], bundle['label_encoder']
        encoded_batch = classifier.predict([sentence])
        # Only one sentence was submitted, so take the first prediction
        decoded_batch = encoder.inverse_transform([encoded_batch[0]])
        decoded_by_aspect[aspect] = decoded_batch[0]
    return decoded_by_aspect



In [None]:
# Example usage with a sample sentence
sample_sentence = "Yuvam bgm nalla potu irukaaru"
predictions = predict_all_aspects_loaded(sample_sentence, loaded_models)

print("\nPredictions for the sample sentence:")
for aspect, pred in predictions.items():
    print(f"{aspect.capitalize()}: {pred}")


Predictions for the sample sentence:
Acting: 0
Direction: 0
Music: 1
Ovr_sent: 1


In [None]:
# Same demo, but using the in-memory `best_models` from the grid search.
# NOTE(review): the output stored under this cell (a "Models loaded from
# final_models_dl.pkl" line and a ValueError) does not correspond to this code
# as written — presumably stale; re-run the cell to refresh it.
sample_sentence = "superster nalla illa "
predictions = predict_all_aspects(sample_sentence, best_models)

print("\nPredictions for the sample sentence:")
for aspect, pred in predictions.items():
    print(f"{aspect.capitalize()}: {pred}")

Models loaded from final_models_dl.pkl


ValueError: Unrecognized data type: x=['superster nalla illa '] (of type <class 'list'>)

In [None]:
def custom_tokenizer(text):
    """Split ``text`` on runs of whitespace and return the list of tokens."""
    tokens = text.split()
    return tokens

In [None]:
# Pre-split train/test sets used by the train/test grid-search function below.
train_df = pd.read_csv("train_data.csv")
test_df = pd.read_csv("test_data.csv")
def train_final_models_grid_search(train_df, test_df, aspects):
    """
    Tune, cross-validate and evaluate one TF-IDF + SVC pipeline per aspect.

    For each aspect:
    1. Drop training rows whose text is missing or blank; the labels are
       filtered with the same mask so features and labels stay aligned.
    2. Perform grid search on the training data (10-fold stratified CV,
       balanced accuracy) to find the best parameters.
    3. Using the best parameters, perform 10-fold cross-validation,
       returning both the average score and the list of fold estimators.
    4. Select the fold estimator with the highest validation score and
       evaluate it on the training and test data.

    Args:
        train_df: Training DataFrame; ``prepare_data`` must return a frame
            with a ``clean_text`` column for it.
        test_df: Test DataFrame with the same columns. Missing test texts are
            treated as empty strings so every test row is still scored.
        aspects: Iterable of aspect column names.

    Returns:
        Tuple ``(best_models, train_reports, test_reports)``, all dicts keyed
        by the lower-cased aspect name. ``best_models[k]`` is
        ``{"model": fitted pipeline, "label_encoder": LabelEncoder}`` where the
        encoder is fitted on the training labels only.

    Raises:
        ValueError: From ``LabelEncoder.transform`` if the test set contains a
            label that never occurs in the (filtered) training set.
    """
    best_models = {}
    train_reports = {}
    test_reports = {}

    # Parameter grid for the classifier
    param_grid = {
        'clf__kernel': ['linear', 'rbf', 'sigmoid'],
        'clf__C': [0.1, 1, 10, 100],
        'clf__gamma': ['scale', 'auto', 0.01, 0.1, 1]
    }

    def _build_pipeline():
        """Return a fresh, unfitted TF-IDF + class-balanced SVC pipeline."""
        return Pipeline([
            ('tfidf', TfidfVectorizer(max_features=5000,
                                      ngram_range=(1, 2),
                                      min_df=5,
                                      max_df=0.90,
                                      tokenizer=custom_tokenizer)),
            ('clf', SVC(random_state=42, class_weight='balanced'))
        ])

    for aspect in aspects:
        print(f"\n=== Processing aspect: {aspect} ===")
        # Prepare data for current aspect
        X_train, y_train = prepare_data(train_df, aspect)
        X_test, y_test = prepare_data(test_df, aspect)

        # Remove missing/blank training texts BEFORE encoding the labels, and
        # apply the same mask to the labels so X and y keep the same length.
        train_text = X_train['clean_text']
        has_text = train_text.notna() & (train_text.str.strip() != '')
        train_text = train_text[has_text]
        y_train = y_train[has_text]

        # Keep every test row; a missing text becomes an empty document
        test_text = X_test['clean_text'].fillna('')

        # Fit the encoder on training labels only and reuse it for the test
        # labels, so both sets share one label -> integer mapping.
        le = LabelEncoder()
        y_train_encoded = le.fit_transform(y_train)
        y_test_encoded = le.transform(y_test)

        # 10-fold CV strategy for grid search
        cv_strategy = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

        # --- Step 1: Grid Search ---
        grid = GridSearchCV(_build_pipeline(), param_grid, cv=cv_strategy,
                            scoring='balanced_accuracy', n_jobs=-1, return_train_score=True)
        grid.fit(train_text, y_train_encoded)
        best_params = grid.best_params_
        print(f"Best parameters for aspect '{aspect}': {best_params}")

        # --- Step 2: Build a new pipeline using the best parameters ---
        best_pipeline = _build_pipeline().set_params(**best_params)

        # --- Step 3: Perform 10-Fold Cross Validation with estimator return ---
        cv_results = cross_validate(best_pipeline, train_text, y_train_encoded,
                                    cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=42),
                                    scoring='balanced_accuracy', return_train_score=True,
                                    return_estimator=True, n_jobs=-1)
        avg_cv_score_test = cv_results['test_score'].mean()
        avg_cv_score_train = cv_results['train_score'].mean()
        print(avg_cv_score_test)
        print(avg_cv_score_train)

        print(f"Average 10-fold CV test Accuracy for aspect '{aspect}': {avg_cv_score_test:.4f}")
        print(f"Average 10-fold CV train Accuracy for aspect '{aspect}': {avg_cv_score_train:.4f}")

        # --- Step 4: Select the best estimator from the CV folds ---
        # NOTE(review): this estimator was trained on nine of the ten folds and
        # picked by its own validation score; `grid.best_estimator_` (refit on
        # all training rows) may be a sounder final model — confirm intent.
        best_index = np.argmax(cv_results['test_score'])
        best_cv_estimator = cv_results['estimator'][best_index]

        # Evaluate the selected estimator on the training data (optional)
        y_train_pred = best_cv_estimator.predict(train_text)
        train_acc = accuracy_score(y_train_encoded, y_train_pred)
        print("Final Training Accuracy: {:.4f}".format(train_acc))
        train_report = classification_report(y_train_encoded, y_train_pred)
        print("Final Training Classification Report:\n", train_report)

        # --- Step 5: Evaluate on Test Data ---
        y_test_pred = best_cv_estimator.predict(test_text)
        test_acc = accuracy_score(y_test_encoded, y_test_pred)
        print("Test Accuracy: {:.4f}".format(test_acc))
        test_report = classification_report(y_test_encoded, y_test_pred)
        print("Test Classification Report:\n", test_report)

        # Save results for the current aspect
        best_models[aspect.lower()] = {
            "model": best_cv_estimator,
            "label_encoder": le
        }
        train_reports[aspect.lower()] = train_report
        test_reports[aspect.lower()] = test_report

    return best_models, train_reports, test_reports


In [None]:
# Aspects to model with the train/test grid search (same four as selected earlier).
aspects = ['Acting', 'direction', 'Music','ovr_sent']

In [None]:
# Tune, cross-validate and evaluate one pipeline per aspect on the pre-split data.
# NOTE: rebinds `best_models`, `train_reports` and `test_reports`.
best_models, train_reports, test_reports = train_final_models_grid_search(train_df,test_df, aspects)


=== Processing aspect: Acting ===




Best parameters for aspect 'Acting': {'clf__C': 1, 'clf__gamma': 1, 'clf__kernel': 'sigmoid'}
0.6288446022269552
0.8182376005560892
Average 10-fold CV test Accuracy for aspect 'Acting': 0.6288
Average 10-fold CV train Accuracy for aspect 'Acting': 0.8182
Final Training Accuracy: 0.7673
Final Training Classification Report:
               precision    recall  f1-score   support

           0       0.19      0.89      0.31       131
           1       0.94      0.77      0.85      2962
           2       0.70      0.75      0.72       848

    accuracy                           0.77      3941
   macro avg       0.61      0.80      0.63      3941
weighted avg       0.87      0.77      0.80      3941

Test Accuracy: 0.7241
Test Classification Report:
               precision    recall  f1-score   support

           0       0.10      0.44      0.16        34
           1       0.91      0.75      0.82       746
           2       0.63      0.67      0.65       206

    accuracy            



Best parameters for aspect 'direction': {'clf__C': 1, 'clf__gamma': 0.1, 'clf__kernel': 'rbf'}
0.6511047158939374
0.8024938187587607
Average 10-fold CV test Accuracy for aspect 'direction': 0.6511
Average 10-fold CV train Accuracy for aspect 'direction': 0.8025
Final Training Accuracy: 0.7846
Final Training Classification Report:
               precision    recall  f1-score   support

           0       0.44      0.82      0.58       568
           1       0.96      0.78      0.86      3128
           2       0.55      0.77      0.64       245

    accuracy                           0.78      3941
   macro avg       0.65      0.79      0.69      3941
weighted avg       0.86      0.78      0.81      3941

Test Accuracy: 0.7201
Test Classification Report:
               precision    recall  f1-score   support

           0       0.33      0.69      0.44       124
           1       0.93      0.74      0.82       804
           2       0.36      0.55      0.43        58

    accuracy     



Best parameters for aspect 'Music': {'clf__C': 100, 'clf__gamma': 'auto', 'clf__kernel': 'sigmoid'}
0.7241765038732287
0.8702183734530772
Average 10-fold CV test Accuracy for aspect 'Music': 0.7242
Average 10-fold CV train Accuracy for aspect 'Music': 0.8702
Final Training Accuracy: 0.9457
Final Training Classification Report:
               precision    recall  f1-score   support

           0       0.45      0.92      0.61        89
           1       0.98      0.98      0.98      3446
           2       0.89      0.69      0.78       406

    accuracy                           0.95      3941
   macro avg       0.77      0.86      0.79      3941
weighted avg       0.96      0.95      0.95      3941

Test Accuracy: 0.9432
Test Classification Report:
               precision    recall  f1-score   support

           0       0.29      0.56      0.38        18
           1       0.98      0.97      0.98       867
           2       0.87      0.74      0.80       101

    accuracy        



Best parameters for aspect 'ovr_sent': {'clf__C': 1, 'clf__gamma': 'scale', 'clf__kernel': 'rbf'}
0.6322870910571186
0.909353108355892
Average 10-fold CV test Accuracy for aspect 'ovr_sent': 0.6323
Average 10-fold CV train Accuracy for aspect 'ovr_sent': 0.9094
Final Training Accuracy: 0.8754
Final Training Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.90      0.88      1346
           1       0.66      0.91      0.77       567
           2       0.98      0.85      0.91      2028

    accuracy                           0.88      3941
   macro avg       0.84      0.89      0.85      3941
weighted avg       0.89      0.88      0.88      3941

Test Accuracy: 0.6836
Test Classification Report:
               precision    recall  f1-score   support

           0       0.68      0.70      0.69       348
           1       0.34      0.47      0.40       139
           2       0.84      0.73      0.78       499

    accuracy     

In [None]:
def prepare_data(df, aspect_column):
    """Return the romanised-text features and the labels for one aspect.

    Args:
        df: DataFrame containing an ``R_clean_text`` column and
            ``aspect_column``.
        aspect_column: Name of the label column to extract.

    Returns:
        Tuple ``(X, y)``: ``X`` is a single-column DataFrame (not a Series)
        holding ``R_clean_text``; ``y`` is the ``aspect_column`` Series.
    """
    # Double brackets keep the features as a DataFrame
    return df[['R_clean_text']], df[aspect_column]

In [None]:
# Romanised train/test sets.
# NOTE: rebinds `train_df` and `test_df`, replacing the frames loaded from
# train_data.csv / test_data.csv in the earlier cell.
train_df = pd.read_csv("Romanised_train_data.csv")
test_df = pd.read_csv("Romanised_test_data.csv")
def _build_svm_text_pipeline():
    """Return a fresh, unfitted TF-IDF -> SVC pipeline with the shared base settings."""
    return Pipeline([
        ('tfidf', TfidfVectorizer(max_features=5000,
                                  ngram_range=(1, 2),
                                  min_df=5,
                                  max_df=0.90,
                                  tokenizer=custom_tokenizer)),
        ('clf', SVC(random_state=42, class_weight='balanced'))
    ])


def _non_empty_text_and_labels(df, aspect):
    """Return (texts, labels) for `aspect`, dropping rows whose text is missing or blank.

    The same boolean mask is applied to both the text and the labels so the
    two always stay aligned row-for-row.
    """
    X, y = prepare_data(df, aspect)
    text = X['R_clean_text']
    keep = text.notna() & (text.str.strip() != '')
    return text[keep], y[keep]


def train_final_models_grid_search(train_df, test_df, aspects):
    """
    Train and evaluate one TF-IDF + SVC model per aspect.

    For each aspect:
    1. Perform grid search on the training data to find the best parameters.
    2. Using the best parameters, perform 10-fold cross-validation,
       keeping the per-fold scores and the fitted fold estimators.
    3. Select the fold estimator with the highest held-out fold score and
       evaluate it on the full training data and on the test data.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Frames containing 'R_clean_text' and one label column per aspect.
        Rows whose text is missing or whitespace-only are skipped in both.
    aspects : iterable of str
        Label column names to model.

    Returns
    -------
    tuple of (dict, dict, dict)
        ``best_models``   : aspect.lower() -> {"model": fitted pipeline,
                                               "label_encoder": LabelEncoder}
        ``train_reports`` : aspect.lower() -> classification report (str)
        ``test_reports``  : aspect.lower() -> classification report (str)

    Raises
    ------
    ValueError
        If the test data contains a label that never occurs in the training
        data (raised by ``LabelEncoder.transform``).
    """
    best_models = {}
    train_reports = {}
    test_reports = {}

    # Parameter grid for the classifier
    param_grid = {
        'clf__kernel': ['linear', 'rbf', 'sigmoid'],
        'clf__C': [0.1, 1, 10, 100],
        'clf__gamma': ['scale', 'auto', 0.01, 0.1, 1]
    }

    # One splitter serves both the grid search and the follow-up CV: with a
    # fixed integer random_state every split() call yields the same folds.
    cv_strategy = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

    for aspect in aspects:
        print(f"\n=== Processing aspect: {aspect} ===")

        # Clean text and labels together so they cannot fall out of alignment.
        X_train, y_train = _non_empty_text_and_labels(train_df, aspect)
        X_test, y_test = _non_empty_text_and_labels(test_df, aspect)

        # Fit the encoder on the training labels ONLY and reuse that mapping
        # for the test labels; re-fitting on the test split could assign
        # different integer codes and would overwrite the stored encoder.
        le = LabelEncoder()
        y_train_encoded = le.fit_transform(y_train)
        y_test_encoded = le.transform(y_test)

        # --- Step 1: Grid Search ---
        grid = GridSearchCV(_build_svm_text_pipeline(), param_grid, cv=cv_strategy,
                            scoring='balanced_accuracy', n_jobs=-1, return_train_score=True)
        grid.fit(X_train, y_train_encoded)
        best_params = grid.best_params_
        print(f"Best parameters for aspect '{aspect}': {best_params}")

        # --- Step 2: Build a new pipeline using the best parameters ---
        # best_params uses the same 'clf__*' keys that Pipeline.set_params accepts.
        best_pipeline = _build_svm_text_pipeline().set_params(**best_params)

        # --- Step 3: Perform 10-Fold Cross Validation with estimator return ---
        cv_results = cross_validate(best_pipeline, X_train, y_train_encoded,
                                    cv=cv_strategy,
                                    scoring='balanced_accuracy', return_train_score=True,
                                    return_estimator=True, n_jobs=-1)
        avg_cv_score_test = cv_results['test_score'].mean()
        avg_cv_score_train = cv_results['train_score'].mean()

        # The CV metric is balanced accuracy (see `scoring` above), so label it
        # as such; it is not directly comparable to the plain accuracy below.
        print(f"Average 10-fold CV test Balanced Accuracy for aspect '{aspect}': {avg_cv_score_test:.4f}")
        print(f"Average 10-fold CV train Balanced Accuracy for aspect '{aspect}': {avg_cv_score_train:.4f}")

        # --- Step 4: Select the best estimator from the CV folds ---
        # NOTE(review): this estimator was fitted on 9 of the 10 folds and is
        # picked by its own held-out fold score, so the choice is optimistic.
        # grid.best_estimator_ (refit on all training rows) may be a sounder
        # final model - confirm before reporting results.
        best_index = int(np.argmax(cv_results['test_score']))
        best_cv_estimator = cv_results['estimator'][best_index]

        # Evaluate the selected estimator on the training data (optional)
        y_train_pred = best_cv_estimator.predict(X_train)
        train_acc = accuracy_score(y_train_encoded, y_train_pred)
        print("Final Training Accuracy: {:.4f}".format(train_acc))
        train_report = classification_report(y_train_encoded, y_train_pred)
        print("Final Training Classification Report:\n", train_report)

        # --- Step 5: Evaluate on Test Data ---
        y_test_pred = best_cv_estimator.predict(X_test)
        test_acc = accuracy_score(y_test_encoded, y_test_pred)
        print("Test Accuracy: {:.4f}".format(test_acc))
        test_report = classification_report(y_test_encoded, y_test_pred)
        print("Test Classification Report:\n", test_report)

        # Save results for the current aspect
        best_models[aspect.lower()] = {
            "model": best_cv_estimator,
            "label_encoder": le
        }
        train_reports[aspect.lower()] = train_report
        test_reports[aspect.lower()] = test_report

    return best_models, train_reports, test_reports


In [None]:
# Run the per-aspect grid search, cross-validation and evaluation.
# `aspects` is not defined in this cell, so it must already exist in the kernel.
# Each returned dict is keyed by the lower-cased aspect name.
best_models, train_reports, test_reports = train_final_models_grid_search(train_df,test_df, aspects)


=== Processing aspect: Acting ===




Best parameters for aspect 'Acting': {'clf__C': 1, 'clf__gamma': 1, 'clf__kernel': 'sigmoid'}
0.6439033268445034
0.8251112211299987
Average 10-fold CV test Accuracy for aspect 'Acting': 0.6439
Average 10-fold CV train Accuracy for aspect 'Acting': 0.8251
Final Training Accuracy: 0.7808
Final Training Classification Report:
               precision    recall  f1-score   support

           0       0.21      0.89      0.34       131
           1       0.94      0.78      0.86      2962
           2       0.69      0.75      0.72       848

    accuracy                           0.78      3941
   macro avg       0.61      0.81      0.64      3941
weighted avg       0.86      0.78      0.81      3941

Test Accuracy: 0.7505
Test Classification Report:
               precision    recall  f1-score   support

           0       0.12      0.47      0.20        34
           1       0.92      0.78      0.84       746
           2       0.64      0.67      0.66       206

    accuracy            



Best parameters for aspect 'direction': {'clf__C': 100, 'clf__gamma': 'auto', 'clf__kernel': 'rbf'}
0.6686530734553268
0.8106188945335802
Average 10-fold CV test Accuracy for aspect 'direction': 0.6687
Average 10-fold CV train Accuracy for aspect 'direction': 0.8106
Final Training Accuracy: 0.7896
Final Training Classification Report:
               precision    recall  f1-score   support

           0       0.46      0.84      0.60       568
           1       0.96      0.78      0.86      3128
           2       0.51      0.80      0.62       245

    accuracy                           0.79      3941
   macro avg       0.65      0.80      0.69      3941
weighted avg       0.86      0.79      0.81      3941

Test Accuracy: 0.7059
Test Classification Report:
               precision    recall  f1-score   support

           0       0.31      0.65      0.42       124
           1       0.93      0.72      0.81       804
           2       0.34      0.62      0.44        58

    accuracy



Best parameters for aspect 'Music': {'clf__C': 1, 'clf__gamma': 1, 'clf__kernel': 'sigmoid'}
0.7189723658527452
0.8905220069044676
Average 10-fold CV test Accuracy for aspect 'Music': 0.7190
Average 10-fold CV train Accuracy for aspect 'Music': 0.8905
Final Training Accuracy: 0.9137
Final Training Classification Report:
               precision    recall  f1-score   support

           0       0.28      0.92      0.43        89
           1       0.99      0.92      0.96      3446
           2       0.77      0.82      0.80       406

    accuracy                           0.91      3941
   macro avg       0.68      0.89      0.73      3941
weighted avg       0.95      0.91      0.93      3941

Test Accuracy: 0.9057
Test Classification Report:
               precision    recall  f1-score   support

           0       0.16      0.56      0.24        18
           1       0.99      0.92      0.95       867
           2       0.74      0.83      0.79       101

    accuracy               



Best parameters for aspect 'ovr_sent': {'clf__C': 100, 'clf__gamma': 0.01, 'clf__kernel': 'sigmoid'}
0.6413702035855965
0.7950802730670701
Average 10-fold CV test Accuracy for aspect 'ovr_sent': 0.6414
Average 10-fold CV train Accuracy for aspect 'ovr_sent': 0.7951
Final Training Accuracy: 0.7686
Final Training Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.79      0.79      1346
           1       0.47      0.81      0.59       567
           2       0.94      0.74      0.83      2028

    accuracy                           0.77      3941
   macro avg       0.73      0.78      0.74      3941
weighted avg       0.82      0.77      0.78      3941

Test Accuracy: 0.6613
Test Classification Report:
               precision    recall  f1-score   support

           0       0.70      0.66      0.68       348
           1       0.31      0.55      0.39       139
           2       0.84      0.69      0.76       499

    accuracy 