<a href="https://colab.research.google.com/github/AshvinVignesh/Final_year/blob/main/final_year_XGBOOST_and_sampling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier




In [None]:
import pickle
import os

def custom_tokenizer(text):
    """Tokenise *text* by splitting it on runs of whitespace.

    Passed as the ``tokenizer`` of the TF-IDF vectorizers built in this
    notebook, so the text is split only on whitespace.
    """
    tokens = text.split()
    return tokens

def save_models(models, filename="final_models.pkl"):
    """Pickle *models* to *filename* and print a confirmation line.

    Args:
        models: Object to serialise; in this notebook a dict mapping each
            aspect to its fitted pipeline and label encoder.
        filename: Destination path; an existing file is overwritten.

    Note:
        pickle stores functions by reference, so any function held by the
        models (such as a vectorizer's tokenizer) must still be the same
        object under its module-level name when this is called.
    """
    with open(filename, mode='wb') as sink:
        pickle.dump(models, sink)
    print(f"Models saved to {filename}")

def load_models(filename="final_models.pkl"):
    """Read back an object previously written with pickle and return it.

    Args:
        filename: Path of the pickle file to read.

    Returns:
        The unpickled object.

    Raises:
        FileNotFoundError: If *filename* does not exist.
    """
    if os.path.exists(filename):
        # SECURITY: unpickling executes arbitrary code embedded in the file;
        # only load files produced by a trusted source.
        with open(filename, mode='rb') as source:
            loaded = pickle.load(source)
        print(f"Models loaded from {filename}")
        return loaded
    raise FileNotFoundError(f"The file {filename} does not exist.")

def predict_all_aspects_from_file(sentence, model_filename="final_models.pkl"):
    """Predict a label for every saved aspect model on a single sentence.

    The model file is re-read on every call.

    Args:
        sentence: Text to classify; passed to each model as a one-item batch.
        model_filename: Pickle file mapping each aspect to a dict with the
            keys ``'model'`` and ``'label_encoder'``.

    Returns:
        Dict mapping each aspect to its predicted label, decoded through that
        aspect's label encoder.
    """
    def _decode(entry):
        classifier, encoder = entry['model'], entry['label_encoder']
        encoded = classifier.predict([sentence])[0]
        return encoder.inverse_transform([encoded])[0]

    saved = load_models(model_filename)
    return {aspect: _decode(entry) for aspect, entry in saved.items()}

In [None]:
def prepare_data(df, aspect_column, text_column='clean_text'):
    """Split *df* into a one-column feature frame and a target series.

    Args:
        df: Source DataFrame.
        aspect_column: Name of the column holding the target labels.
        text_column: Name of the column holding the input text. Defaults to
            ``'clean_text'``, the column this function originally hard-coded.

    Returns:
        Tuple ``(X, y)``: ``X`` is a DataFrame containing only *text_column*
        and ``y`` is the *aspect_column* Series.

    Note:
        A later cell redefines ``prepare_data`` with a different text column;
        passing *text_column* explicitly makes a call independent of which
        definition was executed last.
    """
    X = df[[text_column]]
    y = df[aspect_column]
    return X, y

In [None]:
def prepare_data(df, aspect_column, text_column='R_clean_text'):
    """Split *df* into a one-column feature frame and a target series.

    Args:
        df: Source DataFrame.
        aspect_column: Name of the column holding the target labels.
        text_column: Name of the column holding the input text. Defaults to
            ``'R_clean_text'``, the column this function originally
            hard-coded.

    Returns:
        Tuple ``(X, y)``: ``X`` is a DataFrame containing only *text_column*
        and ``y`` is the *aspect_column* Series.

    Note:
        This definition replaces the earlier ``prepare_data`` in the same
        notebook; passing *text_column* explicitly makes a call independent
        of which definition was executed last.
    """
    X = df[[text_column]]
    y = df[aspect_column]
    return X, y

In [None]:

# Load the train/test splits from CSV files in the working directory; these
# frames are later passed to train_final_models_grid_search.
train_df = pd.read_csv("train_data.csv")
test_df = pd.read_csv("test_data.csv")

def _drop_blank_texts(df, text_column):
    """Return the rows of *df* whose *text_column* is neither missing nor whitespace-only."""
    df = df.dropna(subset=[text_column])
    return df[df[text_column].str.strip() != '']


def _build_text_xgb_pipeline(**clf_params):
    """Build the TF-IDF -> XGBoost pipeline; *clf_params* are forwarded to XGBClassifier."""
    return Pipeline([
        ('tfidf', TfidfVectorizer(max_features=5000,
                                  ngram_range=(1, 2),
                                  min_df=5,
                                  max_df=0.90,
                                  tokenizer=custom_tokenizer)),
        # `use_label_encoder` is intentionally not passed: the installed
        # XGBoost reports it as unused.
        ('clf', XGBClassifier(random_state=42,
                              eval_metric="logloss",
                              **clf_params)),
    ])


def train_final_models_grid_search(train_df, test_df, aspects, text_column='clean_text'):
    """Tune, cross-validate and evaluate one text classifier per aspect.

    For each aspect:
    1. Grid-search the XGBoost hyper-parameters on the training data
       (10-fold stratified CV, balanced accuracy).
    2. Re-run 10-fold cross-validation with the best parameters, keeping the
       fitted estimator of every fold.
    3. Pick the fold estimator with the highest validation score and report
       its accuracy and classification report on the training and test data.

    Args:
        train_df: Training DataFrame containing *text_column* and one label
            column per aspect.
        test_df: Test DataFrame with the same columns.
        aspects: Iterable of label column names; one model is trained per name.
        text_column: Name of the text column. Defaults to ``'clean_text'``,
            the column this function originally hard-coded.

    Returns:
        Tuple ``(best_models, train_reports, test_reports)``, each a dict
        keyed by the lower-cased aspect name. ``best_models[aspect]`` holds
        the selected pipeline under ``'model'`` and the encoder fitted on the
        training labels under ``'label_encoder'``.

    Raises:
        KeyError: If *text_column* or an aspect column is missing.
        ValueError: If the test data contains a label that never occurs in
            the training data (raised by ``LabelEncoder.transform``).
    """
    best_models = {}
    train_reports = {}
    test_reports = {}

    # Parameter grid for the XGBoost classifier
    param_grid = {
        'clf__learning_rate': [0.01, 0.1, 0.2],
        'clf__max_depth': [3, 5, 7],
        'clf__n_estimators': [100, 200, 300],
        'clf__subsample': [0.8, 1.0]
    }

    # Filter whole rows *before* extracting labels so that texts and labels
    # stay aligned. The filter does not depend on the aspect, so it is done
    # once. The columns are read directly instead of through prepare_data(),
    # which is defined twice in this notebook with different text columns.
    train_rows = _drop_blank_texts(train_df, text_column)
    test_rows = _drop_blank_texts(test_df, text_column)
    train_texts = train_rows[text_column]
    test_texts = test_rows[text_column]

    # A fixed integer seed makes the splitter yield the same folds every time,
    # so one instance serves both the grid search and the follow-up CV.
    cv_strategy = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

    for aspect in aspects:
        print(f"\n=== Processing aspect: {aspect} ===")

        # Fit the encoder on the training labels only and reuse that mapping
        # for the test labels; refitting on the test set could assign
        # different integers to the same class.
        le = LabelEncoder()
        y_train_encoded = le.fit_transform(train_rows[aspect])
        y_test_encoded = le.transform(test_rows[aspect])

        # --- Step 1: Grid Search ---
        grid = GridSearchCV(_build_text_xgb_pipeline(), param_grid, cv=cv_strategy,
                            scoring='balanced_accuracy', n_jobs=-1)
        grid.fit(train_texts, y_train_encoded)
        best_params = grid.best_params_
        print(f"Best parameters for aspect '{aspect}': {best_params}")

        # --- Step 2: Build a new pipeline using the best parameters ---
        # Grid keys look like 'clf__max_depth'; strip the step prefix to get
        # the XGBClassifier keyword.
        clf_params = {key.split('__', 1)[1]: value for key, value in best_params.items()}
        best_pipeline = _build_text_xgb_pipeline(**clf_params)

        # --- Step 3: Perform 10-Fold Cross Validation with estimator return ---
        cv_results = cross_validate(best_pipeline, train_texts, y_train_encoded,
                                    cv=cv_strategy,
                                    scoring='balanced_accuracy', return_train_score=True,
                                    return_estimator=True, n_jobs=-1)
        avg_cv_score_test = cv_results['test_score'].mean()
        avg_cv_score_train = cv_results['train_score'].mean()
        # The scorer is balanced accuracy, so label it as such; it is not
        # comparable with the plain accuracy printed further down.
        print(f"Average 10-fold CV test balanced accuracy for aspect '{aspect}': {avg_cv_score_test:.4f}")
        print(f"Average 10-fold CV train balanced accuracy for aspect '{aspect}': {avg_cv_score_train:.4f}")

        # --- Step 4: Select the best estimator from the CV folds ---
        # NOTE(review): this estimator was fitted on 9 of the 10 folds and is
        # chosen by its own validation score; consider refitting on all
        # training rows (e.g. grid.best_estimator_) for the final model.
        best_index = np.argmax(cv_results['test_score'])
        best_cv_estimator = cv_results['estimator'][best_index]

        # Evaluate the selected estimator on the training data (optional)
        y_train_pred = best_cv_estimator.predict(train_texts)
        train_acc = accuracy_score(y_train_encoded, y_train_pred)
        print("Final Training Accuracy: {:.4f}".format(train_acc))
        train_report = classification_report(y_train_encoded, y_train_pred)
        print("Final Training Classification Report:\n", train_report)

        # --- Step 5: Evaluate on Test Data ---
        y_test_pred = best_cv_estimator.predict(test_texts)
        test_acc = accuracy_score(y_test_encoded, y_test_pred)
        print("Test Accuracy: {:.4f}".format(test_acc))
        test_report = classification_report(y_test_encoded, y_test_pred)
        print("Test Classification Report:\n", test_report)

        # Save results for the current aspect
        aspect_key = aspect.lower()
        best_models[aspect_key] = {
            "model": best_cv_estimator,
            "label_encoder": le
        }
        train_reports[aspect_key] = train_report
        test_reports[aspect_key] = test_report

    return best_models, train_reports, test_reports

In [None]:
# Label columns to model; train_final_models_grid_search trains one classifier
# per entry and keys its results by the lower-cased name.
# NOTE(review): the mixed capitalisation presumably mirrors the CSV headers — confirm.
aspects = ['Acting', 'direction', 'Music','ovr_sent']

In [None]:
# Run the full tune / cross-validate / evaluate cycle for every aspect using
# the train_df and test_df currently bound in the notebook.
best_models, train_reports, test_reports = train_final_models_grid_search(train_df,test_df, aspects)


=== Processing aspect: Acting ===


Parameters: { "use_label_encoder" } are not used.



Best parameters for aspect 'Acting': {'clf__learning_rate': 0.2, 'clf__max_depth': 5, 'clf__n_estimators': 300, 'clf__subsample': 0.8}
Average 10-fold CV test Accuracy for aspect 'Acting': 0.5464
Average 10-fold CV train Accuracy for aspect 'Acting': 0.8060
Final Training Accuracy: 0.9312
Final Training Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.56      0.71       131
           1       0.93      0.99      0.96      2962
           2       0.93      0.79      0.85       848

    accuracy                           0.93      3941
   macro avg       0.95      0.78      0.84      3941
weighted avg       0.93      0.93      0.93      3941

Test Accuracy: 0.8529
Test Classification Report:
               precision    recall  f1-score   support

           0       0.09      0.03      0.04        34
           1       0.88      0.96      0.92       746
           2       0.77      0.61      0.68       206

    accuracy         

Parameters: { "use_label_encoder" } are not used.



Best parameters for aspect 'direction': {'clf__learning_rate': 0.2, 'clf__max_depth': 7, 'clf__n_estimators': 300, 'clf__subsample': 0.8}
Average 10-fold CV test Accuracy for aspect 'direction': 0.5178
Average 10-fold CV train Accuracy for aspect 'direction': 0.8727
Final Training Accuracy: 0.9437
Final Training Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.76      0.84       568
           1       0.94      0.99      0.97      3128
           2       0.93      0.76      0.84       245

    accuracy                           0.94      3941
   macro avg       0.94      0.84      0.88      3941
weighted avg       0.94      0.94      0.94      3941

Test Accuracy: 0.8124
Test Classification Report:
               precision    recall  f1-score   support

           0       0.38      0.27      0.32       124
           1       0.87      0.93      0.90       804
           2       0.53      0.34      0.42        58

    accuracy

Parameters: { "use_label_encoder" } are not used.



Best parameters for aspect 'Music': {'clf__learning_rate': 0.2, 'clf__max_depth': 3, 'clf__n_estimators': 300, 'clf__subsample': 1.0}
Average 10-fold CV test Accuracy for aspect 'Music': 0.6700
Average 10-fold CV train Accuracy for aspect 'Music': 0.8664
Final Training Accuracy: 0.9772
Final Training Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.74      0.84        89
           1       0.98      1.00      0.99      3446
           2       0.98      0.84      0.90       406

    accuracy                           0.98      3941
   macro avg       0.97      0.86      0.91      3941
weighted avg       0.98      0.98      0.98      3941

Test Accuracy: 0.9544
Test Classification Report:
               precision    recall  f1-score   support

           0       0.50      0.17      0.25        18
           1       0.97      0.99      0.98       867
           2       0.80      0.77      0.79       101

    accuracy            

Parameters: { "use_label_encoder" } are not used.



Best parameters for aspect 'ovr_sent': {'clf__learning_rate': 0.2, 'clf__max_depth': 7, 'clf__n_estimators': 300, 'clf__subsample': 0.8}
Average 10-fold CV test Accuracy for aspect 'ovr_sent': 0.5622
Average 10-fold CV train Accuracy for aspect 'ovr_sent': 0.8215
Final Training Accuracy: 0.8622
Final Training Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.86      0.87      1346
           1       0.88      0.60      0.71       567
           2       0.85      0.94      0.89      2028

    accuracy                           0.86      3941
   macro avg       0.87      0.80      0.82      3941
weighted avg       0.86      0.86      0.86      3941

Test Accuracy: 0.6846
Test Classification Report:
               precision    recall  f1-score   support

           0       0.68      0.63      0.65       348
           1       0.37      0.18      0.24       139
           2       0.72      0.86      0.79       499

    accuracy   

In [None]:
# Notebook echo: display the selected pipeline and label encoder per aspect.
best_models

{'acting': {'model': Pipeline(steps=[('tfidf',
                   TfidfVectorizer(max_df=0.9, max_features=5000, min_df=5,
                                   ngram_range=(1, 2),
                                   tokenizer=<function custom_tokenizer at 0x7f81f8fd7a60>)),
                  ('clf',
                   XGBClassifier(base_score=None, booster=None, callbacks=None,
                                 colsample_bylevel=None, colsample_bynode=None,
                                 colsample_bytree=None, device=None,
                                 early_stopping_rounds=None,
                                 enable_categorical=False, ev...
                                 feature_types=None, gamma=None, grow_policy=None,
                                 importance_type=None,
                                 interaction_constraints=None, learning_rate=0.2,
                                 max_bin=None, max_cat_threshold=None,
                                 max_cat_to_onehot=None,

In [None]:
# Persist the trained pipelines and label encoders to the default pickle file.
# NOTE(review): in the recorded run this raised PicklingError because the
# tokenizer function held by the pipelines was no longer the object bound to
# __main__.custom_tokenizer — presumably its defining cell was re-run after
# training; confirm and retrain before saving.
save_models(best_models)

PicklingError: Can't pickle <function custom_tokenizer at 0x7f81f8fd7a60>: it's not the same object as __main__.custom_tokenizer

In [None]:
# custom_tokenizer and prepare_data must already be defined by earlier cells.
# Rebind train_df/test_df to the romanised splits; the frames loaded earlier
# are no longer reachable under these names.
train_df = pd.read_csv("Romanised_train_data.csv")
test_df = pd.read_csv("Romanised_test_data.csv")

def _drop_blank_texts(df, text_column):
    """Return the rows of *df* whose *text_column* is neither missing nor whitespace-only."""
    df = df.dropna(subset=[text_column])
    return df[df[text_column].str.strip() != '']


def _build_text_xgb_pipeline(**clf_params):
    """Build the TF-IDF -> XGBoost pipeline; *clf_params* are forwarded to XGBClassifier."""
    return Pipeline([
        ('tfidf', TfidfVectorizer(max_features=5000,
                                  ngram_range=(1, 2),
                                  min_df=5,
                                  max_df=0.90,
                                  tokenizer=custom_tokenizer)),
        # `use_label_encoder` is intentionally not passed: the installed
        # XGBoost reports it as unused.
        ('clf', XGBClassifier(random_state=42,
                              eval_metric="logloss",
                              **clf_params)),
    ])


def train_final_models_grid_search(train_df, test_df, aspects, text_column='R_clean_text'):
    """Tune, cross-validate and evaluate one text classifier per aspect.

    For each aspect:
    1. Grid-search the XGBoost hyper-parameters on the training data
       (10-fold stratified CV, balanced accuracy).
    2. Re-run 10-fold cross-validation with the best parameters, keeping the
       fitted estimator of every fold.
    3. Pick the fold estimator with the highest validation score and report
       its accuracy and classification report on the training and test data.

    Args:
        train_df: Training DataFrame containing *text_column* and one label
            column per aspect.
        test_df: Test DataFrame with the same columns.
        aspects: Iterable of label column names; one model is trained per name.
        text_column: Name of the text column. Defaults to ``'R_clean_text'``,
            the column this function originally hard-coded.

    Returns:
        Tuple ``(best_models, train_reports, test_reports)``, each a dict
        keyed by the lower-cased aspect name. ``best_models[aspect]`` holds
        the selected pipeline under ``'model'`` and the encoder fitted on the
        training labels under ``'label_encoder'``.

    Raises:
        KeyError: If *text_column* or an aspect column is missing.
        ValueError: If the test data contains a label that never occurs in
            the training data (raised by ``LabelEncoder.transform``).
    """
    best_models = {}
    train_reports = {}
    test_reports = {}

    # Parameter grid for the XGBoost classifier
    param_grid = {
        'clf__learning_rate': [0.01, 0.1, 0.2],
        'clf__max_depth': [3, 5, 7],
        'clf__n_estimators': [100, 200, 300],
        'clf__subsample': [0.8, 1.0]
    }

    # Filter whole rows *before* extracting labels so that texts and labels
    # stay aligned. The filter does not depend on the aspect, so it is done
    # once. The columns are read directly instead of through prepare_data(),
    # which is defined twice in this notebook with different text columns.
    train_rows = _drop_blank_texts(train_df, text_column)
    test_rows = _drop_blank_texts(test_df, text_column)
    train_texts = train_rows[text_column]
    test_texts = test_rows[text_column]

    # A fixed integer seed makes the splitter yield the same folds every time,
    # so one instance serves both the grid search and the follow-up CV.
    cv_strategy = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

    for aspect in aspects:
        print(f"\n=== Processing aspect: {aspect} ===")

        # Fit the encoder on the training labels only and reuse that mapping
        # for the test labels; refitting on the test set could assign
        # different integers to the same class.
        le = LabelEncoder()
        y_train_encoded = le.fit_transform(train_rows[aspect])
        y_test_encoded = le.transform(test_rows[aspect])

        # --- Step 1: Grid Search ---
        grid = GridSearchCV(_build_text_xgb_pipeline(), param_grid, cv=cv_strategy,
                            scoring='balanced_accuracy', n_jobs=-1)
        grid.fit(train_texts, y_train_encoded)
        best_params = grid.best_params_
        print(f"Best parameters for aspect '{aspect}': {best_params}")

        # --- Step 2: Build a new pipeline using the best parameters ---
        # Grid keys look like 'clf__max_depth'; strip the step prefix to get
        # the XGBClassifier keyword.
        clf_params = {key.split('__', 1)[1]: value for key, value in best_params.items()}
        best_pipeline = _build_text_xgb_pipeline(**clf_params)

        # --- Step 3: Perform 10-Fold Cross Validation with estimator return ---
        cv_results = cross_validate(best_pipeline, train_texts, y_train_encoded,
                                    cv=cv_strategy,
                                    scoring='balanced_accuracy', return_train_score=True,
                                    return_estimator=True, n_jobs=-1)
        avg_cv_score_test = cv_results['test_score'].mean()
        avg_cv_score_train = cv_results['train_score'].mean()
        # The scorer is balanced accuracy, so label it as such; it is not
        # comparable with the plain accuracy printed further down.
        print(f"Average 10-fold CV test balanced accuracy for aspect '{aspect}': {avg_cv_score_test:.4f}")
        print(f"Average 10-fold CV train balanced accuracy for aspect '{aspect}': {avg_cv_score_train:.4f}")

        # --- Step 4: Select the best estimator from the CV folds ---
        # NOTE(review): this estimator was fitted on 9 of the 10 folds and is
        # chosen by its own validation score; consider refitting on all
        # training rows (e.g. grid.best_estimator_) for the final model.
        best_index = np.argmax(cv_results['test_score'])
        best_cv_estimator = cv_results['estimator'][best_index]

        # Evaluate the selected estimator on the training data (optional)
        y_train_pred = best_cv_estimator.predict(train_texts)
        train_acc = accuracy_score(y_train_encoded, y_train_pred)
        print("Final Training Accuracy: {:.4f}".format(train_acc))
        train_report = classification_report(y_train_encoded, y_train_pred)
        print("Final Training Classification Report:\n", train_report)

        # --- Step 5: Evaluate on Test Data ---
        y_test_pred = best_cv_estimator.predict(test_texts)
        test_acc = accuracy_score(y_test_encoded, y_test_pred)
        print("Test Accuracy: {:.4f}".format(test_acc))
        test_report = classification_report(y_test_encoded, y_test_pred)
        print("Test Classification Report:\n", test_report)

        # Save results for the current aspect
        aspect_key = aspect.lower()
        best_models[aspect_key] = {
            "model": best_cv_estimator,
            "label_encoder": le
        }
        train_reports[aspect_key] = train_report
        test_reports[aspect_key] = test_report

    return best_models, train_reports, test_reports

In [None]:
# Re-run training with the train_df/test_df currently bound in the notebook
# (the romanised CSVs loaded in the preceding cell); this overwrites
# best_models, train_reports and test_reports from the earlier run.
best_models, train_reports, test_reports = train_final_models_grid_search(train_df,test_df, aspects)


=== Processing aspect: Acting ===


Parameters: { "use_label_encoder" } are not used.



Best parameters for aspect 'Acting': {'clf__learning_rate': 0.2, 'clf__max_depth': 5, 'clf__n_estimators': 300, 'clf__subsample': 1.0}
Average 10-fold CV test Accuracy for aspect 'Acting': 0.5611
Average 10-fold CV train Accuracy for aspect 'Acting': 0.8193
Final Training Accuracy: 0.9386
Final Training Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.60      0.75       131
           1       0.93      0.99      0.96      2962
           2       0.96      0.80      0.87       848

    accuracy                           0.94      3941
   macro avg       0.96      0.80      0.86      3941
weighted avg       0.94      0.94      0.94      3941

Test Accuracy: 0.8580
Test Classification Report:
               precision    recall  f1-score   support

           0       0.41      0.21      0.27        34
           1       0.88      0.96      0.92       746
           2       0.78      0.61      0.69       206

    accuracy         

Parameters: { "use_label_encoder" } are not used.



Best parameters for aspect 'direction': {'clf__learning_rate': 0.2, 'clf__max_depth': 7, 'clf__n_estimators': 300, 'clf__subsample': 1.0}
Average 10-fold CV test Accuracy for aspect 'direction': 0.5367
Average 10-fold CV train Accuracy for aspect 'direction': 0.8699
Final Training Accuracy: 0.9465
Final Training Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.76      0.85       568
           1       0.94      1.00      0.97      3128
           2       0.95      0.77      0.85       245

    accuracy                           0.95      3941
   macro avg       0.95      0.84      0.89      3941
weighted avg       0.95      0.95      0.94      3941

Test Accuracy: 0.8367
Test Classification Report:
               precision    recall  f1-score   support

           0       0.51      0.32      0.39       124
           1       0.88      0.95      0.91       804
           2       0.60      0.43      0.50        58

    accuracy

Parameters: { "use_label_encoder" } are not used.



Best parameters for aspect 'Music': {'clf__learning_rate': 0.2, 'clf__max_depth': 7, 'clf__n_estimators': 200, 'clf__subsample': 1.0}
Average 10-fold CV test Accuracy for aspect 'Music': 0.6561
Average 10-fold CV train Accuracy for aspect 'Music': 0.9186
Final Training Accuracy: 0.9838
Final Training Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.80      0.88        89
           1       0.99      1.00      0.99      3446
           2       0.97      0.89      0.93       406

    accuracy                           0.98      3941
   macro avg       0.98      0.90      0.93      3941
weighted avg       0.98      0.98      0.98      3941

Test Accuracy: 0.9604
Test Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.28      0.42        18
           1       0.98      0.99      0.98       867
           2       0.81      0.83      0.82       101

    accuracy            

Parameters: { "use_label_encoder" } are not used.



Best parameters for aspect 'ovr_sent': {'clf__learning_rate': 0.2, 'clf__max_depth': 7, 'clf__n_estimators': 300, 'clf__subsample': 1.0}
Average 10-fold CV test Accuracy for aspect 'ovr_sent': 0.5808
Average 10-fold CV train Accuracy for aspect 'ovr_sent': 0.8567
Final Training Accuracy: 0.8886
Final Training Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.89      0.89      1346
           1       0.93      0.64      0.76       567
           2       0.88      0.96      0.92      2028

    accuracy                           0.89      3941
   macro avg       0.90      0.83      0.86      3941
weighted avg       0.89      0.89      0.88      3941

Test Accuracy: 0.7079
Test Classification Report:
               precision    recall  f1-score   support

           0       0.72      0.68      0.70       348
           1       0.50      0.23      0.32       139
           2       0.73      0.86      0.79       499

    accuracy   