In [9]:
pip install lime




In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import StackingClassifier,GradientBoostingClassifier,RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from lime.lime_text import LimeTextExplainer
import numpy as np



A1. Implement stacking classifier or regressor (depending on your project problem). The base
models should be the list of classifiers / regressors already implemented. Experiment with various
metamodels (final_estimator). Use above references 1 & 2.

In [11]:
#Load the dataset
df = pd.read_excel('/content/bert_features_output.xlsx')

In [None]:

#Function to define base classifiers
def get_base_classifiers():
    return [
        ('KNN', KNeighborsClassifier()),
        ('DT', DecisionTreeClassifier(random_state=42)),
        ('RF', RandomForestClassifier(n_estimators=100, random_state=42))
    ]


#Function to define various meta models
def get_meta_models():
    return {
        "LR": LogisticRegression(max_iter=1000),
        "SVC": SVC(probability=True),
        "Gradient Boosting": GradientBoostingClassifier(),
        "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
    }


#function for evaluting model
def evaluate_model(y_true, y_pred):
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print(classification_report(y_true, y_pred))


#trainning the function using stack classifier
def train_and_evaluate_stack_models(X_train, X_test, y_train, y_test, base_classifiers, meta_models):
    for name, meta_model in meta_models.items():
        print(f"\n=== Meta Model: {name} ===")

        stack_model = StackingClassifier(
            estimators=base_classifiers,
            final_estimator=meta_model,
            cv=5,
            passthrough=True
        )

        # Train
        stack_model.fit(X_train, y_train)

        # Predict
        y_pred = stack_model.predict(X_test)

        # Evaluate
        evaluate_model(y_test, y_pred)

#taking the feautures
X = df.drop(['label','text_'], axis=1)
y = df['label'] #target feauture

#splitting the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Get models
base_classifiers = get_base_classifiers()
meta_models = get_meta_models()

# Train and evaluate
train_and_evaluate_stack_models(X_train, X_test, y_train, y_test, base_classifiers, meta_models)





=== Meta Model: LR ===
Accuracy: 0.808
              precision    recall  f1-score   support

           0       0.81      0.81      0.81      1008
           1       0.80      0.81      0.81       992

    accuracy                           0.81      2000
   macro avg       0.81      0.81      0.81      2000
weighted avg       0.81      0.81      0.81      2000


=== Meta Model: SVC ===
Accuracy: 0.8045
              precision    recall  f1-score   support

           0       0.80      0.82      0.81      1008
           1       0.81      0.79      0.80       992

    accuracy                           0.80      2000
   macro avg       0.80      0.80      0.80      2000
weighted avg       0.80      0.80      0.80      2000


=== Meta Model: Gradient Boosting ===
Accuracy: 0.771
              precision    recall  f1-score   support

           0       0.77      0.78      0.77      1008
           1       0.77      0.77      0.77       992

    accuracy                           0.77  

Parameters: { "use_label_encoder" } are not used.



Accuracy: 0.7685
              precision    recall  f1-score   support

           0       0.76      0.78      0.77      1008
           1       0.77      0.75      0.76       992

    accuracy                           0.77      2000
   macro avg       0.77      0.77      0.77      2000
weighted avg       0.77      0.77      0.77      2000



A2. Implement pipeline to allow multiple steps of data processing and classification to be executed
simultaneously. Use reference 3 above for pipeline construction and execution.

In [12]:
#Function to define base classifiers
def get_base_classifiers():
    return [
        ('KNN', KNeighborsClassifier()),
        ('DT', DecisionTreeClassifier(random_state=42)),
        ('RF', RandomForestClassifier(n_estimators=100, random_state=42))
    ]


#Function to define various meta models
def get_meta_models():
    return {
        "LR": LogisticRegression(max_iter=1000),
        "SVC": SVC(probability=True),
        "Gradient Boosting": GradientBoostingClassifier(),
        "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
    }

#Construct a pipeline with preprocessing and stacking classifier
def build_pipeline(base_classifiers, meta_model):
    stack_clf = StackingClassifier(
        estimators=base_classifiers,
        final_estimator=meta_model,
        cv=5,
        passthrough=True
    )

    pipeline = Pipeline([
        ('scaler', StandardScaler()),      # normalize data
        ('classifier', stack_clf)          # Final estimator
    ])
    return pipeline


#trainning and evaluating pipeline
def train_and_evaluate_pipeline(pipeline, X_train, X_test, y_train, y_test, model_name):

    print(f"\n=== Pipeline with Meta Model: {model_name} ===")
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    print("Accuracy:", accuracy_score(y_test, y_pred))
    print(classification_report(y_test, y_pred))

#taking the feautures
X = df.drop(['label','text_'], axis=1)
y = df['label'] #target feauture

#splitting the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Get models
base_classifiers = get_base_classifiers()
meta_models = get_meta_models()

# Run pipelines with different meta models
for name, meta_model in meta_models.items():
    pipeline = build_pipeline(base_classifiers, meta_model)
    train_and_evaluate_pipeline(pipeline, X_train, X_test, y_train, y_test, name)



=== Pipeline with Meta Model: LR ===
Accuracy: 0.811
              precision    recall  f1-score   support

           0       0.82      0.81      0.81      1008
           1       0.81      0.81      0.81       992

    accuracy                           0.81      2000
   macro avg       0.81      0.81      0.81      2000
weighted avg       0.81      0.81      0.81      2000


=== Pipeline with Meta Model: SVC ===
Accuracy: 0.818
              precision    recall  f1-score   support

           0       0.82      0.83      0.82      1008
           1       0.82      0.81      0.82       992

    accuracy                           0.82      2000
   macro avg       0.82      0.82      0.82      2000
weighted avg       0.82      0.82      0.82      2000


=== Pipeline with Meta Model: Gradient Boosting ===
Accuracy: 0.7765
              precision    recall  f1-score   support

           0       0.78      0.78      0.78      1008
           1       0.77      0.78      0.78       992

   

Parameters: { "use_label_encoder" } are not used.



Accuracy: 0.7705
              precision    recall  f1-score   support

           0       0.77      0.77      0.77      1008
           1       0.77      0.77      0.77       992

    accuracy                           0.77      2000
   macro avg       0.77      0.77      0.77      2000
weighted avg       0.77      0.77      0.77      2000



A3. Using LIME explainer, explain the outcomes of pipeline.

In [16]:
def explain_with_lime_real_text(pipeline, df, X, text_column='text_', index=0):
    text_sample = df.iloc[index][text_column]
    label_sample = df.iloc[index]['label']

    print(f"\nExplaining sample #{index}")
    print("Text:", text_sample)
    print("True label:", label_sample)

    class_names = [0, 1]

    explainer = LimeTextExplainer(class_names=class_names)

    # LIME expects a function that takes raw text → proba
    def predict_fn(texts):
        # Map input texts to their corresponding feature vectors
        # For demo: Assume index order of df['text_'] matches order in X
        temp_features = []
        for text in texts:
            idx = df[df[text_column] == text].index
            if len(idx) == 0:
                temp_features.append(np.zeros((X.shape[1],)))  # fallback
            else:
                temp_features.append(X.loc[idx[0]].values)
        return pipeline.predict_proba(np.array(temp_features))

    exp = explainer.explain_instance(
        text_sample,
        predict_fn,
        num_features=15,
        top_labels=1
    )

    exp.save_to_file('lime_explanation.html')
    print(" LIME explanation saved to lime_explanation.html")


pipeline = build_pipeline(base_classifiers, meta_models["Gradient Boosting"])
train_and_evaluate_pipeline(pipeline, X_train, X_test, y_train, y_test, "Gradient Boosting")

# Now explain a sample
explain_with_lime_real_text(pipeline, df, X, index=5)  # You can change the index



=== Pipeline with Meta Model: Gradient Boosting ===
Accuracy: 0.776
              precision    recall  f1-score   support

           0       0.78      0.77      0.78      1008
           1       0.77      0.78      0.77       992

    accuracy                           0.78      2000
   macro avg       0.78      0.78      0.78      2000
weighted avg       0.78      0.78      0.78      2000


Explaining sample #5
Text: wanted different flavors
True label: 0




 LIME explanation saved to lime_explanation.html
