# ML Evaluation: Raw vs Preprocessed Dataset (Train:Val:Test Splits)

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

In [2]:
# Load the CSV and discard the legacy identifier column in one step;
# `df` therefore never contains 'Legacy_Customer_ID'.
file_path = "AHMAD ZAKARIYA - ml_preprocessing_dataset_1000.csv"
df = pd.read_csv(file_path).drop(columns=['Legacy_Customer_ID'])

# Label vector first, then every remaining column as the feature matrix.
y = df['Target']
X = df.drop(columns='Target')


In [4]:
# 70 / 15 / 15 split: hold out 30% of the rows, then cut that hold-out in
# half to obtain validation and test sets. Both steps are stratified on the
# label and use the same seed.
X_train_70, X_temp_70, y_train_70, y_temp_70 = train_test_split(
    X, y, test_size=0.30, random_state=42, stratify=y
)
X_val_70, X_test_70, y_val_70, y_test_70 = train_test_split(
    X_temp_70, y_temp_70, test_size=0.50, random_state=42, stratify=y_temp_70
)

# 80 / 10 / 10 split: same procedure with a 20% hold-out.
X_train_80, X_temp_80, y_train_80, y_temp_80 = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)
X_val_80, X_test_80, y_val_80, y_test_80 = train_test_split(
    X_temp_80, y_temp_80, test_size=0.50, random_state=42, stratify=y_temp_80
)


In [5]:
def _drop_incomplete_rows(X, y, split_name):
    """Keep only the rows of ``(X, y)`` with no missing feature or label.

    Raises ``ValueError`` when nothing is left, so the failure names the
    offending split instead of surfacing deep inside scikit-learn.
    """
    complete = X.notna().all(axis=1) & y.notna()
    X_complete, y_complete = X.loc[complete], y.loc[complete]
    if X_complete.empty:
        raise ValueError(
            f"No complete rows left in the {split_name} split after dropping missing values"
        )
    return X_complete, y_complete


def _score_split(model, X, y):
    """Predict once on ``X`` and return ``(accuracy, f1)`` against ``y``."""
    predictions = model.predict(X)
    return accuracy_score(y, predictions), f1_score(y, predictions)


def train_raw(X_train, X_val, X_test, y_train, y_val, y_test):
    """Fit the "raw" baseline and score it on the validation and test splits.

    The only data handling is: rows containing any missing value are removed
    from each split, and every object-dtype column is one-hot encoded (all
    other columns are passed through unchanged). A ``RandomForestClassifier``
    with ``random_state=42`` is then fitted on the remaining training rows.

    Caveat: incomplete rows are removed from the validation and test splits
    too, so these scores are computed on a subset of the rows and are not
    evaluated on exactly the same samples as ``train_preprocessed``, which
    keeps every row.

    Parameters
    ----------
    X_train, X_val, X_test : pandas.DataFrame
        Feature frames for the three splits.
    y_train, y_val, y_test : pandas.Series
        Labels sharing the index of the matching feature frame. ``f1_score``
        is called with its defaults, so a binary target is expected.

    Returns
    -------
    dict
        ``{'val_acc', 'val_f1', 'test_acc', 'test_f1'}`` mapped to floats.

    Raises
    ------
    ValueError
        If a split has no complete rows left after dropping missing values.
    """
    X_train_clean, y_train_clean = _drop_incomplete_rows(X_train, y_train, 'train')
    X_val_clean, y_val_clean = _drop_incomplete_rows(X_val, y_val, 'validation')
    X_test_clean, y_test_clean = _drop_incomplete_rows(X_test, y_test, 'test')

    categorical_features = X_train_clean.select_dtypes(include='object').columns.tolist()

    # handle_unknown='ignore' lets categories unseen during fit encode as all
    # zeros at predict time instead of raising.
    preprocessor = ColumnTransformer([
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ], remainder='passthrough')

    clf = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(random_state=42))
    ])

    clf.fit(X_train_clean, y_train_clean)

    # Each split is predicted exactly once and both metrics reuse the result.
    val_acc, val_f1 = _score_split(clf, X_val_clean, y_val_clean)
    test_acc, test_f1 = _score_split(clf, X_test_clean, y_test_clean)

    return {
        'val_acc': val_acc,
        'val_f1': val_f1,
        'test_acc': test_acc,
        'test_f1': test_f1
    }

# Functions for training with preprocessing (pipeline builder + train/evaluate wrapper)
def build_pipeline(X):
    """Assemble an unfitted preprocessing + random-forest pipeline for ``X``.

    Column roles are taken from the dtypes of ``X``:

    * ``int64`` / ``float64`` columns are mean-imputed, then standardised;
    * ``object`` columns, except ``'Customer_Feedback'``, are imputed with the
      most frequent value, then one-hot encoded (unknown categories ignored).

    Columns in neither group are not listed in the ``ColumnTransformer`` and
    so fall under its default ``remainder`` handling.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Steps ``'preprocessor'`` and ``'classifier'``; the classifier is a
        ``RandomForestClassifier(random_state=42)``.
    """
    num_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
    cat_cols = X.select_dtypes(include='object').columns.tolist()

    # The free-text feedback column is never one-hot encoded.
    try:
        cat_cols.remove('Customer_Feedback')
    except ValueError:
        pass

    impute_then_scale = make_pipeline(SimpleImputer(strategy='mean'), StandardScaler())
    impute_then_encode = make_pipeline(
        SimpleImputer(strategy='most_frequent'),
        OneHotEncoder(handle_unknown='ignore'),
    )

    column_steps = [
        ('num', impute_then_scale, num_cols),
        ('cat', impute_then_encode, cat_cols),
    ]

    model_steps = [
        ('preprocessor', ColumnTransformer(column_steps)),
        ('classifier', RandomForestClassifier(random_state=42)),
    ]
    return Pipeline(model_steps)

def train_preprocessed(X_train, X_val, X_test, y_train, y_val, y_test):
    """Fit the preprocessing pipeline and score it on validation and test.

    The ``'Customer_Feedback'`` column is removed from every split when it is
    present. The pipeline returned by ``build_pipeline`` is then fitted on the
    training split. No rows are dropped here; the frames are handed to the
    pipeline as they are.

    Parameters
    ----------
    X_train, X_val, X_test : pandas.DataFrame
        Feature frames for the three splits.
    y_train, y_val, y_test : array-like
        Labels for the matching feature frame. ``f1_score`` is called with
        its defaults, so a binary target is expected.

    Returns
    -------
    dict
        ``{'val_acc', 'val_f1', 'test_acc', 'test_f1'}`` mapped to floats.
    """
    # errors='ignore' keeps this a no-op for frames without the column.
    X_train, X_val, X_test = (
        frame.drop(columns=['Customer_Feedback'], errors='ignore')
        for frame in (X_train, X_val, X_test)
    )

    clf = build_pipeline(X_train)
    clf.fit(X_train, y_train)

    # Predict each split once and reuse the result for both metrics.
    val_pred = clf.predict(X_val)
    test_pred = clf.predict(X_test)

    return {
        'val_acc': accuracy_score(y_val, val_pred),
        'val_f1': f1_score(y_val, val_pred),
        'test_acc': accuracy_score(y_test, test_pred),
        'test_f1': f1_score(y_test, test_pred)
    }


# Run evaluation and print results

In [6]:
# Bundle each split's six arrays once so every trainer receives identical inputs.
split_70 = (X_train_70, X_val_70, X_test_70, y_train_70, y_val_70, y_test_70)
split_80 = (X_train_80, X_val_80, X_test_80, y_train_80, y_val_80, y_test_80)

results_raw_70 = train_raw(*split_70)
results_raw_80 = train_raw(*split_80)
results_prep_70 = train_preprocessed(*split_70)
results_prep_80 = train_preprocessed(*split_80)

# One summary line per experiment, in the order the experiments were run.
for summary_label, summary_scores in (
    ("Raw 70:30 ->", results_raw_70),
    ("Raw 80:20 ->", results_raw_80),
    ("Preprocessed 70:30 ->", results_prep_70),
    ("Preprocessed 80:20 ->", results_prep_80),
):
    print(summary_label, summary_scores)


Raw 70:30 -> {'val_acc': 0.5887096774193549, 'val_f1': 0.2153846153846154, 'test_acc': 0.5746268656716418, 'test_f1': 0.17391304347826086}
Raw 80:20 -> {'val_acc': 0.5529411764705883, 'val_f1': 0.13636363636363635, 'test_acc': 0.6022727272727273, 'test_f1': 0.3137254901960784}
Preprocessed 70:30 -> {'val_acc': 0.5466666666666666, 'val_f1': 0.24444444444444444, 'test_acc': 0.64, 'test_f1': 0.34146341463414637}
Preprocessed 80:20 -> {'val_acc': 0.58, 'val_f1': 0.3, 'test_acc': 0.59, 'test_f1': 0.2545454545454545}
