# 1. Imports and Data Loading
Import all necessary libraries and load the CSV into a DataFrame.



In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

df = pd.read_csv('AHMAD ZAKARIYA - ML_Internship_dataset_One.csv')
df.head()


Unnamed: 0,Legacy_Customer_ID,Age,Employment_Type,Education_Level,Annual_Income($),Credit_Score,Region,Account_Type,Customer_Feedback,CLV_Score,Complaint_Count,Contact_Preference,Subscription_Tier,Target
0,,35.0,Self-Employed,Master,8238.15,,Southwest,Checking,Excellent customer service experience,17.1,0,Text,Basic,0
1,,35.0,Freelancer,High School,5327.59,827.015439,Midwest,Checking,Loyalty program privileges are profitable,8.6,0,Phone,Basic,1
2,,28.0,Self-Employed,Associate,78780.2,803.283954,West Coast,Checking,Return process was difficult,30.2,1,Email,Basic,1
3,,50.0,Salaried,Bachelor,287709.86,756.546772,Midwest,Savings,Delivery was deferred by 3 days,29.1,0,Email,Enterprise,0
4,,42.0,Salaried,Associate,59381.4,702.846203,Midwest,Savings,Highly recommend this service to colleagues,48.7,0,Email,Premium,0


# 2. Define Column Lists
Specify which columns are IDs, text, target, numeric features, and categorical features.


In [3]:
id_col = 'Legacy_Customer_ID'
text_col = 'Customer_Feedback'
target_col = 'Target'

numeric_cols = [
    'Age',
    'Annual_Income($)',
    'Credit_Score',
    'CLV_Score',
    'Complaint_Count'
]
categorical_cols = [
    'Employment_Type',
    'Education_Level',
    'Region',
    'Account_Type',
    'Contact_Preference',
    'Subscription_Tier'
]


# 3. Raw Data Model
- Drop any row with missing values  
- Ordinal-encode categoricals  
- Train/Test split  
- Train a Random Forest and predict  


In [4]:
raw_df = df.dropna(subset=numeric_cols + categorical_cols + [target_col])
X_raw = raw_df.drop(columns=[id_col, text_col, target_col])
y_raw = raw_df[target_col]

ord_enc = OrdinalEncoder()
X_raw[categorical_cols] = ord_enc.fit_transform(X_raw[categorical_cols])

X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(
    X_raw, y_raw,
    test_size=0.3,
    stratify=y_raw,
    random_state=42
)


clf_raw = RandomForestClassifier(random_state=42)
clf_raw.fit(X_train_r, y_train_r)

y_pred_r = clf_raw.predict(X_test_r)
y_prob_r = clf_raw.predict_proba(X_test_r)[:, 1]


# 4. Preprocessed Data Model
- Impute numeric with median + standard scale  
- Impute categorical with most-frequent + one-hot encode  
- Pipeline + Random Forest  


In [5]:
X = df.drop(columns=[id_col, text_col, target_col])
y = df[target_col]

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    stratify=y,
    random_state=42
)

numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_cols),
    ('cat', categorical_transformer, categorical_cols)
])

clf_pre = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

clf_pre.fit(X_train, y_train)
y_pred_p = clf_pre.predict(X_test)
y_prob_p = clf_pre.predict_proba(X_test)[:, 1]


# 5. Compute and Display Metrics
Define a helper to compute all key metrics, then compare both models.


In [6]:
def compute_metrics(y_true, y_pred, y_prob):
    return {
        'Accuracy': accuracy_score(y_true, y_pred),
        'Precision': precision_score(y_true, y_pred, zero_division=0),
        'Recall': recall_score(y_true, y_pred, zero_division=0),
        'F1-score': f1_score(y_true, y_pred, zero_division=0),
        'ROC-AUC': roc_auc_score(y_true, y_prob)
    }

metrics_raw = compute_metrics(y_test_r, y_pred_r, y_prob_r)
metrics_pre = compute_metrics(y_test, y_pred_p, y_prob_p)

results = pd.DataFrame(
    [metrics_raw, metrics_pre],
    index=['Raw Data Model', 'Preprocessed Model']
)
print(results)


                    Accuracy  Precision    Recall  F1-score   ROC-AUC
Raw Data Model      0.595294   0.484848  0.188235  0.271186  0.521396
Preprocessed Model  0.574000   0.419087  0.168333  0.240190  0.517640


# END