In [44]:
# ================================
# Customer Acquisition Pipeline (Auto ML)
# Works for Classification & Regression
# ================================

import pandas as pd
import numpy as np

# ML imports
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import (
    accuracy_score, f1_score, roc_auc_score, classification_report,
    mean_squared_error, mean_absolute_error, r2_score
)
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import (
    RandomForestClassifier, GradientBoostingClassifier,
    RandomForestRegressor, GradientBoostingRegressor
)
from xgboost import XGBClassifier, XGBRegressor



In [45]:
# -----------------------------
# 1. Load Dataset
# -----------------------------
df = pd.read_csv("customer_acquisition_data.csv")

# Name of the column to predict. Set this to whichever column of `df` should be
# the target, e.g. "revenue" or "channel".
target_col = "revenue"

# Separate the target (y) from the remaining feature columns (X)
y = df[target_col]
X = df.drop(target_col, axis=1)



In [46]:
# -----------------------------
# 2. Detect Problem Type (Classification vs Regression)
# -----------------------------
# Largest number of distinct target values that is still treated as a set of
# class labels rather than a continuous quantity.
MAX_CLASS_CARDINALITY = 20

# Always start from the raw target column so this cell can be re-run safely:
# for classification `y` is overwritten below with an encoded NumPy array,
# which no longer has the Series methods used for detection.
y_raw = df[target_col]

# Non-numeric targets (object, string, categorical, ...) are always classes;
# numeric targets count as classes only when they take few distinct values.
is_classification = bool(
    not pd.api.types.is_numeric_dtype(y_raw)
    or y_raw.nunique(dropna=False) <= MAX_CLASS_CARDINALITY
)

if is_classification:
    le = LabelEncoder()
    y = le.fit_transform(y_raw)  # Encode class labels as integers 0..n_classes-1
else:
    y = y_raw

print("Detected Task Type:", "Classification" if is_classification else "Regression")



Detected Task Type: Regression


In [47]:
# -----------------------------
# 3. Train/Test Split
# -----------------------------
# Stratify on the target for classification so both splits keep the same class
# proportions; a regression target is continuous, so it is split at random.
# NOTE: a stratified split raises ValueError if any class has fewer than two
# samples.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42,
    stratify=y if is_classification else None,
)

In [48]:
# -----------------------------
# 4. Handle Categorical & Numeric Features
# -----------------------------
# One-hot encode categorical features. The training split defines the dummy
# columns (the first level of each feature is dropped and acts as the baseline).
X_train = pd.get_dummies(X_train, drop_first=True)

# Encode the test split with *all* of its levels, then keep exactly the training
# columns. Applying drop_first to the test split independently can drop a
# different level than in training when a category is absent from the test
# rows, which silently turns those rows into the baseline category.
# Levels never seen in training are discarded; missing columns are filled with 0.
X_test = pd.get_dummies(X_test).reindex(columns=X_train.columns, fill_value=0)

# Standardize every column using statistics learned from the training split only.
# Scaling matters for the linear models; the tree-based ensembles are
# insensitive to it, so it is harmless there.
scaler = StandardScaler()
X_train = pd.DataFrame(
    scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index
)
X_test = pd.DataFrame(
    scaler.transform(X_test), columns=X_test.columns, index=X_test.index
)



In [49]:
# -----------------------------
# 5. Define Models
# -----------------------------
if is_classification:
    # Classification Models
    models = {
        "Logistic Regression": LogisticRegression(max_iter=1000),
        "Random Forest": RandomForestClassifier(n_estimators=200, random_state=42),
        "Gradient Boosting": GradientBoostingClassifier(random_state=42),
        # `use_label_encoder` is deprecated/removed in current XGBoost releases and
        # only triggers an "unused parameter" warning; the target is already
        # integer-encoded in step 2. No eval_metric is forced either: XGBoost
        # picks a default that matches the objective (binary vs. multi-class).
        "XGBoost": XGBClassifier(random_state=42)
    }
else:
    # Regression Models
    models = {
        "Linear Regression": LinearRegression(),
        "Random Forest": RandomForestRegressor(n_estimators=200, random_state=42),
        "Gradient Boosting": GradientBoostingRegressor(random_state=42),
        "XGBoost": XGBRegressor(random_state=42)
    }

results = {}       # Metrics of every model, keyed by model name
best_model = None  # (name, fitted estimator) of the best performing model
# Classification maximizes F1 (start at -inf); regression minimizes RMSE (start at +inf)
best_score = -np.inf if is_classification else np.inf



In [50]:
# -----------------------------
# 6. Train & Evaluate Models
# -----------------------------
# Reset the selection state here so re-running this cell on its own starts from
# a clean slate instead of comparing against a previous run.
results = {}
best_model = None
best_score = -np.inf if is_classification else np.inf

for name, model in models.items():
    model.fit(X_train, y_train)        # Train model
    y_pred = model.predict(X_test)     # Predict on test set

    if is_classification:
        acc = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average="weighted")

        # ROC-AUC needs class probabilities; NaN marks "not available" (a literal
        # 0 would read as a real, worst-possible score).
        roc = float("nan")
        if hasattr(model, "predict_proba"):
            y_prob = model.predict_proba(X_test)
            # Binary targets: roc_auc_score expects a 1-D score for the positive
            # class, not the (n_samples, 2) probability matrix.
            # Multi-class targets: pass the full matrix and use one-vs-rest.
            y_score = y_prob[:, 1] if y_prob.shape[1] == 2 else y_prob
            roc = float(roc_auc_score(y_test, y_score, multi_class="ovr"))

        results[name] = {"Accuracy": float(acc), "F1-score": float(f1), "ROC-AUC": roc}

        # Choose best model by highest F1-score
        if f1 > best_score:
            best_score = f1
            best_model = (name, model)

    else:  # Regression
        rmse = float(np.sqrt(mean_squared_error(y_test, y_pred)))
        mae = float(mean_absolute_error(y_test, y_pred))
        r2 = float(r2_score(y_test, y_pred))

        # Plain floats keep the printed comparison free of np.float64(...) wrappers
        results[name] = {"RMSE": rmse, "MAE": mae, "R2": r2}

        # Choose best model by lowest RMSE
        if rmse < best_score:
            best_score = rmse
            best_model = (name, model)

In [51]:
# -----------------------------
# 7. Display Results
# -----------------------------
# One line per model with its metric dictionary
print("📊 Model Comparison:")
for model_name, model_metrics in results.items():
    print(f"{model_name}: {model_metrics}")

# best_model is the (name, fitted estimator) pair chosen in the training loop
best_name, best_estimator = best_model
print("\n✅ Best Model Selected:", best_name)

if not is_classification:
    # Recompute the regression metrics of the winner on the test split
    print("\nRegression Metrics for Best Model:")
    y_pred = best_estimator.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    print("RMSE:", np.sqrt(mse))
    print("MAE:", mean_absolute_error(y_test, y_pred))
    print("R2:", r2_score(y_test, y_pred))
else:
    # Per-class precision / recall / F1 of the winner on the test split
    print("\nClassification Report for Best Model:")
    best_predictions = best_estimator.predict(X_test)
    print(classification_report(y_test, best_predictions))



📊 Model Comparison:
Linear Regression: {'RMSE': np.float64(1298.6357537881174), 'MAE': 1102.9093163273144, 'R2': -0.02327369688770098}
Random Forest: {'RMSE': np.float64(1429.2851797344144), 'MAE': 1154.07075, 'R2': -0.2395238302243281}
Gradient Boosting: {'RMSE': np.float64(1331.2187430787224), 'MAE': 1115.531732159297, 'R2': -0.07526608261716006}
XGBoost: {'RMSE': np.float64(1463.395196110743), 'MAE': 1183.6019287109375, 'R2': -0.2993924617767334}

✅ Best Model Selected: Linear Regression

Regression Metrics for Best Model:
RMSE: 1298.6357537881174
MAE: 1102.9093163273144
R2: -0.02327369688770098


In [52]:
# -----------------------------
# 8. Test Best Model with Sample Data
# -----------------------------
# Pick up to 5 random rows from the test set. Sampling without replacement
# cannot exceed the population size, so cap the count for tiny test splits.
sample = X_test.sample(min(5, len(X_test)), random_state=42)

# Get predictions from the selected (name, estimator) pair
predictions = best_model[1].predict(sample)

# Print input + prediction together.
# Note: X_test was standardized in step 4, so the feature values shown here are
# scaled values, not the raw values from the CSV.
print("\n🔮 Sample Predictions (Best Model):")
sample_with_pred = sample.assign(Predicted_Output=predictions)
print(sample_with_pred)



🔮 Sample Predictions (Best Model):
     customer_id      cost  conversion_rate  channel_paid advertising  \
204    -0.841490 -0.351088         1.351770                 -0.555692   
208    -0.824022 -0.475826         0.606846                 -0.555692   
532     0.590938 -0.788592        -0.722581                 -0.555692   
368    -0.125276 -0.351088         1.351770                 -0.555692   
628     1.010186 -0.788592        -0.722581                 -0.555692   

     channel_referral  channel_social media  Predicted_Output  
204         -0.584567              1.791794       2601.832593  
208          1.710667             -0.558100       2706.975115  
532         -0.584567             -0.558100       2917.576875  
368         -0.584567              1.791794       2651.707530  
628         -0.584567             -0.558100       2946.771960  


In [53]:
# Bare expression as the last line of the cell: the notebook renders the same
# frame that was printed above as a table instead of plain text.
sample_with_pred

Unnamed: 0,customer_id,cost,conversion_rate,channel_paid advertising,channel_referral,channel_social media,Predicted_Output
204,-0.84149,-0.351088,1.35177,-0.555692,-0.584567,1.791794,2601.832593
208,-0.824022,-0.475826,0.606846,-0.555692,1.710667,-0.5581,2706.975115
532,0.590938,-0.788592,-0.722581,-0.555692,-0.584567,-0.5581,2917.576875
368,-0.125276,-0.351088,1.35177,-0.555692,-0.584567,1.791794,2651.70753
628,1.010186,-0.788592,-0.722581,-0.555692,-0.584567,-0.5581,2946.77196
