CTR PREDICTOR - AROHI BHATNAGAR

In [1]:
# Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
import joblib

In [2]:
# Load the dataset
df=pd.read_csv('Dataset.csv')
print(f"Loaded dataset with {df.shape[0]} rows and {df.shape[1]} columns.")

# Preview
df.head()

Loaded dataset with 72612 rows and 35 columns.


Unnamed: 0,campaign_item_id,no_of_days,time,ext_service_id,ext_service_name,creative_id,creative_width,creative_height,search_tags,template_id,...,exchange_rate,media_cost_usd,position_in_content,unique_reach,total_reach,search_tag_cat,cmi_currency_code,timezone,weekday_cat,keywords
0,2733,7,2022-05-01,128,Facebook Ads,1000,300.0,250.0,#The Power of X,90.0,...,1,14.058514,,,,Others,SGD,Asia/Singapore,week_end,delicate bracelets
1,2733,8,2022-05-02,16,DV360,1000,300.0,250.0,#Be Bold. Be X,90.0,...,1,99.633496,,,,Others,SGD,Asia/Singapore,week_day,summer jewelry
2,2733,9,2022-05-03,128,Facebook Ads,1000,300.0,250.0,#Embrace Your Individuality with X,90.0,...,1,109.419677,,,,Others,SGD,Asia/Singapore,week_day,artisan jewelry
3,2733,10,2022-05-04,128,Facebook Ads,1000,300.0,250.0,#Be Bold. Be X,90.0,...,1,115.209499,,,,Others,SGD,Asia/Singapore,week_day,layered bracelets
4,2733,11,2022-05-05,4,Google Ads,1000,300.0,250.0,#Be Bold. Be X,90.0,...,1,66.990104,,,,Others,SGD,Asia/Singapore,week_day,minimalist jewelry


In [3]:
# Check column and choose correct target
print(df.columns.tolist())

['campaign_item_id', 'no_of_days', 'time', 'ext_service_id', 'ext_service_name', 'creative_id', 'creative_width', 'creative_height', 'search_tags', 'template_id', 'landing_page', 'advertiser_id', 'advertiser_name', 'network_id', 'approved_budget', 'advertiser_currency', 'channel_id', 'channel_name', 'max_bid_cpm', 'network_margin', 'campaign_budget_usd', 'impressions', 'clicks', 'stats_currency', 'currency_code', 'exchange_rate', 'media_cost_usd', 'position_in_content', 'unique_reach', 'total_reach', 'search_tag_cat', 'cmi_currency_code', 'timezone', 'weekday_cat', 'keywords']


In [4]:
# We know by kaggle data code definition that 'clicks' is the target variable

TARGET_COL = 'clicks'
if TARGET_COL not in df.columns:
    raise ValueError(f"Target column '{TARGET_COL}' not found in dataset columns.")
else:
    print(f"Target column '{TARGET_COL}' found.")

Target column 'clicks' found.


In [5]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score
)
import numpy as np

# --- Preprocess and Train the Model ---

# Drop rows where target is missing
df = df.dropna(subset=[TARGET_COL])

y = df[TARGET_COL]
X = df.drop(columns=[TARGET_COL])

# Identify column types
num_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
cat_cols = [c for c in X.columns if c not in num_cols]

print(f"Numeric Features: {len(num_cols)}, Categorical Features: {len(cat_cols)}")

# Remove numeric columns with all NaN
num_cols = [c for c in num_cols if not X[c].isna().all()]

# Numeric transformer
num_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Low-cardinality categoricals only
low_card = [c for c in cat_cols if df[c].nunique() <= 10]

cat_low = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=True))
])

# Preprocessor
preprocessor = ColumnTransformer([
    ('num', num_transformer, num_cols),
    ('cat_low', cat_low, low_card),
])

# Model
clf = RandomForestClassifier(
    n_estimators=50,
    max_depth=15,
    random_state=42,
    class_weight='balanced'
)

# Pipeline
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('model', clf)
])

# Train/Test Split (no stratify)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train the model
pipe.fit(X_train, y_train)

# Predict
y_pred = pipe.predict(X_test)

# Evaluate
metrics = {
    'Accuracy': accuracy_score(y_test, y_pred),
    'Precision': precision_score(y_test, y_pred, average='weighted'),
    'Recall': recall_score(y_test, y_pred, average='weighted'),
    'F1 Score': f1_score(y_test, y_pred, average='weighted')
}

print("\n✅ Evaluation Metrics Completed:")
for k, v in metrics.items():
    print(f"{k}: {v:.4f}")


Numeric Features: 20, Categorical Features: 14

✅ Evaluation Metrics Completed:
Accuracy: 0.0476
Precision: 0.0759
Recall: 0.0476
F1 Score: 0.0527


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [6]:
import joblib
import json
from pathlib import Path

output_dir = Path('model_output')
output_dir.mkdir(parents=True, exist_ok=True)

# Save the model with compression (level 3 is typically a good balance)
joblib.dump(pipe, output_dir / 'ctr_model.pkl', compress=3)

# Feature Info
feature_info = []
for c in X.columns:
    info = {"name": c}
    if c in num_cols:
        info["type"] = "numeric"
    else:
        info["type"] = "categorical"
        info["sample_values"] = list(map(str, X[c].dropna().unique()[:20]))
    feature_info.append(info)

# Save feature information as JSON
with open(output_dir / 'feature_info.json', 'w') as f:
    json.dump(feature_info, f, indent=4)

print("Model and metadata saved with compression.")


Model and metadata saved with compression.
