# Advertisement Click Prediction Modeling

This notebook focuses on developing and evaluating machine learning models to predict advertisement clicks based on the provided features.

## Table of Contents
1. [Data Loading and Preparation](#Data-Loading-and-Preparation)
2. [Model Development](#Model-Development)
3. [Model Evaluation](#Model-Evaluation)
4. [Hyperparameter Optimization](#Hyperparameter-Optimization)
5. [Final Model and Predictions](#Final-Model-and-Predictions)


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


## Data Loading and Preparation

In [None]:
# Read the imputed training set from disk.
df = pd.read_csv('data_train_imputed.csv')

# The 'target' column is the label; every other column is a feature.
y = df['target']
X = df.drop(columns='target')

# Hold out 20% of the rows for testing. Stratifying on the label keeps the
# class proportions the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Learn the scaling statistics from the training partition only, then apply
# that same fitted transformation to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Oversample with SMOTE on the (scaled) training partition only; the test
# partition is never resampled.
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train_scaled, y_train)


## Model Development

In [None]:
# Candidate classifiers keyed by display name. Each uses its library default
# hyperparameters apart from a fixed seed.
models = {}
models['Logistic Regression'] = LogisticRegression(random_state=42)
models['Random Forest'] = RandomForestClassifier(random_state=42)
models['Gradient Boosting'] = GradientBoostingClassifier(random_state=42)
models['XGBoost'] = XGBClassifier(random_state=42)

# Fit every candidate on the SMOTE-balanced training data and score it on the
# scaled, un-resampled test partition.
results = {}
for name, model in models.items():
    model.fit(X_train_balanced, y_train_balanced)

    # Hard labels feed the text report; column 1 of predict_proba (the second
    # class) feeds ROC AUC.
    y_pred = model.predict(X_test_scaled)
    second_class_proba = model.predict_proba(X_test_scaled)[:, 1]

    test_accuracy = model.score(X_test_scaled, y_test)
    test_roc_auc = roc_auc_score(y_test, second_class_proba)
    text_report = classification_report(y_test, y_pred)

    results[name] = {
        'accuracy': test_accuracy,
        'roc_auc': test_roc_auc,
        'classification_report': text_report,
    }


## Model Evaluation

In [None]:
# Collect the headline metrics into a table: one row per model, one column
# per metric (the dict-of-dicts is keyed by model, hence the transpose).
performance_df = pd.DataFrame({
    name: {
        'Accuracy': results[name]['accuracy'],
        'ROC AUC': results[name]['roc_auc']
    }
    for name in models.keys()
}).T

# Plot results.
# DataFrame.plot opens its own figure unless it is handed an Axes, so a bare
# plt.figure(figsize=...) beforehand would leave an empty figure behind and
# the requested size would not apply to the chart. Create the Axes explicitly
# and draw onto it instead.
fig, ax = plt.subplots(figsize=(10, 6))
performance_df.plot(kind='bar', ax=ax, rot=45)
ax.set_title('Model Performance Comparison')
ax.set_ylabel('Score')
fig.tight_layout()


## Hyperparameter Optimization

In [None]:
# Search space for the model being tuned (Random Forest is used as the
# example here): 3 * 4 * 3 * 3 = 108 combinations, i.e. 540 fits at cv=5.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[10, 20, 30, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Exhaustive 5-fold grid search ranked by ROC AUC, using all available cores.
# NOTE(review): the folds are cut from data that was already oversampled with
# SMOTE, so synthetic points can sit in a validation fold next to the real
# points they were interpolated from. The reported CV score is therefore
# likely optimistic -- confirm against the held-out test set.
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train_balanced, y_train_balanced)

print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)


## Final Model and Predictions

In [None]:
# Train the final model on the balanced training data using the best
# hyperparameters found by the grid search.
final_model = RandomForestClassifier(**grid_search.best_params_, random_state=42)
final_model.fit(X_train_balanced, y_train_balanced)

# Predict on the scaled test partition: hard labels for the report, and
# column 1 of predict_proba (the second class) for ROC AUC.
y_pred_final = final_model.predict(X_test_scaled)
y_pred_proba_final = final_model.predict_proba(X_test_scaled)[:, 1]

# Print final results
print("Classification Report:")
print(classification_report(y_test, y_pred_final))

# The leading "\n" must stay an escape sequence inside the literal; a physical
# line break inside a single-quoted string is a syntax error.
print("\nROC AUC Score:", roc_auc_score(y_test, y_pred_proba_final))

# Rank features by the forest's importances. X.columns lines up with
# feature_importances_ because scaling and resampling keep column order.
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': final_model.feature_importances_
}).sort_values('importance', ascending=False)

# Plot the 20 highest-ranked features on an explicitly created Axes.
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=feature_importance.head(20), x='importance', y='feature', ax=ax)
ax.set_title('Top 20 Most Important Features')
ax.set_xlabel('Feature Importance')
fig.tight_layout()
