Week 1 : Data Analysis and Cleaning
Week 2 : Feature Engineering and Basic Modeling
Week 3 : Advanced Modeling
Week 4 : Deployment

Week 1 : Day 1 -> Choose a dataset from an online source such as Kaggle
                Dataset chosen : https://www.kaggle.com/datasets/rahulchavan99/marketing-campaign-dataset

Week 1 : Day 2 -> Explore the Dataset and compute basic statistics

In [1]:
# Importing all necessary libraries
import pandas as pd
import IPython.display as display
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, log_loss
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [2]:
# Notebook-wide display configuration.
# Render every column and every row of a DataFrame (no truncation).
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Default figure size (width, height) used by matplotlib plots.
plt.rcParams.update({'figure.figsize': [12, 8]})

In [None]:
# Load the marketing campaign CSV (path is relative to the working directory)
# and preview the first rows.
df = pd.read_csv(filepath_or_buffer='Marketing campaign dataset.csv')
df.head()

In [None]:
# List the column labels of the loaded DataFrame
df.columns

In [None]:
# Dimensions of the DataFrame as a (row count, column count) tuple
df.shape

In [None]:
# Per-column dtype and non-null count for the dataset
df.info()

In [None]:
# Summary statistics of the numeric columns
# (count, mean, std, min, quartiles including the median, max)
df.describe()


Week 1 : Day 3 -> Perform EDA with visualizations to analyze patterns.

In [8]:
# CTR (click-through rate) expressed as a percentage: clicks per 100 impressions.
# A row with zero impressions has no defined CTR. Masking the zero denominators
# makes such rows NaN instead of +/-inf, which would otherwise distort the
# summary statistics, correlations and plots computed from this column.
served_impressions = df['impressions'].where(df['impressions'] != 0)
df['CTR'] = (df['clicks'] / served_impressions) * 100

In [9]:
# Parse the day-first date strings (DD-MM-YYYY) in 'time' into datetimes.
# Values that do not match the format become NaT instead of raising.
parsed_time = pd.to_datetime(df['time'], errors='coerce', format='%d-%m-%Y')
df['time'] = parsed_time

In [None]:
# Visualizing the CTR VS Time Duration

# Figure 1: CTR across the whole period covered by the dataset
plt.figure(figsize=(10, 6))
sns.lineplot(x='time', y='CTR', data=df, color='#FF6361')
plt.title('CTR over Time (Full Duration)')
plt.xlabel('Time')
plt.ylabel('CTR')
plt.xticks(rotation=45)

# CTR over last 7 days
# 'time' already holds datetimes, so max() is a Timestamp and needs no
# re-parsing. The comparison is strict: with '>=' the window would also
# include the day exactly 7 days before the latest one, i.e. eight calendar
# days instead of seven.
latest_time = df['time'].max()
last_7_days = latest_time - pd.Timedelta(days=7)
df_last_7_days = df[df['time'] > last_7_days]

# Figure 2: the same series restricted to the final seven days
plt.figure(figsize=(10, 6))
sns.lineplot(x='time', y='CTR', data=df_last_7_days, color='#58508D')
plt.title('CTR over Time (Last 7 Days)')
plt.xlabel('Time')
plt.ylabel('CTR')
plt.xticks(rotation=45)

# Render both figures
plt.show()

In [None]:
# Relationship between impressions and clicks: scatter plot first,
# then the pairwise correlation of the two columns.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df, x='impressions', y='clicks', color='green', ax=ax)
ax.set_title('Impressions vs Clicks')
ax.set_xlabel('Impressions')
ax.set_ylabel('Clicks')
plt.show()

# 2x2 correlation matrix of impressions and clicks
pair_columns = ['impressions', 'clicks']
correlation = df[pair_columns].corr()
print(correlation)


In [None]:
# Boxplot to show CTR distribution across different budget ranges.
# The budget is a continuous amount; plotting it directly on a categorical
# axis draws one box per distinct value. Bin it into (up to) five
# equal-frequency ranges so that each box summarises a budget range.
# duplicates='drop' merges bins whose edges coincide.
budget_range = pd.qcut(df['campaign_budget_usd'], q=5, duplicates='drop')

plt.figure(figsize=(10, 6))
sns.boxplot(x=budget_range, y=df['CTR'])
plt.title('CTR vs Campaign Budget (USD)')
plt.xlabel('Campaign Budget (USD)')
plt.ylabel('CTR')
plt.xticks(rotation=45)  # interval labels are long; tilt them for legibility
plt.show()


In [None]:
# CTR distribution per advertiser, one box for each advertiser name
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=df, x='advertiser_name', y='CTR', ax=ax)
ax.set_title('CTR Distribution by Advertiser')
ax.set_xlabel('Advertiser')
ax.set_ylabel('CTR')
plt.xticks(rotation=90)  # vertical labels
plt.show()


In [None]:
# Pairwise correlations between CTR and the volume / spend columns
corr_columns = ['CTR', 'impressions', 'clicks', 'campaign_budget_usd', 'media_cost_usd']
corr_matrix = df[corr_columns].corr()

# Annotated heatmap of the matrix (two decimals per cell)
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', linewidths=0.5, ax=ax)
ax.set_title('Correlation Matrix')
plt.show()


In [None]:
# Single horizontal boxplot of CTR; points beyond the whiskers are the
# candidate outliers.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='CTR', ax=ax)
ax.set_title('Boxplot for CTR to Detect Outliers')
plt.show()


Week 1 : Day 4 -> Handle missing values and address outliers

In [None]:
# Missing-value report: one row per column with the number of missing
# entries, their share of all rows (rounded to a whole percent) and the dtype.
row_count = len(df)
total_missing = df.isna().sum()
percent_missing = (total_missing * 100 / row_count).round(decimals=0)

missing_data = pd.DataFrame(
    {
        'Total': total_missing,
        'Percentage': percent_missing,
        'Type': df.dtypes,
    }
).sort_values(by='Total', ascending=False)  # worst-affected columns first

missing_data

In [None]:
# filling missing values with 0 for following columns
# The result is assigned back to df. Calling fillna(..., inplace=True) on
# df['col'] operates on an intermediate object and is not guaranteed to
# update df (it does not under pandas Copy-on-Write).
zero_fill_columns = ['creative_width', 'creative_height', 'template_id', 'approved_budget']
df[zero_fill_columns] = df[zero_fill_columns].fillna(0)

# dropping where more than 80% values are missing
df.drop(columns=['position_in_content', 'unique_reach', 'total_reach', 'max_bid_cpm'], inplace=True)

# Re-check the summary statistics after imputation and column removal
df.describe()

Week 1 : Day 5 -> Clean the Dataset and prepare for feature engineering

In [None]:
# Count rows that repeat an earlier row in every column
duplicate_mask = df.duplicated()
duplicates = duplicate_mask.sum()
print(f"Number of duplicate rows: {duplicates}")

In [19]:
# Force the numeric columns to a numeric dtype; unparseable entries become NaN
for numeric_column in ('CTR', 'campaign_budget_usd'):
    df[numeric_column] = pd.to_numeric(df[numeric_column], errors='coerce')

# Make sure 'time' is a datetime column (DD-MM-YYYY when still a string);
# unparseable entries become NaT
df['time'] = pd.to_datetime(df['time'], errors='coerce', format='%d-%m-%Y')


Week 2 : Day 1 -> Encode Categorical Features (e.g., one-hot encoding)

Categorical variables should be properly encoded for machine learning. You can either use label encoding or one-hot encoding depending on the model you plan to use.

In [None]:
# Preparation for encoding: trim leading/trailing whitespace from the column
# labels so that columns can be addressed by their exact names.
stripped_columns = df.columns.str.strip()
df.columns = stripped_columns
print(df.columns)

# Preview the first rows with the cleaned labels
print(df.head())


In [21]:
# One-hot encode the listed categorical columns. drop_first=True omits the
# first level of each column, which then acts as the implicit reference level.
encoded_columns = ['network_id', 'advertiser_name', 'campaign_item_id']
df = pd.get_dummies(data=df, columns=encoded_columns, drop_first=True)


Week 2 : Day 2 -> Scale/Normalize numerical features and generate interaction terms

In [22]:
from sklearn.preprocessing import StandardScaler

# Standardise the three numeric columns to zero mean and unit variance.
# NOTE(review): the scaler is fitted on the full dataset, before the
# train/test split, so test-set statistics influence the scaling - consider
# fitting on the training rows only.
scaled_columns = ['campaign_budget_usd', 'impressions', 'clicks']
scaler = StandardScaler()
scaler.fit(df[scaled_columns])
df[scaled_columns] = scaler.transform(df[scaled_columns])


Week 2 : Day 3 -> Split Data into training and testing sets

In [None]:
from sklearn.model_selection import train_test_split

# Step 1: CTR is the target; every remaining column is used as a feature.
# NOTE(review): 'clicks' and 'impressions' stay in X although CTR is computed
# from them - confirm this is intended.
y = df['CTR']
X = df.drop(columns=['CTR'])

# Step 2: hold out 20% of the rows for testing (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Report how many rows ended up in each part
print(f"Training set size: {len(X_train)} rows")
print(f"Testing set size: {len(X_test)} rows")


In [None]:
# Inspect the dtype of every feature column in the training split
print(X_train.dtypes)
# Disabled draft kept for reference: derive year/month/day from df['time']
# and drop 'time' from df.
# df.columns = df.columns.str.strip()
# print(df['time'].dtype)  # Should output 'datetime64[ns]'
# # If time is already in datetime64 format, we can directly extract features
# df['year'] = df['time'].dt.year
# df['month'] = df['time'].dt.month
# df['day'] = df['time'].dt.day
# df = df.drop(columns=['time'])
# print(df[['time']].head())

In [25]:
# Split the 'time' timestamp into calendar components on df.
# NOTE(review): these columns are added to df after X, X_train and X_test were
# derived from it, so as written they do not reach the model inputs - confirm
# whether they were meant to be features.
for date_part in ('year', 'month', 'day'):
    df[date_part] = getattr(df['time'].dt, date_part)

In [None]:
# Remove the raw 'time' timestamp from BOTH feature sets so that training and
# test data keep the same columns (previously only X_train was cleaned).
# errors='ignore' makes the cell safe to re-run once the column is gone.
X_train = X_train.drop(columns=['time'], errors='ignore')
X_test = X_test.drop(columns=['time'], errors='ignore')

# Verify that 'time' is dropped
print(X_train.dtypes)  # Check that 'time' is no longer present

In [None]:
# Print the shape of each part of the split as a sanity check
split_parts = (
    ('X_train', X_train),
    ('X_test', X_test),
    ('y_train', y_train),
    ('y_test', y_test),
)
for part_name, part in split_parts:
    print(f"{part_name} shape: {part.shape}")



Week 2 : Day 4 -> Train a basic model and evaluate using MSE

Model Training with Linear Regression

In [None]:
# Check for any non-numeric columns that may have been missed
categorical_columns = X_train.select_dtypes(include=['object']).columns
print("Categorical columns in the training set:")
print(categorical_columns)

# One-hot encode the training features; drop_first=True removes one reference
# level per categorical column.
X_train = pd.get_dummies(X_train, drop_first=True)

# Encode the test features WITHOUT dropping a level. Which level counts as
# "first" depends on the categories present in each frame, so dropping it
# independently on the test set could remove a different level than on the
# training set and silently mis-encode test rows.
X_test = pd.get_dummies(X_test)

# Dummy columns the training set has but the test set lacks
missing_cols = set(X_train.columns) - set(X_test.columns)

# Project the test frame onto the training columns: same columns, same order,
# absent dummies filled with 0, test-only columns discarded.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)



In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Baseline model: linear regression fitted on the training split
model = LinearRegression().fit(X_train, y_train)

# Predict CTR for the held-out rows
y_pred = model.predict(X_test)

# Mean squared error between the true and predicted test targets
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f"Mean Squared Error: {mse}")


Week 2 : Day 5 -> Mid Evaluation - Present the cleaned dataset, feature engineering, and initial model performance

Week 3 : Day 1 -> Train advanced models (e.g., decision tree, random forest)

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import plot_tree
from sklearn.metrics import mean_squared_error, r2_score

# Decision tree regressor with a fixed seed, fitted on the training split
dt_model = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)

# Predictions for the held-out rows
y_pred_dt = dt_model.predict(X_test)

# Test-set error (MSE) and goodness of fit (R²)
mse_dt, r2_dt = mean_squared_error(y_test, y_pred_dt), r2_score(y_test, y_pred_dt)

print(f"Decision Tree Mean Squared Error: {mse_dt}")
print(f"Decision Tree R² Score: {r2_dt}")

# Optional (disabled): draw the fitted tree
# plt.figure(figsize=(12, 8))
# plot_tree(dt_model, filled=True, feature_names=X_train.columns, class_names=['CTR'])
# plt.title('Decision Tree Visualization')
# plt.show()



In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest regressor: 50 trees of depth <= 10, sqrt(n_features) candidate
# features per split, all CPU cores, fixed seed.
rf_settings = {
    'n_estimators': 50,
    'max_depth': 10,
    'n_jobs': -1,
    'max_features': 'sqrt',
    'random_state': 42,
}
rf_model = RandomForestRegressor(**rf_settings)
rf_model.fit(X_train, y_train)

# Predictions for the held-out rows
y_pred_rf = rf_model.predict(X_test)

# Test-set error (MSE) and goodness of fit (R²)
mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)
print(f"Random Forest Mean Squared Error: {mse_rf}")
print(f"Random Forest R² Score: {r2_rf}")

# Feature importances of the fitted forest, most important first
importances_rf = rf_model.feature_importances_
feature_importances = pd.DataFrame(
    {'Feature': X_train.columns, 'Importance': importances_rf}
).sort_values(by='Importance', ascending=False)

# Show the five most important features
print(feature_importances.head())

# Optional (disabled): bar chart of the importances
# plt.figure(figsize=(12, 8))
# sns.barplot(x='Importance', y='Feature', data=feature_importances)
# plt.title('Random Forest Feature Importance')
# plt.show()


Week 3 : Day 2 -> Introduce and train XGBoost or LightGBM models.

In [None]:
# Print the installed pandas version
print(pd.__version__)

In [None]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd  # Ensure pandas is imported if necessary (although not used for Int64Index here)

# Create the DMatrix objects for XGBoost
train_data = xgb.DMatrix(X_train, label=y_train, enable_categorical=True)
test_data = xgb.DMatrix(X_test, label=y_test, enable_categorical=True)


# Set parameters for XGBoost model
params = {
    'objective': 'reg:squarederror',  # Regression task (MSE loss)
    'eval_metric': 'rmse',            # Root mean square error (RMSE) as eval metric
    'max_depth': 6,                   # Max depth of each tree
    'learning_rate': 0.1,             # Learning rate
    'colsample_bytree': 0.8,          # Fraction of features to use
    'subsample': 0.8                  # Fraction of samples to use
}

# List of evaluation data; early stopping monitors the LAST entry ('eval')
# NOTE(review): the test set doubles as the early-stopping validation set, so
# the scores reported below are optimistic - a separate validation split
# would give an unbiased estimate.
evals = [(train_data, 'train'), (test_data, 'eval')]

# Train the model with early stopping
bst = xgb.train(
    params,
    train_data,
    num_boost_round=1000,          # Maximum boosting rounds
    evals=evals,
    early_stopping_rounds=10       # Stop early if no improvement for 10 rounds
)

# Make predictions with the trees up to and including the best iteration.
# Training continues for `early_stopping_rounds` rounds past the best one, so
# the range is passed explicitly to leave those later trees out.
y_pred_xgb = bst.predict(test_data, iteration_range=(0, bst.best_iteration + 1))

# Evaluate the model
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
r2_xgb = r2_score(y_test, y_pred_xgb)

# Print evaluation results
print(f"XGBoost Mean Squared Error: {mse_xgb}")
print(f"XGBoost R² Score: {r2_xgb}")


In [None]:
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, r2_score

# Clean column names to remove special characters and spaces
# (every character other than a letter or digit becomes '_')
X_train.columns = X_train.columns.str.replace(r'[^a-zA-Z0-9]', '_', regex=True)
X_test.columns = X_test.columns.str.replace(r'[^a-zA-Z0-9]', '_', regex=True)

# Create LightGBM datasets; the validation set references the training set so
# both share the same feature binning
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

# Set parameters for LightGBM model
params = {
    'objective': 'regression',    # For regression tasks
    'metric': 'l2',               # Metric for evaluation (L2 loss = MSE)
    'boosting_type': 'gbdt',      # Gradient Boosting Decision Tree
    'num_leaves': 31,             # Number of leaves in one tree
    'learning_rate': 0.1,         # Learning rate
    'feature_fraction': 0.8,      # Fraction of features to use per tree
    'bagging_fraction': 0.8,      # Fraction of samples to use per tree
    'bagging_freq': 5,            # Frequency of bagging
    'verbose': -1                 # Suppress output
}

# Train the model with early stopping.
# Without the early-stopping callback all 1000 rounds are always trained and
# best_iteration is never set, so the "best iteration" reported and used below
# would be meaningless. The patience of 10 rounds mirrors the XGBoost cell.
# NOTE(review): the test set doubles as the early-stopping validation set, so
# the scores reported below are optimistic.
lgb_model = lgb.train(
    params,                      # Model parameters
    train_data,                  # Training data
    num_boost_round=1000,        # Max number of boosting rounds
    valid_sets=[test_data],      # Validation set for evaluation
    valid_names=['test'],        # Name for the validation dataset (optional)
    callbacks=[lgb.early_stopping(stopping_rounds=10)]
)

# Check the best iteration from the training process
print(f"Best iteration: {lgb_model.best_iteration}")

# Predict using the best iteration found during training
y_pred_lgb = lgb_model.predict(X_test, num_iteration=lgb_model.best_iteration)

# Evaluate the model
mse_lgb = mean_squared_error(y_test, y_pred_lgb)
r2_lgb = r2_score(y_test, y_pred_lgb)

# Print evaluation results
print(f"LightGBM Mean Squared Error: {mse_lgb}")
print(f"LightGBM R² Score: {r2_lgb}")


Week 3 : Day 3 -> Evaluate all models using Log Loss and ROC-AUC

In [None]:
# Turn the continuous CTR target into a binary label for classification:
# 1 when CTR is strictly above the cut-off, 0 otherwise.
# CTR is stored as a percentage, so the cut-off of 0.5 means 0.5 %.
threshold = 0.5
y_train_binary = y_train.gt(threshold).astype(int)
y_test_binary = y_test.gt(threshold).astype(int)


from sklearn.metrics import log_loss, roc_auc_score

# Function to calculate Log Loss and ROC-AUC
def evaluate_model(model, X_train, y_train, X_test, y_test):
    """Score a fitted binary classifier on the training and test splits.

    Args:
        model: Fitted estimator exposing ``predict_proba``; column 1 of its
            output is taken as the probability of the positive class.
        X_train: Training features.
        y_train: Binary training labels.
        X_test: Test features.
        y_test: Binary test labels.

    Returns:
        dict with the keys 'Log Loss (Train)', 'Log Loss (Test)',
        'ROC-AUC (Train)' and 'ROC-AUC (Test)'.
    """
    splits = (('Train', X_train, y_train), ('Test', X_test, y_test))

    # Positive-class probabilities for each split
    scored = [
        (split_name, labels, model.predict_proba(features)[:, 1])
        for split_name, features, labels in splits
    ]

    metrics = {}
    # Log loss first, then ROC-AUC, so the key order stays Train/Test per metric
    for split_name, labels, positive_proba in scored:
        metrics[f'Log Loss ({split_name})'] = log_loss(labels, positive_proba)
    for split_name, labels, positive_proba in scored:
        metrics[f'ROC-AUC ({split_name})'] = roc_auc_score(labels, positive_proba)
    return metrics

# Train four classifiers on the binarised CTR label and report Log Loss and
# ROC-AUC for each. Every estimator that accepts a seed gets random_state=42,
# matching the seed used elsewhere in this notebook, so that re-running the
# cell reproduces the same scores (the random forest was previously unseeded).
# NOTE: rf_model and lgb_model replace the regressors of the same name that
# were trained in the earlier cells.

# Example for Logistic Regression
from sklearn.linear_model import LogisticRegression

# Train the Logistic Regression model
logreg_model = LogisticRegression(max_iter=1000)
logreg_model.fit(X_train, y_train_binary)

# Evaluate the Logistic Regression model
logreg_eval = evaluate_model(logreg_model, X_train, y_train_binary, X_test, y_test_binary)
print("Logistic Regression Evaluation:", logreg_eval)

# Example for RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier

# Train the Random Forest model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train_binary)

# Evaluate the Random Forest model
rf_eval = evaluate_model(rf_model, X_train, y_train_binary, X_test, y_test_binary)
print("Random Forest Evaluation:", rf_eval)

# Example for XGBoost
import xgboost as xgb

# Train the XGBoost model
xgb_model = xgb.XGBClassifier(eval_metric='logloss', random_state=42)
xgb_model.fit(X_train, y_train_binary)

# Evaluate the XGBoost model
xgb_eval = evaluate_model(xgb_model, X_train, y_train_binary, X_test, y_test_binary)
print("XGBoost Evaluation:", xgb_eval)

# Example for LightGBM
import lightgbm as lgb

# Train the LightGBM model
lgb_model = lgb.LGBMClassifier(objective='binary', metric='binary_logloss', random_state=42)
lgb_model.fit(X_train, y_train_binary)

# Evaluate the LightGBM model
lgb_eval = evaluate_model(lgb_model, X_train, y_train_binary, X_test, y_test_binary)
print("LightGBM Evaluation:", lgb_eval)


Week 3 : Day 4 -> Perform hyperparameter tuning for the best model

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
import pkg_resources  # kept from the original cell; not referenced below


# Define the model
model = XGBClassifier()

# Hyperparameters to tune
param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.1, 0.01],
    'max_depth': [3, 5],
}

# Perform Grid Search with 3-fold cross-validation.
# The classifier needs discrete class labels, so it is fitted on the binarised
# target (y_train_binary) rather than on the continuous CTR values.
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, scoring='accuracy')
grid_search.fit(X_train, y_train_binary)

# Get the best parameters and the best score
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Score:", grid_search.best_score_)

# Evaluate the tuned model on the held-out test rows.
# The existing X_train / X_test split is reused on purpose: re-splitting the
# raw X would discard the encoding and column alignment applied to the splits
# and bring back the 'time' column and unencoded categoricals.
# GridSearchCV refits the best configuration on the whole training set,
# which is what best_estimator_ returns.
model = grid_search.best_estimator_

# Predict and evaluate
y_pred = model.predict(X_test)
accuracy = (y_pred == y_test_binary).mean()
print(f"Accuracy: {accuracy}")
