In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Importing libraries**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from category_encoders import TargetEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, RandomizedSearchCV,GridSearchCV, GroupKFold, cross_val_score, cross_validate
from sklearn.metrics import r2_score, mean_squared_error, make_scorer
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, VotingRegressor
from sklearn.svm import SVR
import xgboost as xgb
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

In [None]:
import warnings
# Suppress RuntimeWarning messages from here on so they do not clutter cell output.
warnings.filterwarnings("ignore", category=RuntimeWarning)

# Data Loading
**Reading the train_data csv file**

In [None]:
# Raw competition data: `first_df` holds the training rows as read from disk,
# `test_df` holds the rows to predict.
first_df = pd.read_csv("/kaggle/input/engage-2-value-from-clicks-to-conversions/train_data.csv") 
test_df=pd.read_csv("/kaggle/input/engage-2-value-from-clicks-to-conversions/test_data.csv")

In [None]:
# Work on a copy so the frame read from disk (`first_df`) stays untouched.
train_df = first_df.copy()

**EDA**

Performing exploratory data analysis

In [None]:
train_df.shape

In [None]:
test_df.shape

In [None]:
train_df.head(3)

In [None]:
test_df.head(3)

In [None]:
train_df.info()

In [None]:
test_df.info()

In [None]:
train_df.describe()

In [None]:
test_df.describe()

In [None]:
train_df.nunique()

In [None]:
test_df.nunique()

In [None]:
train_df.isna().sum()

In [None]:
test_df.isna().sum()

In [None]:
# Print the number of distinct values in every object/bool column of the training frame.
categorical_cols = train_df.select_dtypes(include=['object', 'bool']).columns
for name, n_unique in train_df[categorical_cols].nunique().items():
    print(f"{name}: {n_unique}")

In [None]:
# Print the number of distinct values in every int64/float64 column of the training frame.
numerical_cols = train_df.select_dtypes(include=['int64', 'float64']).columns
for name, n_unique in train_df[numerical_cols].nunique().items():
    print(f"{name}: {n_unique}")

In [None]:
train_df.columns

In [None]:
test_df.columns

In [None]:
print(train_df['purchaseValue'].unique())

In [None]:
print(train_df['purchaseValue'].value_counts())

# Graphical Analysis

**Purchase Value Distribution**

**Insight:**
The majority of purchaseValue entries are concentrated at the low end of the range, with a long tail toward high values (i.e., the distribution is right-skewed).
There are a few extreme outliers.
These outliers may be due to:

(i) Logging errors

(ii) Accidental large purchases

(iii) Malformed data entries

In [None]:
# Histogram of the target using 50 bins.
ax = sns.histplot(data=train_df, x='purchaseValue', bins=50)
ax.set_title("Purchase Value Distribution")
plt.show()

**Purchase Value for each browser and count of each browsers**

**Insight:**
Chrome is the most used browser by users in this dataset.
Safari is next, but it has far fewer users than Chrome - indicating a significant iOS/macOS user base.
Browsers like Android WebView, Safari (in-app), Opera Mini, and Samsung Internet indicate strong mobile traffic.
Internet Explorer still appears in the top 10 despite being long outdated.

In [None]:
# Bar chart of the ten most frequent browsers in the training data.
fig = plt.figure(figsize=(5,5))
top_browsers = train_df['browser'].value_counts().head(10)
bar_positions = range(len(top_browsers))
plt.bar(bar_positions, top_browsers.values, color='lightgreen', alpha=0.7)
plt.title('Top 10 Browsers', fontsize=12, fontweight='bold')
plt.xlabel('Browser')
plt.ylabel('Count')
plt.xticks(bar_positions, top_browsers.index, rotation=90)
# Finish the figure like the other plotting cells do: tight_layout keeps the
# rotated labels inside the canvas, and show() stops the cell from echoing the
# return value of plt.xticks.
plt.tight_layout()
plt.show()

**Purchase Value by Browser**

**Insight:** Firefox shows the highest average purchase value, but the error bar is large, suggesting high variance. This might mean a few users contributed very large purchases, which skews the mean.
Chrome has a high and consistent average purchase value, indicating a strong and steady customer base.
Edge, Internet Explorer, and Safari also show moderate average purchase values.
Browsers like Opera Mini, Android Webview, and Samsung Internet have negligible purchase contributions, indicating these platforms might not be ideal for targeting high-value users.

In [None]:
# Mean purchase value (seaborn's default bar estimator) for the ten most frequent browsers.
top_browsers = train_df['browser'].value_counts().head(10).index
in_top_browsers = train_df['browser'].isin(top_browsers)
sns.barplot(
    data=train_df[in_top_browsers],
    x='browser',
    y='purchaseValue',
)
plt.xticks(rotation=90)
plt.title("Purchase Value by Browser")
plt.show()

**Device type distribution**

**Insight:**
Majority of users access the platform via desktop.
This indicates that users prefer a full-screen experience, for easier navigation, and better visuals.
A significant group of users still use mobile. Thus, mobile optimization is important. One-fifth of traffic is mobile. A very small share of users access through tablets.

In [None]:
# Pie chart with the share of sessions per device type.
fig = plt.figure(figsize=(3,3))
device_counts = train_df['deviceType'].value_counts()
plt.pie(device_counts.values, labels=device_counts.index, autopct='%1.1f%%', startangle=90)
plt.title('Device Type Distribution', fontsize=12, fontweight='bold')
# show() renders the figure explicitly and keeps the Text object returned by
# plt.title out of the cell output, matching the other plotting cells.
plt.show()

**Average Purchase Value by Device Type**

**Insight:** Desktop users have the highest average purchase value. Tablet users spend slightly more than mobile users.

In [None]:
# Mean purchase value per device type, drawn on an explicitly created axes.
fig, ax = plt.subplots(figsize=(5, 5))
sns.barplot(
    data=train_df,
    x='deviceType',
    y='purchaseValue',
    estimator='mean',
    palette='pastel',
    ax=ax,
)
ax.set_title("Average Purchase Value by Device Type")
ax.set_ylabel("Average Purchase Value")
ax.set_xlabel("Device Type")
fig.tight_layout()
plt.show()

**Visualising data using graphs**

1. Operating System distribution
2. Geographic distribution (continents)
3. Page views distribution
4. Session Number vs Purchase Value (scatter plot, log-scaled y-axis)
5. Mobile vs Desktop purchases (see the device-type charts above; not repeated in the figure below)
6. User Channel vs Purchase Value

**Insights:**

1) Windows and Macintosh dominate the user base with nearly equal and highest counts. Android and iOS have a substantial presence, indicating mobile users are also significant. Linux, Chrome OS, and Windows Phone have low usage. Optimization for Windows and Mac platforms is crucial, with a secondary focus on Android and iOS.
2) The Americas are the largest source of traffic, followed by Asia and Europe. Africa and Oceania show very low counts. Marketing strategies should prioritize American, Asian, and European regions due to higher engagement.
3) Right-Skewed Distribution: Majority of users view fewer pages (less than 100), and very few users cross 300+. Most users don’t browse many pages, implying the need to improve content engagement or conversion in early sessions.
4) Log Scale Used: Purchase values span several orders of magnitude. No clear linear correlation; a few users in early and late sessions have extremely high purchase values (outliers). High-value purchases can occur at any session, so consistent nurturing across sessions is important.
5) Display and Referral channels generate the highest average purchase values. Direct and Paid Search also perform decently. Social and Affiliates channels have very low average purchase values.
Investment in Display and Referral marketing brings high returns; Social channels might need content re-strategy or reduced budget.

In [None]:
# Five-panel overview figure laid out on a 2x3 grid (the sixth slot stays empty).
plt.style.use('default')
sns.set_palette("husl")
fig = plt.figure(figsize=(18,18))
target_col = 'purchaseValue'

# Panel 1: the eight most frequent operating systems.
plt.subplot(2, 3, 1)
os_counts = train_df['os'].value_counts().head(8)
plt.barh(range(len(os_counts)), os_counts.values, color='orange', alpha=0.7)
plt.title('Operating System Distribution', fontsize=12, fontweight='bold')
plt.xlabel('Count')
plt.yticks(range(len(os_counts)), os_counts.index)

# Panel 2: number of sessions per continent.
plt.subplot(2, 3, 2)
continent_counts = train_df['geoNetwork.continent'].value_counts()
plt.bar(continent_counts.index, continent_counts.values, color='purple', alpha=0.7)
plt.title('Geographic Distribution by Continent', fontsize=12, fontweight='bold')
plt.xlabel('Continent')
plt.ylabel('Count')
plt.xticks(rotation=45)

# Panel 3: page-view histogram; the log y-axis keeps the sparse tail visible.
plt.subplot(2, 3, 3)
plt.hist(train_df['pageViews'], bins=50, alpha=0.7, color='teal', edgecolor='black')
plt.title('Page Views Distribution', fontsize=12, fontweight='bold')
plt.xlabel('Page Views')
plt.ylabel('Frequency')
plt.yscale('log')

# Panel 4: session number against the target. With a log y-axis, rows whose
# purchase value is zero cannot be drawn and are therefore not visible.
plt.subplot(2, 3, 4)
plt.scatter(train_df['sessionNumber'], train_df[target_col], alpha=0.5, s=10, color='red')
plt.title('Session Number vs Purchase Value', fontsize=12, fontweight='bold')
plt.xlabel('Session Number')
plt.ylabel('Purchase Value')
plt.yscale('log')

# Panel 5: mean purchase value per user channel, highest first.
plt.subplot(2, 3, 5)
channel_purchases = train_df.groupby('userChannel')[target_col].mean().sort_values(ascending=False)
plt.bar(range(len(channel_purchases)), channel_purchases.values, color='gold', alpha=0.7)
plt.title('Average Purchase Value by User Channel', fontsize=12, fontweight='bold')
plt.xlabel('User Channel')
plt.ylabel('Average Purchase Value')
plt.xticks(range(len(channel_purchases)), channel_purchases.index, rotation=45)

# tight_layout stops titles and rotated tick labels of neighbouring panels from
# overlapping; show() renders the figure without echoing the last call's repr.
plt.tight_layout()
plt.show()

**Correlation Heatmap**

**Insight:**
Positively Correlated Features: pageViews,totalHits, sessionNumber

Users who browse more pages, generate more hits, or have multiple sessions tend to spend more.


Negatively Correlated Features: device.isMobile, sessionId, sessionStart 

Mobile users usually spend less, possibly due to limited functionality, smaller screen sizes, or intent differences.
sessionId and sessionStart are identifiers or timestamps, offering little direct predictive value.

Highly Correlated Pairs: sessionStart and date, totalHits and pageViews, sessionStart and totalHits, 
date and totalHits

These high correlations suggest multicollinearity. Drop one of each highly correlated pair to avoid redundancy.

Weak or No Correlation Features: userId, gclIdPresent, locationZone, trafficSource.adwordsClickInfo.page, and new_visits show very low correlation with purchaseValue.

These features may not contribute directly to predicting purchase behavior. Drop them.



In [None]:
# Pairwise correlations between the numeric columns, drawn as an annotated heatmap.
corr = train_df.corr(numeric_only=True)

heatmap_options = dict(
    cmap='coolwarm',
    annot=True,
    fmt='.2f',
    square=True,
    linewidths=0.5,
    cbar_kws={"shrink": 0.8},
    annot_kws={"size": 10},
)

plt.figure(figsize=(10, 10))
sns.heatmap(corr, **heatmap_options)
plt.title("Correlation Heatmap", fontsize=16, fontweight='bold')
plt.xticks(rotation=90, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

**Purchase Value vs Continent**

America has the highest median and a large number of outliers. This means that customers here tend to make significantly higher purchases compared to others. Europe, Asia, Africa, and Oceania show low purchase values, with only a few outliers. This may be due to lower overall e-commerce penetration or smaller average transaction sizes in those regions. The (not set) category also exhibits minimal purchase activity, representing unidentified or poorly logged regions.

In [None]:
# Box plot of the target for each continent.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(
    data=train_df,
    x='geoNetwork.continent',
    y='purchaseValue',
    palette="pastel",
    ax=ax,
)
ax.set_title('Purchase Value by Continent')
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()

**Dummy Submission**

In [None]:
# Baseline submission: a DummyRegressor with default settings is fitted on the
# raw training file, and its predictions for the test file are written to disk.
from sklearn.dummy import DummyRegressor

df = pd.read_csv("/kaggle/input/engage-2-value-from-clicks-to-conversions/train_data.csv")
y = df['purchaseValue']
X = df.drop(columns="purchaseValue")

model = DummyRegressor()
model.fit(X, y)

X_test = pd.read_csv("/kaggle/input/engage-2-value-from-clicks-to-conversions/test_data.csv")
y_pred = model.predict(X_test)

# One row per test record; ids are the positional row numbers starting at 0.
submission = pd.DataFrame({
    "id": range(len(X_test)),
    "purchaseValue": y_pred,
})
submission.to_csv('submission.csv', index=False)

# Final

# Dropping columns

Dropping columns with a high share of missing values, low variance or constant values (i.e., only one unique value), and columns that duplicate information found in other columns, to prevent multicollinearity.

In [None]:
# Columns removed from both frames before feature engineering.
drop_cols = [
    'trafficSource.isTrueDirect',
    'device.screenResolution',
    'screenSize',
    'device.mobileDeviceBranding',
    'device.mobileInputSelector',
    'device.mobileDeviceMarketingName',
    'device.operatingSystemVersion',
    'device.flashVersion',
    'totals.visits',
    'geoNetwork.networkLocation',
    'trafficSource.adwordsClickInfo.isVideoAd',
    'browserMajor',
    'device.browserSize',
    'socialEngagementType',
    'locationZone',
    'device.mobileDeviceModel',
    'totals.bounces',
    'device.language',
    'device.browserVersion',
    'device.screenColors',
    'new_visits',
    'trafficSource.campaign',
    'trafficSource.adContent',
    'trafficSource.adwordsClickInfo.slot',
    'trafficSource.adwordsClickInfo.adNetworkType',
    'trafficSource.adwordsClickInfo.page',
]
# errors='ignore' skips any listed column that is not present in a frame.
train_df = train_df.drop(columns=drop_cols, errors='ignore')
test_df = test_df.drop(columns=drop_cols, errors='ignore')

# Feature Engineering

In [None]:
def _add_session_features(frame):
    """Add calendar, combined-category and ratio columns to `frame` in place.

    Expects `frame['date']` to already be a datetime column. New columns are
    created in a fixed order so both frames end up with the same layout.
    """
    when = frame['date'].dt

    # Calendar features derived from the session date.
    frame['day_of_week'] = when.dayofweek
    frame['day_of_month'] = when.day
    frame['weekofyear'] = when.isocalendar().week.astype(int)
    frame['month'] = when.month
    frame['quarter'] = when.quarter
    frame['is_weekend'] = frame['day_of_week'].isin([5, 6]).astype(int)
    frame['is_month_start'] = when.is_month_start.astype(int)
    frame['is_month_end'] = when.is_month_end.astype(int)

    # Combined categorical keys.
    frame['device_os'] = frame['deviceType'].astype(str) + '_' + frame['os'].astype(str)
    frame['geo_region'] = frame['locationCountry'].astype(str) + '_' + frame['geoNetwork.region'].astype(str)

    # Engagement ratios; the +1 in each denominator avoids division by zero.
    frame['views_per_hit'] = frame['pageViews'] / (frame['totalHits'] + 1)
    frame['views_per_session'] = frame['pageViews'] / (frame['sessionNumber'] + 1)
    frame['hits_per_session'] = frame['totalHits'] / (frame['sessionNumber'] + 1)
    frame['hits_per_page'] = frame['totalHits'] / (frame['pageViews'] + 1)


for df in (train_df, test_df):
    # Dates are parsed with the YYYYMMDD format before the features are derived.
    df['date'] = pd.to_datetime(df['date'], format='%Y%m%d')
    _add_session_features(df)

Grouping by User Id and figuring out user level statistics

In [None]:
# User-level statistics of the target.
#
# Merging the plain per-user aggregates back onto the training rows would leak
# the label: each row's own purchaseValue is part of its user's mean/sum/std.
# Training rows therefore get leave-one-out statistics, computed from the
# user's *other* training sessions only. Test rows get the aggregates over all
# of that user's training sessions.
stat_cols = ['user_mean', 'user_sum', 'user_count', 'user_std']

# Remove results of a previous run so re-executing the cell does not produce
# duplicated `_x` / `_y` columns.
train_df = train_df.drop(columns=stat_cols, errors='ignore')
test_df = test_df.drop(columns=stat_cols, errors='ignore')

user_stats = train_df.groupby('userId')['purchaseValue'].agg(['mean', 'sum', 'count', 'std']).reset_index()
user_stats.columns = ['userId'] + stat_cols

# Test rows: users that never appear in the training data have no history, so
# their sum and count are 0; mean and std stay NaN.
test_df = test_df.merge(user_stats, on='userId', how='left')
test_df[['user_sum', 'user_count']] = test_df[['user_sum', 'user_count']].fillna(0)

# Training rows: subtract the row's own contribution from the user totals.
own_value = train_df['purchaseValue'].astype(float)
has_value = own_value.notna().astype(int)
own_filled = own_value.fillna(0.0)
user_key = train_df['userId']

n_other = has_value.groupby(user_key).transform('sum') - has_value
sum_other = own_filled.groupby(user_key).transform('sum') - own_filled
sumsq_other = (own_filled ** 2).groupby(user_key).transform('sum') - own_filled ** 2

# The mean needs at least one other session and the sample standard deviation
# (ddof=1, matching pandas' `std`) at least two; otherwise the value is NaN.
mean_other = sum_other / n_other.where(n_other > 0)
var_other = (sumsq_other - sum_other ** 2 / n_other.where(n_other > 0)) / (n_other - 1).where(n_other > 1)

train_df['user_mean'] = mean_other
train_df['user_sum'] = sum_other
train_df['user_count'] = n_other
# Clip tiny negative variances caused by floating-point cancellation.
train_df['user_std'] = np.sqrt(var_other.clip(lower=0))

# Encoding

Label Encoding columns which have low cardinality.

Frequency Encoding columns which have medium cardinality.

Target Encoding columns with high cardinality.

In [None]:
# --- Label encoding -------------------------------------------------------
# Each listed column is converted to strings and mapped to integer codes. The
# encoder is fitted on train and test values together so both frames share
# one code table.
label_cols = [
    'deviceType',
    'os',
    'device.isMobile',
    'gclIdPresent',
    'trafficSource.medium',
    'userChannel',
    'geoNetwork.continent',
]

for name in label_cols:
    if name not in train_df.columns:
        continue
    all_values = pd.concat([train_df[name], test_df[name]], axis=0).astype(str)
    encoder = LabelEncoder()
    encoder.fit(all_values)
    train_df[name] = encoder.transform(train_df[name].astype(str))
    test_df[name] = encoder.transform(test_df[name].astype(str))

# --- Frequency encoding ---------------------------------------------------
# Each value is replaced by its share of the training rows; values without a
# training frequency (including missing ones) become 0.
freq_cols = [
    'trafficSource',
    'geoNetwork.region',
    'geoNetwork.city',
    'geoNetwork.metro',
    'locationCountry',
    'geoCluster',
]
n_train_rows = len(train_df)
for name in freq_cols:
    if name not in train_df.columns:
        continue
    share = train_df[name].value_counts() / n_train_rows
    train_df[name] = train_df[name].map(share).fillna(0)
    test_df[name] = test_df[name].map(share).fillna(0)

# --- Target encoding --------------------------------------------------------
# Each category is replaced by the mean purchase value observed for it.
#
# Encoding a training row with a mean that includes the row's own label leaks
# the target into the feature. Training rows are therefore encoded out-of-fold:
# the mean applied to a row comes only from rows in the other folds. Folds are
# split by userId so a user's sessions never encode each other, mirroring the
# GroupKFold used later for validation. Test rows use the means of the full
# training set.
target = 'purchaseValue'
high_card_cols = ['browser', 'trafficSource.keyword', 'trafficSource.referralPath']
te_splitter = GroupKFold(n_splits=5)

for col in high_card_cols:
    if col in train_df.columns:
        oof_encoded = pd.Series(np.nan, index=train_df.index, dtype=float)
        for fit_idx, enc_idx in te_splitter.split(train_df, groups=train_df['userId']):
            fold_means = train_df.iloc[fit_idx].groupby(col)[target].mean()
            oof_encoded.iloc[enc_idx] = train_df[col].iloc[enc_idx].map(fold_means).to_numpy()
        # Categories absent from the fitting folds stay NaN here.
        train_df[col + '_te'] = oof_encoded

        means = train_df.groupby(col)[target].mean()
        test_df[col + '_te'] = test_df[col].map(means)

# Preprocessing and Imputation

In [None]:
# Assemble the model inputs.
id_col = 'sessionId'
group_col = 'userId'

# userId is kept only as the grouping key for cross-validation. It is an
# identifier, not a predictor, so it is excluded from the features together
# with the session id and the raw date.
non_feature_cols = [id_col, group_col, 'date']
groups = train_df[group_col]
y = train_df[target]
X = train_df.drop(columns=[target] + non_feature_cols, errors='ignore')
X_test = test_df.drop(columns=non_feature_cols, errors='ignore')

# Integer-code every remaining text column; the encoder is fitted on train and
# test values together so both frames share one code table.
for col in X.select_dtypes(include='object').columns:
    le = LabelEncoder()
    full_vals = pd.concat([X[col], X_test[col]], axis=0).astype(str)
    le.fit(full_vals)
    X[col] = le.transform(X[col].astype(str))
    X_test[col] = le.transform(X_test[col].astype(str))

# Give the test matrix exactly the training columns, in the same order.
X_test = X_test[X.columns]

# Median imputation fitted on the training features only. The original index
# is kept so X stays aligned with `y` and `groups`.
imputer = SimpleImputer(strategy='median')
X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns, index=X.index)
X_test = pd.DataFrame(imputer.transform(X_test), columns=X.columns, index=X_test.index)

In [None]:
X.head()

In [None]:
y.head()

# Model Selection and Hyper Parameter Tuning

Insight:

RandomForest achieved the best performance with a mean R² of 0.6272 and an RMSE of approximately 117 million, indicating it generalizes well and handles the data’s non-linear patterns and categorical features effectively. GradientBoosting and AdaBoost followed with moderate R² scores of 0.3754 and 0.4314, respectively, but with higher RMSEs, suggesting they capture some useful patterns but may be limited by sensitivity to noise or shallow learners. Surprisingly, XGBoost, despite often being a top performer in many competitions, gave a negative R² score (-0.0514) and the highest RMSE (~165 million), implying it performed worse than a baseline mean predictor on cross-validation. This suggests overfitting, potential data leakage (e.g., target leakage or improper encoding), or misuse of high-cardinality features like userId or sessionId.

In [None]:
def rmse(y_true, y_pred):
    """Return the root mean squared error between `y_true` and `y_pred`."""
    squared_error = mean_squared_error(y_true, y_pred)
    return np.sqrt(squared_error)


# greater_is_better=False makes the scorer return the negated RMSE.
rmse_scorer = make_scorer(rmse, greater_is_better=False)

# Candidate regressors, compared in this order.
models = dict(
    XGBoost=XGBRegressor(
        objective='reg:squarederror',
        n_estimators=200,
        learning_rate=0.05,
        max_depth=6,
        random_state=42,
        tree_method='hist',
        verbosity=0,
    ),
    RandomForest=RandomForestRegressor(
        n_estimators=200,
        max_depth=10,
        n_jobs=-1,
        random_state=42,
    ),
    GradientBoosting=GradientBoostingRegressor(
        n_estimators=200,
        learning_rate=0.05,
        max_depth=6,
        random_state=42,
    ),
    AdaBoost=AdaBoostRegressor(
        n_estimators=200,
        learning_rate=0.05,
        random_state=42,
    ),
)

# Five folds split by `groups`, so no group appears in both the training and
# the validation part of a fold.
gkf = GroupKFold(n_splits=5)
cv_scoring = {'r2': 'r2', 'rmse': rmse_scorer}

print("Model Performance (Mean R² and RMSE using GroupKFold):")
for name, model in models.items():
    results = cross_validate(
        model,
        X,
        y,
        groups=groups,
        cv=gkf,
        scoring=cv_scoring,
        n_jobs=-1,
        return_train_score=False,
    )
    mean_r2 = results['test_r2'].mean()
    # Undo the sign flip applied by the scorer.
    mean_rmse = -results['test_rmse'].mean()
    print(f"{name}: Mean R² = {mean_r2:.4f}, RMSE = {mean_rmse:.4f}")


In [None]:
# Randomized hyper-parameter search for the random forest, validated with
# group-wise folds and scored by R².
rf = RandomForestRegressor(random_state=42, n_jobs=-1)
param_dist = {
    'n_estimators': [100, 300],
    'max_depth': [10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'max_features': ['sqrt']
}
gkf = GroupKFold(n_splits=5)
search = RandomizedSearchCV(
    rf,
    param_distributions=param_dist,
    n_iter=5,  # 5 of the 16 possible combinations are sampled
    cv=gkf,
    # The built-in 'r2' scorer is the same metric as make_scorer(r2_score); it
    # matches how the XGBoost search is configured and needs no extra import
    # in the middle of the notebook.
    scoring='r2',
    random_state=42,
    n_jobs=-1,
    verbose=1
)
search.fit(X, y, groups=groups)
# With the default refit=True, best_estimator_ has already been refitted on
# all of X, y using the best parameters.
best_rf = search.best_estimator_
print("Best Hyperparameters:", search.best_params_)
print("Best R² Score:", search.best_score_)

In [None]:
# Predict with the tuned random forest.
# `best_rf` is used instead of `search.best_estimator_` because `search` is
# reassigned by the XGBoost search further down. RandomizedSearchCV (default
# refit=True) has already refitted the estimator on all of X, y, so fitting it
# again would only repeat that work.
best_model = best_rf
preds = best_model.predict(X_test)
# Clamp negative predictions to 0.
preds = np.clip(preds, 0, None)

In [None]:
# Randomized hyper-parameter search for XGBoost, validated with group-wise
# folds and scored by R².
# The estimator is named `xgb_model` so it does not overwrite the `xgb` module
# alias created by `import xgboost as xgb`.
xgb_model = XGBRegressor(
    objective='reg:squarederror',
    n_jobs=-1,
    random_state=42,
    verbosity=0,
    tree_method='hist',
    grow_policy='lossguide',
)

param_dist = {
    'n_estimators': [500, 700, 1000],
    'learning_rate': [0.01, 0.03, 0.05],
    'max_depth': [6, 8, 10],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.7, 0.8, 0.9],
    'gamma': [0, 0.1, 0.3],
    'reg_alpha': [0, 0.01, 0.1],
    'reg_lambda': [1, 1.5, 2.0]
}

gkf = GroupKFold(n_splits=5)
search = RandomizedSearchCV(
    xgb_model,
    param_distributions=param_dist,
    scoring='r2',
    n_iter=30,
    cv=gkf,
    n_jobs=-1,
    verbose=0,
    random_state=42
)
search.fit(X, y, groups=groups)

print("Best Params:", search.best_params_)
print("Best R² Score:", search.best_score_)

# Prediction

In [None]:
# Predict with the best estimator of the most recent search.
# RandomizedSearchCV (default refit=True) has already refitted
# `best_estimator_` on all of X, y with the best parameters, so it is used
# directly; fitting it again would only repeat that training.
best_model = search.best_estimator_
preds = best_model.predict(X_test)
# Clamp negative predictions to 0.
preds = np.clip(preds, 0, None)

# Submission

In [None]:
# Fill the sample submission with the predictions and write it to disk.
sample_sub = pd.read_csv("/kaggle/input/engage-2-value-from-clicks-to-conversions/sample_submission.csv")
sample_sub['purchaseValue'] = preds
# Normalise the identifier header: an 'ID' column is renamed to 'id'; if there
# is no such column the rename leaves the frame unchanged.
sample_sub = sample_sub.rename(columns={'ID': 'id'})
sample_sub.to_csv("submission.csv", index=False)
print("submission.csv saved with shape:", sample_sub.shape)