In [1]:
import warnings
import datetime as dt
import numpy as np
import pandas as pd
import regex as re
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.exceptions import ConvergenceWarning
from sklearn.metrics import classification_report, f1_score
from xgboost import XGBRegressor
from lightgbm import LGBMClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from scipy.stats import randint as sp_randint, uniform as sp_uniform
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score

# Load datasets
train_df = pd.read_csv('/kaggle/input/engage-2-value-from-clicks-to-conversions/train_data.csv')
test_df = pd.read_csv('/kaggle/input/engage-2-value-from-clicks-to-conversions/test_data.csv')
subm_df = pd.read_csv('/kaggle/input/engage-2-value-from-clicks-to-conversions/sample_submission.csv')

train = train_df.loc[:, train_df.nunique() != 1]
test = test_df.loc[:, test_df.nunique() != 1]

# Drop duplicates
train = train.drop_duplicates()

train_clean = train.loc[:, train.isnull().mean() < 0.5]
test_clean = test.loc[:, test.isnull().mean() < 0.5]

# Drop unnecessary columns
cols_to_drop = [ 'trafficSource.campaign', 'geoNetwork.networkDomain', 'geoNetwork.region',
                'geoNetwork.city', 'geoNetwork.metro' ]
train.drop(cols_to_drop, axis=1, inplace=True)
test.drop(cols_to_drop, axis=1, inplace=True)

# Define column types for preprocessing
categorical_columns = ['browser', 'trafficSource.keyword', 'os', 'geoCluster', 
                       'trafficSource', 'trafficSource.medium', 'trafficSource.referralPath',
                       'deviceType', 'userChannel',  'geoNetwork.continent', 
                       'geoNetwork.subContinent', 'locationCountry']
numerical_columns = ['sessionId','sessionNumber', 'pageViews', 'totalHits', 'sessionStart', 'userId','gclIdPresent']
boolean_columns = ['device.isMobile']

# Create preprocessing pipelines for different feature types
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Handle missing values with most frequent
    ('encoder', OneHotEncoder(handle_unknown='ignore'))    # One-Hot Encoding for categorical features
])

numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),  # Handle missing values by replacing with median
    ('scaler', StandardScaler())  # Scale the numerical features
])

boolean_pipeline = Pipeline([
    ('encoder', OrdinalEncoder())  # Use OrdinalEncoder for boolean features
])

# Combine all pipelines into one ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('categorical', categorical_pipeline, categorical_columns),
        ('numerical', numerical_pipeline, numerical_columns),
        ('boolean', boolean_pipeline, boolean_columns)
    ])

# Define the model for regression (XGBRegressor) with best hyperparameters
model = XGBRegressor(
    use_label_encoder=False, 
    eval_metric='logloss', 
    random_state=42, 
    n_jobs=-1,
    n_estimators=766,
    max_depth=12,
    learning_rate=0.04,
    subsample=0.8823,
    colsample_bytree=0.7021,
    gamma=0.2306,
    min_child_weight=3,
    reg_alpha=0.4916,
    reg_lambda=1.2001,
    scale_pos_weight=5
)

# Create the full pipeline including preprocessing and modeling
full_pipeline = Pipeline([
    ('preprocessing', preprocessor),
    ('model', model)
])

# Prepare the training data
X_train = train.drop('purchaseValue', axis=1)
y_train = train['purchaseValue']

# Train the model
full_pipeline.fit(X_train, y_train)

# Prepare validation data
X = train_df.drop(columns=['purchaseValue'])
y = train_df['purchaseValue']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Evaluate the model's performance on the validation data
score = r2_score(y_val, full_pipeline.predict(X_val))
print("R-squared score on validation data:", score)

# Calculate the Mean Squared Error (MSE) for further evaluation
mse = mean_squared_error(y_val, full_pipeline.predict(X_val))
print("Mean Squared Error on validation data:", mse)

# Make predictions on the test data
X_test = test  # Test dataset without the target variable
test_preds = full_pipeline.predict(X_test)

# Ensure predictions are non-negative (if needed, depending on your problem)
test_preds = np.clip(test_preds, 0, None)

# Prepare the submission DataFrame
submission = pd.DataFrame({
    'id': subm_df['ID'],  # ID from the sample submission file
    'purchaseValue': test_preds  # The predicted purchase values
})

# Save the submission to a CSV file
submission.to_csv("submission.csv", index=False)

print("Submission file created successfully!")

R-squared score on validation data: 0.9802896272530597
Mean Squared Error on validation data: 931680178976162.0
Submission file created successfully!
