<a href="https://colab.research.google.com/github/21f1002963/Colab/blob/main/MLProject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
import pandas as pd
import numpy as np

In [4]:
train_data=pd.read_csv("train_data.csv")

In [5]:
train = train_data.copy()
train['purchaseFlag'] = (train['purchaseValue'] > 0).astype(int)
y_class = train['purchaseFlag']
y_reg = train['purchaseValue']
train = train.drop(columns=['purchaseValue', 'purchaseFlag'])

In [6]:
train = train.dropna(subset=['pageViews'])
y_class = y_class.loc[train.index]
y_reg = y_reg.loc[train.index]

In [10]:
from sklearn.base import BaseEstimator, TransformerMixin

class CustomPreprocessor(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.cols_to_drop = [
            'trafficSource.adContent',
            'trafficSource.adwordsClickInfo.adNetworkType',
            'trafficSource.adwordsClickInfo.isVideoAd',
            'trafficSource.adwordsClickInfo.page',
            'trafficSource.adwordsClickInfo.slot',
            'device.screenResolution',
            'trafficSource.keyword',
            'screenSize',
            'device.mobileDeviceBranding',
            'device.mobileInputSelector',
            'userId',
            'trafficSource.campaign',
            'device.mobileDeviceMarketingName',
            'device.operatingSystemVersion',
            'device.flashVersion',
            'totals.visits',
            'geoNetwork.networkLocation',
            'browserMajor',
            'device.browserSize',
            'socialEngagementType',
            'locationZone',
            'device.mobileDeviceModel',
            'device.language',
            'device.browserVersion',
            'device.screenColors',
        ]

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()

        # Drop noisy columns
        X.drop(columns=[col for col in self.cols_to_drop if col in X.columns], inplace=True, errors='ignore')

        # Basic type fixes
        X['trafficSource.isTrueDirect'] = X['trafficSource.isTrueDirect'].fillna(False).astype(int)
        X['device.isMobile'] = X['device.isMobile'].astype(int)

        def bucket_referral(path):
            path = str(path)
            if path == '/':
                return 'direct_landing'
            elif 'offer' in path or 'deal' in path or 'discount' in path:
                return 'known_referrer'
            elif 'mail' in path:
                return 'known_referrer'
            elif 'google-merchandise-store' in path:
                return 'known_referrer'
            elif '/yt/' in path:
                return 'known_referrer'
            elif '/a/google.com/' in path:
                return 'known_referrer'
            elif 'redirect' in path or 'l.php' in path or '/url' in path:
                return 'known_referrer'
            elif path == 'no_referral' or path.strip() == '':
                return 'unknown'
            else:
                return 'unknown'

        X['trafficSource.referralPath'] = X['trafficSource.referralPath'].fillna('no_referral').apply(bucket_referral)
        X['totals.bounces'] = X['totals.bounces'].fillna(2).astype(float)

        X['new_visits'] = X['new_visits'].fillna(0)

        X['geoNetwork.city'] = X['geoNetwork.city'].replace({'not available in demo dataset': 'Unknown','(not set)': 'Not_Set'})
        top_cities = X['geoNetwork.city'].value_counts().head(70).index.tolist()
        X['geoNetwork.city'] = X['geoNetwork.city'].apply(lambda x: x if x in top_cities else 'Other')

        X['geoNetwork.metro'] = X['geoNetwork.metro'].replace({'not available in demo dataset': 'Unknown','(not set)': 'Not_Set'})
        top_metros = X['geoNetwork.metro'].value_counts().head(40).index.tolist()
        X['geoNetwork.metro'] = X['geoNetwork.metro'].apply(lambda x: x if x in top_metros else 'Other')

        # Log transform
        def session_bucket(n):
            if n == 1:
                return 'first_time'
            elif n <= 3:
                return 'return_early'
            elif n <= 7:
                return 'return_mid'
            elif n <= 15:
                return 'return_late'
            else:
                return 'loyal_or_power'
        X['session_bucket'] = X['sessionNumber'].apply(session_bucket)
        X['sessionNumber'] = np.log1p(X['sessionNumber'])

        # Time features
        if 'sessionStart' in X.columns:
            dt = pd.to_datetime(X['sessionStart'], unit='s')
            X['hour'] = dt.dt.hour
            X['min'] = dt.dt.minute

        if 'date' in X.columns:
            X['date'] = pd.to_datetime(X['date'], format='%Y%m%d')
            X['day'] = X['date'].dt.day
            X['month'] = X['date'].dt.month
            X['year'] = X['date'].dt.year
            X['quarter'] = X['date'].dt.quarter
            X['semester'] = np.where(X['quarter'].isin([1, 2]), 1, 2)
            X['day_of_week'] = X['date'].dt.dayofweek
            X['is_weekend'] = X['day_of_week'].isin([5, 6]).astype(int)
            X.drop(['date'], axis=1, inplace=True)

        X.drop(['sessionStart'], axis=1, errors='ignore', inplace=True)

        return X

In [14]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingRegressor
# from category_encoders import TargetEncoder # Assuming TargetEncoder is used and needs importing

# high_card_cols = ['geoNetwork.region', 'trafficSource.referralPath', 'userChannel', 'locationCountry', 'trafficSource.medium', 'geoNetwork.subContinent', 'trafficSource', 'geoNetwork.metro', 'geoNetwork.city']

low_card_cols = ['geoNetwork.networkDomain', 'geoNetwork.continent', 'deviceType', 'geoCluster', 'os', 'browser', 'geoNetwork.region', 'trafficSource.referralPath', 'userChannel', 'locationCountry', 'trafficSource.medium', 'geoNetwork.subContinent', 'trafficSource', 'geoNetwork.metro', 'geoNetwork.city', 'session_bucket' ]

# Set up column transformer
preprocessor = ColumnTransformer(transformers=[
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False, drop='first'), low_card_cols)
], remainder='passthrough')  # Keeps other features (numeric, target-encoded)

final_preprocessor = Pipeline([
    ('onehot_and_pass', preprocessor),  # your ColumnTransformer
])

# Assuming CustomPreprocessor is a custom class defined elsewhere and doesn't need importing
# Assuming TargetEncoderWithKFold is a custom class defined elsewhere and doesn't need importing


# Full pipeline
clf_pipeline = Pipeline([
    ('custom_cleaning', CustomPreprocessor()),
    # ('target_encoding', TargetEncoderWithKFold(cols=high_card_cols, log_target=True)),
    ('ohe_and_rest', final_preprocessor),
])

# Train
X_train_processed = clf_pipeline.fit_transform(train, y_class)
# print(X_train_processed.info())

# Grid Search CV
n_estimators = [20,40, 60, 80, 100,120]
max_features = [0.2, 0.4, 0.6, 0.8, 1.0]
max_depth = [2,8,None]
max_samples = [0.25, 0.5,0.75,1.0]
param_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
              'max_samples':max_samples,
              'max_depth': max_depth
             }

# rf_random = RandomForestClassifier(class_weight='balanced', random_state=42)
rf_random = RandomizedSearchCV(
    estimator=RandomForestClassifier(),
    param_distributions=param_grid,
    n_iter=30,          # only 30 random combos
    cv=5,
    verbose=2,
    n_jobs=-1,
    random_state=42
)
rf_random.fit(X_train_processed, y_class)
rf_random.best_params_
rf_random.best_score_

# X_test_processed = clf_pipeline.transform(test_data)
# print(X_test_processed.info())
# purchase_flags = rf_random.predict(X_test_processed)

# mask = y_class == 1
# train_reg = train.loc[mask]
# y_reg_filtered = y_reg.loc[mask]


# reg_pipeline = Pipeline([
#     ('custom_cleaning', CustomPreprocessor()),
#     # ('target_encoding', TargetEncoderWithKFold(cols=high_card_cols, log_target=True)),
#     ('ohe_and_rest', final_preprocessor),
#     ('reg_model', GradientBoostingRegressor())
# ])
# reg_pipeline.fit(train_reg, y_reg_filtered)

  X['trafficSource.isTrueDirect'] = X['trafficSource.isTrueDirect'].fillna(False).astype(int)


Fitting 5 folds for each of 30 candidates, totalling 150 fits


np.float64(0.9639701762703099)

In [15]:
rf_random.best_params_


{'n_estimators': 80,
 'max_samples': 1.0,
 'max_features': 0.4,
 'max_depth': None}