In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd

# Load the dataset
file_path = '/content/drive/MyDrive/EcommerceDAnalysis1/data_rfm_pcapped_3dt1.pkl'
data_rfm_pcapped_3dt1 = pd.read_pickle(file_path, )

# Inspect the data
# Percentage based capped data 0.1% & 99%
print(data_rfm_pcapped_3dt1.head())
print(data_rfm_pcapped_3dt1.info())
print(data_rfm_pcapped_3dt1.shape)
print(data_rfm_pcapped_3dt1.describe().T)
print(data_rfm_pcapped_3dt1.nunique())
print(data_rfm_pcapped_3dt1.isnull().sum())
print(data_rfm_pcapped_3dt1.duplicated().sum())
print(data_rfm_pcapped_3dt1[~data_rfm_pcapped_3dt1.isnull().any(axis=1) & ~data_rfm_pcapped_3dt1.duplicated()])
print((data_rfm_pcapped_3dt1['Quantity'] < 0).sum(), (data_rfm_pcapped_3dt1['UnitPrice'] < 0).sum())
print(data_rfm_pcapped_3dt1.columns)

  InvoiceNo StockCode                          Description  Quantity  \
0    536365    85123A   WHITE HANGING HEART T-LIGHT HOLDER         6   
1    536365     71053                  WHITE METAL LANTERN         6   
2    536365    84406B       CREAM CUPID HEARTS COAT HANGER         8   
3    536365    84029G  KNITTED UNION FLAG HOT WATER BOTTLE         6   
4    536365    84029E       RED WOOLLY HOTTIE WHITE HEART.         6   

          InvoiceDate  UnitPrice  CustomerID         Country  TotalPrice  \
0 2010-12-01 08:26:00       2.55       17850  United Kingdom       15.30   
1 2010-12-01 08:26:00       3.39       17850  United Kingdom       20.34   
2 2010-12-01 08:26:00       2.75       17850  United Kingdom       22.00   
3 2010-12-01 08:26:00       3.39       17850  United Kingdom       20.34   
4 2010-12-01 08:26:00       3.39       17850  United Kingdom       20.34   

   DayOfWeek  ...  Month  Year  recency  frequency  monetary  recency_score  \
0          2  ...     12  2010 

In [3]:
data_pcapped_3 = data_rfm_pcapped_3dt1.copy()

# Drop the specified columns
columns_to_drop = ['DayOfWeek', 'HourOfDay', 'Month', 'Year', 'recency',
                   'frequency', 'monetary', 'recency_score', 'frequency_score',
                   'monetary_score', 'RFM_Score', 'Segment']

data_pcapped_3 = data_pcapped_3.drop(columns=columns_to_drop)

# Check the result
print(data_pcapped_3.columns)


Index(['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'InvoiceDate',
       'UnitPrice', 'CustomerID', 'Country', 'TotalPrice'],
      dtype='object')


In [4]:
# Copy the data for invoice date feature extraction
data_invoice_date = data_pcapped_3.copy()

# Convert InvoiceDate to datetime
data_invoice_date['InvoiceDate'] = pd.to_datetime(data_invoice_date['InvoiceDate'], format='%m/%d/%Y %H:%M')

# Extract additional features from InvoiceDate
data_invoice_date['DayOfWeek'] = data_invoice_date['InvoiceDate'].dt.dayofweek
data_invoice_date['HourOfDay'] = data_invoice_date['InvoiceDate'].dt.hour
data_invoice_date['Month'] = data_invoice_date['InvoiceDate'].dt.month
data_invoice_date['Year'] = data_invoice_date['InvoiceDate'].dt.year

# Check the newly extracted features
print(data_invoice_date[['InvoiceDate', 'DayOfWeek', 'HourOfDay', 'Month', 'Year']].head())

          InvoiceDate  DayOfWeek  HourOfDay  Month  Year
0 2010-12-01 08:26:00          2          8     12  2010
1 2010-12-01 08:26:00          2          8     12  2010
2 2010-12-01 08:26:00          2          8     12  2010
3 2010-12-01 08:26:00          2          8     12  2010
4 2010-12-01 08:26:00          2          8     12  2010


In [5]:
print(data_invoice_date.head())
print(data_invoice_date.info())
print(data_invoice_date.shape)
print(data_invoice_date.describe().T)
print(data_invoice_date.nunique())
print(data_invoice_date.isnull().sum())
print(data_invoice_date.duplicated().sum())
print(data_invoice_date.columns)
print(data_invoice_date['TotalPrice'].describe().T)

  InvoiceNo StockCode                          Description  Quantity  \
0    536365    85123A   WHITE HANGING HEART T-LIGHT HOLDER         6   
1    536365     71053                  WHITE METAL LANTERN         6   
2    536365    84406B       CREAM CUPID HEARTS COAT HANGER         8   
3    536365    84029G  KNITTED UNION FLAG HOT WATER BOTTLE         6   
4    536365    84029E       RED WOOLLY HOTTIE WHITE HEART.         6   

          InvoiceDate  UnitPrice  CustomerID         Country  TotalPrice  \
0 2010-12-01 08:26:00       2.55       17850  United Kingdom       15.30   
1 2010-12-01 08:26:00       3.39       17850  United Kingdom       20.34   
2 2010-12-01 08:26:00       2.75       17850  United Kingdom       22.00   
3 2010-12-01 08:26:00       3.39       17850  United Kingdom       20.34   
4 2010-12-01 08:26:00       3.39       17850  United Kingdom       20.34   

   DayOfWeek  HourOfDay  Month  Year  
0          2          8     12  2010  
1          2          8     12  

In [6]:
data_invoice_date1 = data_invoice_date.copy()
# Define the threshold for high-value purchase
threshold = 20  # You can adjust this value
data_invoice_date1['HighValuePurchase'] = (data_invoice_date1['TotalPrice'] > threshold).astype(int)


In [7]:
print(data_invoice_date1.head())
print(data_invoice_date1.info())
print(data_invoice_date1.isnull().sum())
print(data_invoice_date1.nunique())
print(data_invoice_date1.columns)

  InvoiceNo StockCode                          Description  Quantity  \
0    536365    85123A   WHITE HANGING HEART T-LIGHT HOLDER         6   
1    536365     71053                  WHITE METAL LANTERN         6   
2    536365    84406B       CREAM CUPID HEARTS COAT HANGER         8   
3    536365    84029G  KNITTED UNION FLAG HOT WATER BOTTLE         6   
4    536365    84029E       RED WOOLLY HOTTIE WHITE HEART.         6   

          InvoiceDate  UnitPrice  CustomerID         Country  TotalPrice  \
0 2010-12-01 08:26:00       2.55       17850  United Kingdom       15.30   
1 2010-12-01 08:26:00       3.39       17850  United Kingdom       20.34   
2 2010-12-01 08:26:00       2.75       17850  United Kingdom       22.00   
3 2010-12-01 08:26:00       3.39       17850  United Kingdom       20.34   
4 2010-12-01 08:26:00       3.39       17850  United Kingdom       20.34   

   DayOfWeek  HourOfDay  Month  Year  HighValuePurchase  
0          2          8     12  2010                

In [8]:
import pandas as pd

# Step 1: Ensure the data is sorted by CustomerID and InvoiceDate
data_target_variable = data_invoice_date1.copy()  # Replace with your actual dataframe
data_target_variable = data_target_variable.sort_values(by=['CustomerID', 'InvoiceDate'])

# Step 2: Filter data for high-value purchases only
data_high_value = data_target_variable[data_target_variable['HighValuePurchase'] == 1]

# Step 3: Calculate Recency (days since last high-value purchase)
recency = data_high_value.groupby('CustomerID')['InvoiceDate'].max()  # Most recent high-value purchase
recency = (data_target_variable['InvoiceDate'].max() - recency).dt.days  # Days since last purchase

# Step 4: Calculate Frequency (number of high-value purchases made by the customer)
frequency = data_high_value.groupby('CustomerID').size()

# Step 5: Calculate Monetary (total spent on high-value purchases by the customer)
monetary = data_high_value.groupby('CustomerID')['TotalPrice'].sum()

# Step 6: Merge Recency, Frequency, and Monetary into a single DataFrame
rfm = pd.DataFrame({
    'Recency': recency,
    'Frequency': frequency,
    'Monetary': monetary
})

# Step 7: Reset index for ease of use
rfm.reset_index(inplace=True)

# Display the RFM table
print(rfm.head())
print(data_target_variable.head())
print(data_target_variable.info())
print(data_target_variable.isnull().sum())
print(data_target_variable.nunique())

   CustomerID  Recency  Frequency  Monetary
0       12347        1         65   2543.83
1       12348       74         22   1367.44
2       12349       18         18    627.97
3       12350      309          5    121.20
4       12352       35          8    232.60
      InvoiceNo StockCode                          Description  Quantity  \
14329    537626     85116      BLACK CANDELABRA T-LIGHT HOLDER        12   
14330    537626     22375    AIRLINE BAG VINTAGE JET SET BROWN         4   
14331    537626     71477    COLOUR GLASS. STAR T-LIGHT HOLDER        12   
14332    537626     22492              MINI PAINT SET VINTAGE         36   
14333    537626     22771  CLEAR DRAWER KNOB ACRYLIC EDWARDIAN        12   

              InvoiceDate  UnitPrice  CustomerID  Country  TotalPrice  \
14329 2010-12-07 14:57:00       2.10       12347  Iceland        25.2   
14330 2010-12-07 14:57:00       4.25       12347  Iceland        17.0   
14331 2010-12-07 14:57:00       3.25       12347  Iceland   

In [9]:
# Step 1: Ensure the data is sorted by CustomerID and InvoiceDate
data_target_variable1 = data_target_variable.copy()  # Replace with your actual dataframe
data_target_variable1 = data_target_variable1.sort_values(by=['CustomerID', 'InvoiceDate'])

# Step 2: Filter data for high-value purchases only
data_high_value = data_target_variable1[data_target_variable1['HighValuePurchase'] == 1].copy()  # Create a copy to avoid view issues
print(f"High-value data size after filtering: {data_high_value.shape}")

# Step 3: Ensure no duplicates in high-value data (in case there are duplicate rows for the same purchase)
data_high_value = data_high_value.drop_duplicates(subset=['CustomerID', 'InvoiceDate'])
print(f"High-value data size after removing duplicates: {data_high_value.shape}")

# Step 4: Shift the 'InvoiceDate' to create 'Next_HighValue_Purchase_Date' for each customer
data_high_value.loc[:, 'Next_HighValue_Purchase_Date'] = data_high_value.groupby('CustomerID')['InvoiceDate'].shift(-1)

# Step 5: Fill NaT values in 'Next_HighValue_Purchase_Date' (i.e., no next high-value purchase)
data_high_value.loc[:, 'Next_HighValue_Purchase_Date'] = data_high_value['Next_HighValue_Purchase_Date'].fillna(pd.Timestamp('2100-12-31'))  # Use a far future date

# Step 6: Calculate 'HighValuePurchase_Behavior' based on whether the next high-value purchase date is within 30 days
data_high_value.loc[:, 'HighValuePurchase_Behavior'] = data_high_value.apply(
    lambda row: 1 if ((row['Next_HighValue_Purchase_Date'] - row['InvoiceDate']).days <= 30)
               else 0, axis=1)

# Step 7: Merge back to the original data (optional)
# This will allow us to have the 'HighValuePurchase_Behavior' for each customer, whether they made a high-value purchase or not.
data_target_variable1 = data_target_variable1.merge(
    data_high_value[['CustomerID', 'InvoiceDate', 'HighValuePurchase_Behavior']],
    on=['CustomerID', 'InvoiceDate'],
    how='left'
)

# Step 8: Check the result size and ensure no increase in data size
print(f"Data size after merge: {data_target_variable1.shape}")

# Step 9: Fill missing values in the 'HighValuePurchase_Behavior' column with 0 for customers who didn't make any high-value purchase
data_target_variable1.loc[:, 'HighValuePurchase_Behavior'] = data_target_variable1['HighValuePurchase_Behavior'].fillna(0)

# Step 10: Display the final result with relevant columns
print(f"First few rows of the final data:")
print(data_target_variable1[['CustomerID', 'InvoiceDate', 'HighValuePurchase_Behavior']].head())
print(data_target_variable1.head())
print(data_target_variable1.info())
print(data_target_variable1.isnull().sum())
print(data_target_variable1.nunique())


High-value data size after filtering: (98522, 14)
High-value data size after removing duplicates: (28885, 14)
Data size after merge: (522695, 15)
First few rows of the final data:
   CustomerID         InvoiceDate  HighValuePurchase_Behavior
0       12347 2010-12-07 14:57:00                         0.0
1       12347 2010-12-07 14:57:00                         0.0
2       12347 2010-12-07 14:57:00                         0.0
3       12347 2010-12-07 14:57:00                         0.0
4       12347 2010-12-07 14:57:00                         0.0
  InvoiceNo StockCode                          Description  Quantity  \
0    537626     85116      BLACK CANDELABRA T-LIGHT HOLDER        12   
1    537626     22375    AIRLINE BAG VINTAGE JET SET BROWN         4   
2    537626     71477    COLOUR GLASS. STAR T-LIGHT HOLDER        12   
3    537626     22492              MINI PAINT SET VINTAGE         36   
4    537626     22771  CLEAR DRAWER KNOB ACRYLIC EDWARDIAN        12   

          Invoi

In [10]:
# Step 1: Merge the RFM features (Recency, Frequency, Monetary) with the target variable 'HighValuePurchase_Behavior'
final_data = data_target_variable1.merge(rfm, on='CustomerID', how='left')

# Step 2: Clean the data by filling any missing values or correcting data issues
final_data.fillna(0, inplace=True)  # Fill missing values for Recency, Frequency, and Monetary with 0 (or a suitable value)

# Step 3: Display the final result with relevant columns
print(final_data[['CustomerID', 'InvoiceDate', 'HighValuePurchase_Behavior', 'Recency', 'Frequency', 'Monetary']].head())
print(final_data.head())
print(final_data.info())
print(final_data.isnull().sum())
print(final_data.nunique())

   CustomerID         InvoiceDate  HighValuePurchase_Behavior  Recency  \
0       12347 2010-12-07 14:57:00                         0.0      1.0   
1       12347 2010-12-07 14:57:00                         0.0      1.0   
2       12347 2010-12-07 14:57:00                         0.0      1.0   
3       12347 2010-12-07 14:57:00                         0.0      1.0   
4       12347 2010-12-07 14:57:00                         0.0      1.0   

   Frequency  Monetary  
0       65.0   2543.83  
1       65.0   2543.83  
2       65.0   2543.83  
3       65.0   2543.83  
4       65.0   2543.83  
  InvoiceNo StockCode                          Description  Quantity  \
0    537626     85116      BLACK CANDELABRA T-LIGHT HOLDER        12   
1    537626     22375    AIRLINE BAG VINTAGE JET SET BROWN         4   
2    537626     71477    COLOUR GLASS. STAR T-LIGHT HOLDER        12   
3    537626     22492              MINI PAINT SET VINTAGE         36   
4    537626     22771  CLEAR DRAWER KNOB ACRY

In [11]:
final_data1 = final_data.copy()

final_data1 = final_data1.drop(columns=['InvoiceNo', 'StockCode', 'Description', 'InvoiceDate', 'Country'])

print(final_data1.head())
print(final_data1.info())
print(final_data1.columns)

   Quantity  UnitPrice  CustomerID  TotalPrice  DayOfWeek  HourOfDay  Month  \
0        12       2.10       12347        25.2          1         14     12   
1         4       4.25       12347        17.0          1         14     12   
2        12       3.25       12347        39.0          1         14     12   
3        36       0.65       12347        23.4          1         14     12   
4        12       1.25       12347        15.0          1         14     12   

   Year  HighValuePurchase  HighValuePurchase_Behavior  Recency  Frequency  \
0  2010                  1                         0.0      1.0       65.0   
1  2010                  0                         0.0      1.0       65.0   
2  2010                  1                         0.0      1.0       65.0   
3  2010                  1                         0.0      1.0       65.0   
4  2010                  0                         0.0      1.0       65.0   

   Monetary  
0   2543.83  
1   2543.83  
2   2543.83  


In [12]:
from sklearn.preprocessing import StandardScaler

data_standardised = final_data1.copy()
# List of numerical features to standardize (continuous variables)
numerical_columns = ['Recency', 'Frequency', 'Monetary', 'Quantity', 'UnitPrice', 'TotalPrice']

# Initialize the StandardScaler
scaler = StandardScaler()

# Fit and transform the numerical columns
data_standardised[numerical_columns] = scaler.fit_transform(data_standardised[numerical_columns])

# Display the first few rows of the standardized data
print(data_standardised.head())
print(data_standardised.info())
print(data_standardised.columns)

   Quantity  UnitPrice  CustomerID  TotalPrice  DayOfWeek  HourOfDay  Month  \
0  0.204503  -0.348438       12347    0.327834          1         14     12   
1 -0.317069   0.340829       12347    0.014026          1         14     12   
2  0.204503   0.020239       12347    0.855949          1         14     12   
3  1.769220  -0.813293       12347    0.258949          1         14     12   
4  0.204503  -0.620939       12347   -0.062512          1         14     12   

   Year  HighValuePurchase  HighValuePurchase_Behavior  Recency  Frequency  \
0  2010                  1                         0.0 -0.55869  -0.091642   
1  2010                  0                         0.0 -0.55869  -0.091642   
2  2010                  1                         0.0 -0.55869  -0.091642   
3  2010                  1                         0.0 -0.55869  -0.091642   
4  2010                  0                         0.0 -0.55869  -0.091642   

   Monetary  
0 -0.137618  
1 -0.137618  
2 -0.137618  


In [13]:
from sklearn.model_selection import train_test_split

# Step 1: Define the feature set (X) and target variable (y)
X = data_standardised.drop(columns=['HighValuePurchase_Behavior'])  # Features
y = data_standardised['HighValuePurchase_Behavior']  # Target

# Step 2: Split the data into training and testing sets (80% for training, 20% for testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Check the shape of the splits to verify
print(f"Training set size: X_train: {X_train.shape}, y_train: {y_train.shape}")
print(f"Test set size: X_test: {X_test.shape}, y_test: {y_test.shape}")


Training set size: X_train: (418156, 12), y_train: (418156,)
Test set size: X_test: (104539, 12), y_test: (104539,)


In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.preprocessing import StandardScaler

# Step 3: Create and train Logistic Regression model
logreg_model = LogisticRegression(max_iter=500,random_state=42)

# Fit the model
logreg_model.fit(X_train, y_train)

# Step 4: Make predictions on the test set
y_pred = logreg_model.predict(X_test)

# Step 5: Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, logreg_model.predict_proba(X_test)[:, 1])  # For ROC-AUC, we need probabilities

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Output the evaluation metrics
print(f"Logistic Regression Model Evaluation:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")
print(f"ROC-AUC: {roc_auc:.4f}")
print(f"Confusion Matrix:\n{cm}")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression Model Evaluation:
Accuracy: 0.7573
Precision: 0.7616
Recall: 0.3497
F1-Score: 0.4793
ROC-AUC: 0.7943
Confusion Matrix:
[[67491  3655]
 [21715 11678]]


In [15]:
from sklearn.ensemble import RandomForestClassifier

# Step 1: Create and train Random Forest model
rf_model = RandomForestClassifier(random_state=42)

# Fit the model
rf_model.fit(X_train, y_train)

# Step 2: Make predictions on the test set
y_pred_rf = rf_model.predict(X_test)

# Step 3: Evaluate the Random Forest model
accuracy_rf = accuracy_score(y_test, y_pred_rf)
precision_rf = precision_score(y_test, y_pred_rf)
recall_rf = recall_score(y_test, y_pred_rf)
f1_rf = f1_score(y_test, y_pred_rf)
roc_auc_rf = roc_auc_score(y_test, rf_model.predict_proba(X_test)[:, 1])  # For ROC-AUC, we need probabilities

# Confusion matrix
cm_rf = confusion_matrix(y_test, y_pred_rf)

# Output the evaluation metrics
print(f"Random Forest Model Evaluation:")
print(f"Accuracy: {accuracy_rf:.4f}")
print(f"Precision: {precision_rf:.4f}")
print(f"Recall: {recall_rf:.4f}")
print(f"F1-Score: {f1_rf:.4f}")
print(f"ROC-AUC: {roc_auc_rf:.4f}")
print(f"Confusion Matrix:\n{cm_rf}")

Random Forest Model Evaluation:
Accuracy: 0.9772
Precision: 0.9729
Recall: 0.9553
F1-Score: 0.9641
ROC-AUC: 0.9959
Confusion Matrix:
[[70258   888]
 [ 1491 31902]]


In [None]:
!pip install xgboost



In [None]:
import xgboost as xgb

# Step 1: Create and train XGBoost model
xgb_model = xgb.XGBClassifier(random_state=42)

# Fit the model
xgb_model.fit(X_train, y_train)

# Step 2: Make predictions on the test set
y_pred_xgb = xgb_model.predict(X_test)

# Step 3: Evaluate the XGBoost model
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
precision_xgb = precision_score(y_test, y_pred_xgb)
recall_xgb = recall_score(y_test, y_pred_xgb)
f1_xgb = f1_score(y_test, y_pred_xgb)
roc_auc_xgb = roc_auc_score(y_test, xgb_model.predict_proba(X_test)[:, 1])  # For ROC-AUC, we need probabilities

# Confusion matrix
cm_xgb = confusion_matrix(y_test, y_pred_xgb)

# Output the evaluation metrics
print(f"XGBoost Model Evaluation:")
print(f"Accuracy: {accuracy_xgb:.4f}")
print(f"Precision: {precision_xgb:.4f}")
print(f"Recall: {recall_xgb:.4f}")
print(f"F1-Score: {f1_xgb:.4f}")
print(f"ROC-AUC: {roc_auc_xgb:.4f}")
print(f"Confusion Matrix:\n{cm_xgb}")

XGBoost Model Evaluation:
Accuracy: 0.8956
Precision: 0.8705
Recall: 0.7907
F1-Score: 0.8287
ROC-AUC: 0.9585
Confusion Matrix:
[[67218  3928]
 [ 6988 26405]]


In [None]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.7-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp311-cp311-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [None]:
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# Step 1: Initialize CatBoostClassifier
catboost_model = CatBoostClassifier(iterations=500, learning_rate=0.1, depth=6, cat_features=[], random_seed=42, verbose=100)

# Step 2: Train the model
catboost_model.fit(X_train, y_train)

# Step 3: Make predictions on the test set
y_pred_catboost = catboost_model.predict(X_test)

# Step 4: Evaluate the model
accuracy_catboost = accuracy_score(y_test, y_pred_catboost)
precision_catboost = precision_score(y_test, y_pred_catboost)
recall_catboost = recall_score(y_test, y_pred_catboost)
f1_catboost = f1_score(y_test, y_pred_catboost)
roc_auc_catboost = roc_auc_score(y_test, catboost_model.predict_proba(X_test)[:, 1])  # For ROC-AUC, we need probabilities

# Confusion matrix
cm_catboost = confusion_matrix(y_test, y_pred_catboost)

# Output the evaluation metrics
print(f"CatBoost Model Evaluation:")
print(f"Accuracy: {accuracy_catboost:.4f}")
print(f"Precision: {precision_catboost:.4f}")
print(f"Recall: {recall_catboost:.4f}")
print(f"F1-Score: {f1_catboost:.4f}")
print(f"ROC-AUC: {roc_auc_catboost:.4f}")
print(f"Confusion Matrix:\n{cm_catboost}")

0:	learn: 0.6446892	total: 149ms	remaining: 1m 14s
100:	learn: 0.3876239	total: 11.9s	remaining: 46.8s
200:	learn: 0.3576328	total: 40.6s	remaining: 1m
300:	learn: 0.3346303	total: 1m 4s	remaining: 42.9s
400:	learn: 0.3158218	total: 1m 15s	remaining: 18.7s
499:	learn: 0.2999372	total: 1m 26s	remaining: 0us
CatBoost Model Evaluation:
Accuracy: 0.8706
Precision: 0.8534
Recall: 0.7183
F1-Score: 0.7801
ROC-AUC: 0.9403
Confusion Matrix:
[[67027  4119]
 [ 9406 23987]]


In [16]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
import numpy as np

# Define the Random Forest model
rf_model = RandomForestClassifier(random_state=42)

# Perform cross-validation (5-folds) for Random Forest
rf_cv_scores = cross_val_score(rf_model, X, y, cv=5, scoring='accuracy')  # You can change the scoring as needed

# Output the cross-validation results
print("Random Forest Model Cross-Validation Results:")
print(f"Accuracy for each fold: {rf_cv_scores}")
print(f"Mean accuracy: {np.mean(rf_cv_scores):.4f}")
print(f"Standard deviation: {np.std(rf_cv_scores):.4f}")


Random Forest Model Cross-Validation Results:
Accuracy for each fold: [0.74697481 0.76506376 0.74284238 0.71195439 0.74425812]
Mean accuracy: 0.7422
Standard deviation: 0.0171


In [None]:
import xgboost as xgb
from sklearn.model_selection import cross_val_score
import numpy as np

# Define the XGBoost model
xgb_model = xgb.XGBClassifier(random_state=42)

# Perform cross-validation (5-folds) for XGBoost
xgb_cv_scores = cross_val_score(xgb_model, X, y, cv=5, scoring='accuracy')  # You can change the scoring as needed

# Output the cross-validation results
print("XGBoost Model Cross-Validation Results:")
print(f"Accuracy for each fold: {xgb_cv_scores}")
print(f"Mean accuracy: {np.mean(xgb_cv_scores):.4f}")
print(f"Standard deviation: {np.std(xgb_cv_scores):.4f}")


XGBoost Model Cross-Validation Results:
Accuracy for each fold: [0.69676389 0.57297277 0.55089488 0.72644659 0.72568132]
Mean accuracy: 0.6546
Standard deviation: 0.0767


In [17]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
import numpy as np

# Define the parameter grid for RandomizedSearchCV
param_dist_rf = {
    'n_estimators': [100, 200, 300],  # Number of trees in the forest
    'max_depth': [None, 10, 20, 30],  # Maximum depth of trees
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4],  # Minimum number of samples required to be at a leaf node
    'max_features': ['sqrt', 'log2', None],  # Fixed value to avoid 'auto'
    'bootstrap': [True, False]  # Whether bootstrap samples are used when building trees
}

# Initialize Random Forest Classifier
rf_model = RandomForestClassifier(random_state=42)

# Perform RandomizedSearchCV with 3-fold cross-validation and fewer iterations for faster performance
random_search_rf = RandomizedSearchCV(estimator=rf_model, param_distributions=param_dist_rf,
                                      n_iter=5, scoring='accuracy', cv=3, random_state=42, n_jobs=-1)

# Fit the model to the training data
random_search_rf.fit(X_train, y_train)

# Best hyperparameters found
best_params_rf = random_search_rf.best_params_
print(f"Best hyperparameters for Random Forest: {best_params_rf}")

# Train the optimized model using the best parameters
best_rf_model = RandomForestClassifier(**best_params_rf, random_state=42)
best_rf_model.fit(X_train, y_train)

# Step 5: Make predictions on the test set
y_pred_optimized_rf = best_rf_model.predict(X_test)

# Step 6: Evaluate the optimized model
accuracy_rf = accuracy_score(y_test, y_pred_optimized_rf)
precision_rf = precision_score(y_test, y_pred_optimized_rf, zero_division=0)  # Added zero_division for safety
recall_rf = recall_score(y_test, y_pred_optimized_rf, zero_division=0)
f1_rf = f1_score(y_test, y_pred_optimized_rf, zero_division=0)
roc_auc_rf = roc_auc_score(y_test, best_rf_model.predict_proba(X_test)[:, 1])  # For ROC-AUC, we need probabilities

# Confusion matrix
cm_rf = confusion_matrix(y_test, y_pred_optimized_rf)

# Output the evaluation metrics
print(f"Optimized Random Forest Model Evaluation:")
print(f"Accuracy: {accuracy_rf:.4f}")
print(f"Precision: {precision_rf:.4f}")
print(f"Recall: {recall_rf:.4f}")
print(f"F1-Score: {f1_rf:.4f}")
print(f"ROC-AUC: {roc_auc_rf:.4f}")
print(f"Confusion Matrix:\n{cm_rf}")




Best hyperparameters for Random Forest: {'n_estimators': 300, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': None, 'max_depth': None, 'bootstrap': True}
Optimized Random Forest Model Evaluation:
Accuracy: 0.9777
Precision: 0.9691
Recall: 0.9608
F1-Score: 0.9649
ROC-AUC: 0.9966
Confusion Matrix:
[[70124  1022]
 [ 1310 32083]]


In [18]:
# Step 1: Make predictions on the test set using the optimized Random Forest model
y_pred_rf_test = best_rf_model.predict(X_test)

# Step 2: Evaluate the optimized model on the test set
accuracy_rf_test = accuracy_score(y_test, y_pred_rf_test)
precision_rf_test = precision_score(y_test, y_pred_rf_test)
recall_rf_test = recall_score(y_test, y_pred_rf_test)
f1_rf_test = f1_score(y_test, y_pred_rf_test)
roc_auc_rf_test = roc_auc_score(y_test, best_rf_model.predict_proba(X_test)[:, 1])  # For ROC-AUC, we need probabilities

# Confusion matrix for the test set
cm_rf_test = confusion_matrix(y_test, y_pred_rf_test)

# Output the evaluation metrics
print(f"Optimized Random Forest Model Evaluation on Test Data:")
print(f"Accuracy: {accuracy_rf_test:.4f}")
print(f"Precision: {precision_rf_test:.4f}")
print(f"Recall: {recall_rf_test:.4f}")
print(f"F1-Score: {f1_rf_test:.4f}")
print(f"ROC-AUC: {roc_auc_rf_test:.4f}")
print(f"Confusion Matrix:\n{cm_rf_test}")


Optimized Random Forest Model Evaluation on Test Data:
Accuracy: 0.9777
Precision: 0.9691
Recall: 0.9608
F1-Score: 0.9649
ROC-AUC: 0.9966
Confusion Matrix:
[[70124  1022]
 [ 1310 32083]]


In [19]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
import numpy as np

# Define the parameter grid for RandomizedSearchCV
param_dist_rf = {
    'n_estimators': [100, 200, 300, 400, 500],  # Number of trees
    'max_depth': [10, 20, 30, None],  # Depth of the tree
    'min_samples_split': [2, 5, 10],  # Minimum samples required to split a node
    'min_samples_leaf': [1, 2, 4],  # Minimum samples required at leaf node
    'max_features': ['sqrt', 'log2', None],  # Number of features to consider for best split
    'bootstrap': [True, False]  # Whether bootstrap samples are used when building trees
}

# Initialize Random Forest Classifier
rf_model = RandomForestClassifier(random_state=42)

# Set up RandomizedSearchCV
random_search_rf = RandomizedSearchCV(estimator=rf_model, param_distributions=param_dist_rf,
                                      n_iter=10, scoring='roc_auc', cv=5, n_jobs=-1, verbose=2, random_state=42)

# Fit the model to the training data
random_search_rf.fit(X_train, y_train)

# Best hyperparameters found by RandomizedSearchCV
best_params_rf = random_search_rf.best_params_
print(f"Best hyperparameters for Random Forest after fine-tuning: {best_params_rf}")

# Train the optimized Random Forest model using the best parameters
best_rf_model = RandomForestClassifier(**best_params_rf, random_state=42)
best_rf_model.fit(X_train, y_train)

# Step 5: Make predictions on the test set
y_pred_optimized_rf = best_rf_model.predict(X_test)

# Step 6: Evaluate the optimized model
accuracy_rf = accuracy_score(y_test, y_pred_optimized_rf)
precision_rf = precision_score(y_test, y_pred_optimized_rf)
recall_rf = recall_score(y_test, y_pred_optimized_rf)
f1_rf = f1_score(y_test, y_pred_optimized_rf)
roc_auc_rf = roc_auc_score(y_test, best_rf_model.predict_proba(X_test)[:, 1])  # For ROC-AUC, we need probabilities

# Confusion matrix
cm_rf = confusion_matrix(y_test, y_pred_optimized_rf)

# Output the evaluation metrics
print(f"Optimized Random Forest Model Evaluation: (Fine-Tuned)")
print(f"Accuracy: {accuracy_rf:.4f}")
print(f"Precision: {precision_rf:.4f}")
print(f"Recall: {recall_rf:.4f}")
print(f"F1-Score: {f1_rf:.4f}")
print(f"ROC-AUC: {roc_auc_rf:.4f}")
print(f"Confusion Matrix:\n{cm_rf}")


Fitting 5 folds for each of 10 candidates, totalling 50 fits




KeyboardInterrupt: 

In [20]:
import joblib

# Define the path where you want to save the model in Google Drive
model_path = '/content/drive/MyDrive/ECommerce Analysis/RF_HighValuePurchases.pkl'

# Save the model using joblib
joblib.dump(best_rf_model, model_path)

print(f"Model saved to {model_path}")


Model saved to /content/drive/MyDrive/ECommerce Analysis/RF_HighValuePurchases.pkl
