In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd

# Load the dataset
file_path = '/content/drive/MyDrive/EcommerceDAnalysis1/data_rfm_pcapped_3dt1.pkl'
data_rfm_pcapped_3dt1 = pd.read_pickle(file_path, )

# Inspect the data
# Percentage based capped data 0.1% & 99%
print(data_rfm_pcapped_3dt1.head())
print(data_rfm_pcapped_3dt1.info())
print(data_rfm_pcapped_3dt1.shape)
print(data_rfm_pcapped_3dt1.describe().T)
print(data_rfm_pcapped_3dt1.nunique())
print(data_rfm_pcapped_3dt1.isnull().sum())
print(data_rfm_pcapped_3dt1.duplicated().sum())
print(data_rfm_pcapped_3dt1[~data_rfm_pcapped_3dt1.isnull().any(axis=1) & ~data_rfm_pcapped_3dt1.duplicated()])
print((data_rfm_pcapped_3dt1['Quantity'] < 0).sum(), (data_rfm_pcapped_3dt1['UnitPrice'] < 0).sum())
print(data_rfm_pcapped_3dt1.columns)

  InvoiceNo StockCode                          Description  Quantity  \
0    536365    85123A   WHITE HANGING HEART T-LIGHT HOLDER         6   
1    536365     71053                  WHITE METAL LANTERN         6   
2    536365    84406B       CREAM CUPID HEARTS COAT HANGER         8   
3    536365    84029G  KNITTED UNION FLAG HOT WATER BOTTLE         6   
4    536365    84029E       RED WOOLLY HOTTIE WHITE HEART.         6   

          InvoiceDate  UnitPrice  CustomerID         Country  TotalPrice  \
0 2010-12-01 08:26:00       2.55       17850  United Kingdom       15.30   
1 2010-12-01 08:26:00       3.39       17850  United Kingdom       20.34   
2 2010-12-01 08:26:00       2.75       17850  United Kingdom       22.00   
3 2010-12-01 08:26:00       3.39       17850  United Kingdom       20.34   
4 2010-12-01 08:26:00       3.39       17850  United Kingdom       20.34   

   DayOfWeek  ...  Month  Year  recency  frequency  monetary  recency_score  \
0          2  ...     12  2010 

In [None]:
data_pcapped_3 = data_rfm_pcapped_3dt1.copy()

# Drop the specified columns
columns_to_drop = ['DayOfWeek', 'HourOfDay', 'Month', 'Year', 'recency',
                   'frequency', 'monetary', 'recency_score', 'frequency_score',
                   'monetary_score', 'RFM_Score', 'Segment']

data_pcapped_3 = data_pcapped_3.drop(columns=columns_to_drop)

# Check the result
print(data_pcapped_3.columns)


Index(['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'InvoiceDate',
       'UnitPrice', 'CustomerID', 'Country', 'TotalPrice'],
      dtype='object')


In [None]:
# Copy the data for invoice date feature extraction
data_invoice_date = data_pcapped_3.copy()

# Convert InvoiceDate to datetime
data_invoice_date['InvoiceDate'] = pd.to_datetime(data_invoice_date['InvoiceDate'], format='%m/%d/%Y %H:%M')

# Extract additional features from InvoiceDate
data_invoice_date['DayOfWeek'] = data_invoice_date['InvoiceDate'].dt.dayofweek
data_invoice_date['HourOfDay'] = data_invoice_date['InvoiceDate'].dt.hour
data_invoice_date['Month'] = data_invoice_date['InvoiceDate'].dt.month
data_invoice_date['Year'] = data_invoice_date['InvoiceDate'].dt.year

# Check the newly extracted features
print(data_invoice_date[['InvoiceDate', 'DayOfWeek', 'HourOfDay', 'Month', 'Year']].head())

          InvoiceDate  DayOfWeek  HourOfDay  Month  Year
0 2010-12-01 08:26:00          2          8     12  2010
1 2010-12-01 08:26:00          2          8     12  2010
2 2010-12-01 08:26:00          2          8     12  2010
3 2010-12-01 08:26:00          2          8     12  2010
4 2010-12-01 08:26:00          2          8     12  2010


In [None]:
# Copy the data for RFM feature engineering
data_rfm = data_invoice_date.copy()

# Calculate Recency, Frequency, and Monetary for each customer
rfm_metrics = data_rfm.groupby('CustomerID').agg(
    Recency=('InvoiceDate', lambda x: (x.max() - x.min()).days),  # Days since last purchase
    Frequency=('InvoiceNo', 'nunique'),  # Count of unique purchases
    Monetary=('TotalPrice', 'sum')  # Total monetary value spent by customer
).reset_index()

# Merge the RFM metrics back into the data_rfm dataframe
data_rfm = data_rfm.merge(rfm_metrics, on='CustomerID', how='left')

# Check the data with RFM features
print(data_rfm[['CustomerID', 'Recency', 'Frequency', 'Monetary']].head())

   CustomerID  Recency  Frequency  Monetary
0       17850      370         47   5469.93
1       17850      370         47   5469.93
2       17850      370         47   5469.93
3       17850      370         47   5469.93
4       17850      370         47   5469.93


In [None]:
# Step 1: Copy the original data for segmentation
data_rfm_segmentation = data_rfm.copy()

# Step 2: Create customer segments based on Recency, Frequency, and Monetary
data_rfm_segmentation['Customer_Segment'] = 'Low-Value'  # Default segment

# Define segmentation conditions
data_rfm_segmentation.loc[
    (data_rfm_segmentation['Recency'] < 30) &
    (data_rfm_segmentation['Frequency'] > 10) &
    (data_rfm_segmentation['Monetary'] > 1000), 'Customer_Segment'] = 'High-Value'

data_rfm_segmentation.loc[
    (data_rfm_segmentation['Recency'] > 180), 'Customer_Segment'] = 'Inactive'

# Preview the result
print(data_rfm_segmentation[['Recency', 'Frequency', 'Monetary', 'Customer_Segment']].head())

   Recency  Frequency  Monetary Customer_Segment
0      370         47   5469.93         Inactive
1      370         47   5469.93         Inactive
2      370         47   5469.93         Inactive
3      370         47   5469.93         Inactive
4      370         47   5469.93         Inactive


In [None]:
print(data_rfm_segmentation.head())
print(data_rfm_segmentation.info())
print(data_rfm_segmentation.isnull().sum())
print(data_rfm_segmentation.nunique())
print(data_rfm_segmentation.columns)

  InvoiceNo StockCode                          Description  Quantity  \
0    536365    85123A   WHITE HANGING HEART T-LIGHT HOLDER         6   
1    536365     71053                  WHITE METAL LANTERN         6   
2    536365    84406B       CREAM CUPID HEARTS COAT HANGER         8   
3    536365    84029G  KNITTED UNION FLAG HOT WATER BOTTLE         6   
4    536365    84029E       RED WOOLLY HOTTIE WHITE HEART.         6   

          InvoiceDate  UnitPrice  CustomerID         Country  TotalPrice  \
0 2010-12-01 08:26:00       2.55       17850  United Kingdom       15.30   
1 2010-12-01 08:26:00       3.39       17850  United Kingdom       20.34   
2 2010-12-01 08:26:00       2.75       17850  United Kingdom       22.00   
3 2010-12-01 08:26:00       3.39       17850  United Kingdom       20.34   
4 2010-12-01 08:26:00       3.39       17850  United Kingdom       20.34   

   DayOfWeek  HourOfDay  Month  Year  Recency  Frequency  Monetary  \
0          2          8     12  2010    

In [None]:
import pandas as pd

# Step 1: Ensure the data is sorted by CustomerID and InvoiceDate
data_target_variable = data_rfm_segmentation.copy()  # Replace with your actual dataframe
data_target_variable = data_target_variable.sort_values(by=['CustomerID', 'InvoiceDate'])

# Step 2: Reset the index to avoid confusion due to old index values
#data_target_variable.reset_index(drop=True, inplace=True)

# Step 3: Shift the 'InvoiceDate' to create 'Next_Purchase_Date' for each customer
data_target_variable['Next_Purchase_Date'] = data_target_variable.groupby('CustomerID')['InvoiceDate'].shift(-1)

# Step 4: Fill NaT values in 'Next_Purchase_Date' (i.e., no next purchase)
data_target_variable['Next_Purchase_Date'].fillna(pd.Timestamp('2100-12-31'), inplace=True)  # Use a far future date

# Step 5: Calculate 'Purchase_Behavior' based on whether the next purchase date is within 30 days
data_target_variable['Purchase_Behavior'] = data_target_variable.apply(
    lambda row: 1 if ((row['Next_Purchase_Date'] - row['InvoiceDate']).days <= 30)
               else 0, axis=1)

# Step 6: Drop unnecessary columns
data_target_variable.drop(columns=['Next_Purchase_Date'], inplace=True)

# Step 7: Display the final result with relevant columns
print(data_target_variable[['CustomerID', 'Recency', 'Frequency', 'Monetary', 'Purchase_Behavior']].head())

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data_target_variable['Next_Purchase_Date'].fillna(pd.Timestamp('2100-12-31'), inplace=True)  # Use a far future date


       CustomerID  Recency  Frequency  Monetary  Purchase_Behavior
14329       12347      365          7    4239.4                  1
14330       12347      365          7    4239.4                  1
14331       12347      365          7    4239.4                  1
14332       12347      365          7    4239.4                  1
14333       12347      365          7    4239.4                  1


In [None]:
print(data_target_variable.head())
print(data_target_variable.info())
print(data_target_variable.columns)

      InvoiceNo StockCode                          Description  Quantity  \
14329    537626     85116      BLACK CANDELABRA T-LIGHT HOLDER        12   
14330    537626     22375    AIRLINE BAG VINTAGE JET SET BROWN         4   
14331    537626     71477    COLOUR GLASS. STAR T-LIGHT HOLDER        12   
14332    537626     22492              MINI PAINT SET VINTAGE         36   
14333    537626     22771  CLEAR DRAWER KNOB ACRYLIC EDWARDIAN        12   

              InvoiceDate  UnitPrice  CustomerID  Country  TotalPrice  \
14329 2010-12-07 14:57:00       2.10       12347  Iceland        25.2   
14330 2010-12-07 14:57:00       4.25       12347  Iceland        17.0   
14331 2010-12-07 14:57:00       3.25       12347  Iceland        39.0   
14332 2010-12-07 14:57:00       0.65       12347  Iceland        23.4   
14333 2010-12-07 14:57:00       1.25       12347  Iceland        15.0   

       DayOfWeek  HourOfDay  Month  Year  Recency  Frequency  Monetary  \
14329          1         14   

In [None]:
# Copy the current DataFrame to a new DataFrame
data_target_variable_dropped = data_target_variable.copy()

# Drop the specified columns
columns_to_drop = ['InvoiceNo', 'StockCode', 'Description', 'InvoiceDate', 'Country']
data_target_variable_dropped.drop(columns=columns_to_drop, inplace=True)

# Preview the updated DataFrame
print(data_target_variable_dropped.head())
print(data_target_variable_dropped.info())
print(data_target_variable_dropped.isnull().sum())
print(data_target_variable_dropped.nunique())
print(data_target_variable_dropped.columns)

       Quantity  UnitPrice  CustomerID  TotalPrice  DayOfWeek  HourOfDay  \
14329        12       2.10       12347        25.2          1         14   
14330         4       4.25       12347        17.0          1         14   
14331        12       3.25       12347        39.0          1         14   
14332        36       0.65       12347        23.4          1         14   
14333        12       1.25       12347        15.0          1         14   

       Month  Year  Recency  Frequency  Monetary Customer_Segment  \
14329     12  2010      365          7    4239.4         Inactive   
14330     12  2010      365          7    4239.4         Inactive   
14331     12  2010      365          7    4239.4         Inactive   
14332     12  2010      365          7    4239.4         Inactive   
14333     12  2010      365          7    4239.4         Inactive   

       Purchase_Behavior  
14329                  1  
14330                  1  
14331                  1  
14332               

In [None]:
# Step 1: Descriptive Statistics for Numerical Features
print(data_target_variable_dropped.describe())

            Quantity      UnitPrice     CustomerID     TotalPrice  \
count  522695.000000  522695.000000  522695.000000  522695.000000   
mean        8.863285       3.186868   15279.938909      16.633481   
std        15.338252       3.119259    1651.394930      26.130683   
min         1.000000       0.120000   12347.000000       0.360000   
25%         1.000000       1.250000   13993.000000       3.900000   
50%         4.000000       2.080000   15159.000000       9.900000   
75%        12.000000       4.130000   16713.000000      17.700000   
max       100.000000      16.630000   18287.000000     179.000000   

           DayOfWeek      HourOfDay          Month           Year  \
count  522695.000000  522695.000000  522695.000000  522695.000000   
mean        2.430056      13.074917       7.554235    2010.921872   
std         1.846508       2.442543       3.508139       0.268373   
min         0.000000       6.000000       1.000000    2010.000000   
25%         1.000000      11.0000

In [None]:
from sklearn.preprocessing import LabelEncoder

data_label_encoded = data_target_variable_dropped.copy()
# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Apply Label Encoding to Customer_Segment
data_label_encoded['Customer_Segment'] = label_encoder.fit_transform(data_label_encoded['Customer_Segment'])

# Check the encoded values
print(data_label_encoded[['Customer_Segment']].head())


       Customer_Segment
14329                 0
14330                 0
14331                 0
14332                 0
14333                 0


In [None]:
print(data_label_encoded.head())
print(data_label_encoded.info())

       Quantity  UnitPrice  CustomerID  TotalPrice  DayOfWeek  HourOfDay  \
14329        12       2.10       12347        25.2          1         14   
14330         4       4.25       12347        17.0          1         14   
14331        12       3.25       12347        39.0          1         14   
14332        36       0.65       12347        23.4          1         14   
14333        12       1.25       12347        15.0          1         14   

       Month  Year  Recency  Frequency  Monetary  Customer_Segment  \
14329     12  2010      365          7    4239.4                 0   
14330     12  2010      365          7    4239.4                 0   
14331     12  2010      365          7    4239.4                 0   
14332     12  2010      365          7    4239.4                 0   
14333     12  2010      365          7    4239.4                 0   

       Purchase_Behavior  
14329                  1  
14330                  1  
14331                  1  
14332         

In [None]:
from sklearn.preprocessing import StandardScaler

# Create a copy of the original dataframe to maintain the original intact
data_standardised = data_label_encoded.copy()

# Select numerical features to standardize
numerical_features = ['Quantity', 'UnitPrice', 'TotalPrice', 'Recency', 'Frequency', 'Monetary']

# Initialize the StandardScaler
scaler = StandardScaler()

# Apply standardization to the numerical columns
data_standardised[numerical_features] = scaler.fit_transform(data_standardised[numerical_features])

# Check the first few rows of the standardized dataframe
print(data_standardised[numerical_features].head())


       Quantity  UnitPrice  TotalPrice   Recency  Frequency  Monetary
14329  0.204503  -0.348438    0.327834  0.297904  -0.864218 -0.175825
14330 -0.317069   0.340829    0.014026  0.297904  -0.864218 -0.175825
14331  0.204503   0.020239    0.855949  0.297904  -0.864218 -0.175825
14332  1.769220  -0.813293    0.258949  0.297904  -0.864218 -0.175825
14333  0.204503  -0.620939   -0.062512  0.297904  -0.864218 -0.175825


In [None]:
# Verify the mean and standard deviation after standardization
standardized_stats = data_standardised.describe().loc[['mean', 'std']]
print(standardized_stats)

          Quantity     UnitPrice    CustomerID    TotalPrice  DayOfWeek  \
mean  2.175013e-18  1.853111e-16  15279.938909  3.151594e-16   2.430056   
std   1.000001e+00  1.000001e+00   1651.394930  1.000001e+00   1.846508   

      HourOfDay     Month         Year       Recency     Frequency  \
mean  13.074917  7.554235  2010.921872 -9.744058e-17  3.480021e-17   
std    2.442543  3.508139     0.268373  1.000001e+00  1.000001e+00   

          Monetary  Customer_Segment  Purchase_Behavior  
mean -1.000506e-17          0.006679           0.952940  
std   1.000001e+00          0.081451           0.211767  


In [None]:
print(data_standardised[numerical_features].head())
print(data_standardised.head())
print(data_standardised.info())

       Quantity  UnitPrice  TotalPrice   Recency  Frequency  Monetary
14329  0.204503  -0.348438    0.327834  0.297904  -0.864218 -0.175825
14330 -0.317069   0.340829    0.014026  0.297904  -0.864218 -0.175825
14331  0.204503   0.020239    0.855949  0.297904  -0.864218 -0.175825
14332  1.769220  -0.813293    0.258949  0.297904  -0.864218 -0.175825
14333  0.204503  -0.620939   -0.062512  0.297904  -0.864218 -0.175825
       Quantity  UnitPrice  CustomerID  TotalPrice  DayOfWeek  HourOfDay  \
14329  0.204503  -0.348438       12347    0.327834          1         14   
14330 -0.317069   0.340829       12347    0.014026          1         14   
14331  0.204503   0.020239       12347    0.855949          1         14   
14332  1.769220  -0.813293       12347    0.258949          1         14   
14333  0.204503  -0.620939       12347   -0.062512          1         14   

       Month  Year   Recency  Frequency  Monetary  Customer_Segment  \
14329     12  2010  0.297904  -0.864218 -0.175825   

In [None]:
from sklearn.model_selection import train_test_split

# Step 1: Prepare features (X) and target (Y)
X = data_standardised.drop(columns=['Purchase_Behavior', 'Customer_Segment'])  # Drop target and non-relevant features
y = data_standardised['Purchase_Behavior']  # Target variable

# Step 2: Split data into training and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Check the shape of the split data
print(f"Training set size: X_train: {X_train.shape}, y_train: {y_train.shape}")
print(f"Test set size: X_test: {X_test.shape}, y_test: {y_test.shape}")


Training set size: X_train: (418156, 11), y_train: (418156,)
Test set size: X_test: (104539, 11), y_test: (104539,)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.preprocessing import StandardScaler

# Step 3: Create and train Logistic Regression model
logreg_model = LogisticRegression(max_iter=500,random_state=42)

# Fit the model
logreg_model.fit(X_train, y_train)

# Step 4: Make predictions on the test set
y_pred = logreg_model.predict(X_test)

# Step 5: Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, logreg_model.predict_proba(X_test)[:, 1])  # For ROC-AUC, we need probabilities

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Output the evaluation metrics
print(f"Logistic Regression Model Evaluation:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")
print(f"ROC-AUC: {roc_auc:.4f}")
print(f"Confusion Matrix:\n{cm}")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression Model Evaluation:
Accuracy: 0.9531
Precision: 0.9531
Recall: 1.0000
F1-Score: 0.9760
ROC-AUC: 0.7753
Confusion Matrix:
[[    0  4903]
 [    2 99634]]


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Step 1: Create and train Random Forest model
rf_model = RandomForestClassifier(random_state=42)

# Fit the model
rf_model.fit(X_train, y_train)

# Step 2: Make predictions on the test set
y_pred_rf = rf_model.predict(X_test)

# Step 3: Evaluate the Random Forest model
accuracy_rf = accuracy_score(y_test, y_pred_rf)
precision_rf = precision_score(y_test, y_pred_rf)
recall_rf = recall_score(y_test, y_pred_rf)
f1_rf = f1_score(y_test, y_pred_rf)
roc_auc_rf = roc_auc_score(y_test, rf_model.predict_proba(X_test)[:, 1])  # For ROC-AUC, we need probabilities

# Confusion matrix
cm_rf = confusion_matrix(y_test, y_pred_rf)

# Output the evaluation metrics
print(f"Random Forest Model Evaluation:")
print(f"Accuracy: {accuracy_rf:.4f}")
print(f"Precision: {precision_rf:.4f}")
print(f"Recall: {recall_rf:.4f}")
print(f"F1-Score: {f1_rf:.4f}")
print(f"ROC-AUC: {roc_auc_rf:.4f}")
print(f"Confusion Matrix:\n{cm_rf}")

Random Forest Model Evaluation:
Accuracy: 0.9555
Precision: 0.9605
Recall: 0.9941
F1-Score: 0.9770
ROC-AUC: 0.8929
Confusion Matrix:
[[  835  4068]
 [  587 99049]]


In [None]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-2.1.4-py3-none-manylinux_2_28_x86_64.whl.metadata (2.1 kB)
Collecting nvidia-nccl-cu12 (from xgboost)
  Downloading nvidia_nccl_cu12-2.25.1-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.8 kB)
Downloading xgboost-2.1.4-py3-none-manylinux_2_28_x86_64.whl (223.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m223.6/223.6 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading nvidia_nccl_cu12-2.25.1-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (201.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m201.4/201.4 MB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: nvidia-nccl-cu12, xgboost
Successfully installed nvidia-nccl-cu12-2.25.1 xgboost-2.1.4


In [None]:
import xgboost as xgb

# Step 1: Create and train XGBoost model
xgb_model = xgb.XGBClassifier(random_state=42)

# Fit the model
xgb_model.fit(X_train, y_train)

# Step 2: Make predictions on the test set
y_pred_xgb = xgb_model.predict(X_test)

# Step 3: Evaluate the XGBoost model
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
precision_xgb = precision_score(y_test, y_pred_xgb)
recall_xgb = recall_score(y_test, y_pred_xgb)
f1_xgb = f1_score(y_test, y_pred_xgb)
roc_auc_xgb = roc_auc_score(y_test, xgb_model.predict_proba(X_test)[:, 1])  # For ROC-AUC, we need probabilities

# Confusion matrix
cm_xgb = confusion_matrix(y_test, y_pred_xgb)

# Output the evaluation metrics
print(f"XGBoost Model Evaluation:")
print(f"Accuracy: {accuracy_xgb:.4f}")
print(f"Precision: {precision_xgb:.4f}")
print(f"Recall: {recall_xgb:.4f}")
print(f"F1-Score: {f1_xgb:.4f}")
print(f"ROC-AUC: {roc_auc_xgb:.4f}")
print(f"Confusion Matrix:\n{cm_xgb}")

XGBoost Model Evaluation:
Accuracy: 0.9579
Precision: 0.9621
Recall: 0.9950
F1-Score: 0.9783
ROC-AUC: 0.9106
Confusion Matrix:
[[  994  3909]
 [  497 99139]]


In [None]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.7-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Collecting graphviz (from catboost)
  Downloading graphviz-0.20.3-py3-none-any.whl.metadata (12 kB)
Downloading catboost-1.2.7-cp311-cp311-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading graphviz-0.20.3-py3-none-any.whl (47 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.1/47.1 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: graphviz, catboost
Successfully installed catboost-1.2.7 graphviz-0.20.3


In [None]:
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# Step 1: Initialize CatBoostClassifier
catboost_model = CatBoostClassifier(iterations=500, learning_rate=0.1, depth=6, cat_features=[], random_seed=42, verbose=100)

# Step 2: Train the model
catboost_model.fit(X_train, y_train)

# Step 3: Make predictions on the test set
y_pred_catboost = catboost_model.predict(X_test)

# Step 4: Evaluate the model
accuracy_catboost = accuracy_score(y_test, y_pred_catboost)
precision_catboost = precision_score(y_test, y_pred_catboost)
recall_catboost = recall_score(y_test, y_pred_catboost)
f1_catboost = f1_score(y_test, y_pred_catboost)
roc_auc_catboost = roc_auc_score(y_test, catboost_model.predict_proba(X_test)[:, 1])  # For ROC-AUC, we need probabilities

# Confusion matrix
cm_catboost = confusion_matrix(y_test, y_pred_catboost)

# Output the evaluation metrics
print(f"CatBoost Model Evaluation:")
print(f"Accuracy: {accuracy_catboost:.4f}")
print(f"Precision: {precision_catboost:.4f}")
print(f"Recall: {recall_catboost:.4f}")
print(f"F1-Score: {f1_catboost:.4f}")
print(f"ROC-AUC: {roc_auc_catboost:.4f}")
print(f"Confusion Matrix:\n{cm_catboost}")

0:	learn: 0.5446041	total: 85.4ms	remaining: 42.6s
100:	learn: 0.1330074	total: 4.87s	remaining: 19.2s
200:	learn: 0.1270502	total: 9.06s	remaining: 13.5s
300:	learn: 0.1233110	total: 12.3s	remaining: 8.15s
400:	learn: 0.1206716	total: 15.5s	remaining: 3.83s
499:	learn: 0.1189592	total: 18.7s	remaining: 0us
CatBoost Model Evaluation:
Accuracy: 0.9580
Precision: 0.9611
Recall: 0.9962
F1-Score: 0.9783
ROC-AUC: 0.9066
Confusion Matrix:
[[  889  4014]
 [  381 99255]]


In [None]:
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
import numpy as np

# Define the parameter grid for RandomizedSearchCV
param_dist_xgb = {
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'n_estimators': [100, 200, 300, 500],
    'max_depth': [3, 5, 6, 8, 10],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0, 0.1, 0.2, 0.3],
    'reg_alpha': [0, 0.1, 0.5, 1],
    'reg_lambda': [0, 0.1, 1, 10]
}

# Initialize XGBoost Classifier
xgb_model = xgb.XGBClassifier(random_state=42, verbosity=0)

# Perform RandomizedSearchCV with 5-fold cross-validation
random_search_xgb = RandomizedSearchCV(estimator=xgb_model, param_distributions=param_dist_xgb,
                                       n_iter=10, scoring='roc_auc', cv=5,
                                       random_state=42, n_jobs=-1)

# Fit the model to the training data
random_search_xgb.fit(X_train, y_train)

# Best hyperparameters found
best_params_xgb = random_search_xgb.best_params_
print(f"Best hyperparameters for XGBoost: {best_params_xgb}")

# Train the optimized model using the best parameters
best_xgb_model = xgb.XGBClassifier(**best_params_xgb, random_state=42)
best_xgb_model.fit(X_train, y_train)

# Step 5: Make predictions on the test set
y_pred_optimized_xgb = best_xgb_model.predict(X_test)

# Step 6: Evaluate the optimized model
accuracy_xgb = accuracy_score(y_test, y_pred_optimized_xgb)
precision_xgb = precision_score(y_test, y_pred_optimized_xgb, zero_division=0)  # Added zero_division for safety
recall_xgb = recall_score(y_test, y_pred_optimized_xgb, zero_division=0)
f1_xgb = f1_score(y_test, y_pred_optimized_xgb, zero_division=0)
roc_auc_xgb = roc_auc_score(y_test, best_xgb_model.predict_proba(X_test)[:, 1])  # For ROC-AUC, we need probabilities

# Confusion matrix
cm_xgb = confusion_matrix(y_test, y_pred_optimized_xgb)

# Output the evaluation metrics
print(f"Optimized XGBoost Model Evaluation:")
print(f"Accuracy: {accuracy_xgb:.4f}")
print(f"Precision: {precision_xgb:.4f}")
print(f"Recall: {recall_xgb:.4f}")
print(f"F1-Score: {f1_xgb:.4f}")
print(f"ROC-AUC: {roc_auc_xgb:.4f}")
print(f"Confusion Matrix:\n{cm_xgb}")


Best hyperparameters for XGBoost: {'subsample': 1.0, 'reg_lambda': 1, 'reg_alpha': 1, 'n_estimators': 500, 'max_depth': 6, 'learning_rate': 0.1, 'gamma': 0.3, 'colsample_bytree': 1.0}
Optimized XGBoost Model Evaluation:
Accuracy: 0.9581
Precision: 0.9617
Recall: 0.9957
F1-Score: 0.9784
ROC-AUC: 0.9112
Confusion Matrix:
[[  947  3956]
 [  428 99208]]


In [None]:
# Step 1: Make predictions on the test set using the trained model
y_pred_test = best_xgb_model.predict(X_test)

# Step 2: Evaluate the model's performance on the test data
accuracy_test = accuracy_score(y_test, y_pred_test)
precision_test = precision_score(y_test, y_pred_test)
recall_test = recall_score(y_test, y_pred_test)
f1_test = f1_score(y_test, y_pred_test)
roc_auc_test = roc_auc_score(y_test, best_xgb_model.predict_proba(X_test)[:, 1])  # For ROC-AUC, we need probabilities

# Confusion matrix for the test set
cm_test = confusion_matrix(y_test, y_pred_test)

# Output the evaluation metrics for the test set
print(f"Test Set Evaluation - XGBoost Model:")
print(f"Accuracy: {accuracy_test:.4f}")
print(f"Precision: {precision_test:.4f}")
print(f"Recall: {recall_test:.4f}")
print(f"F1-Score: {f1_test:.4f}")
print(f"ROC-AUC: {roc_auc_test:.4f}")
print(f"Confusion Matrix:\n{cm_test}")


Test Set Evaluation - XGBoost Model:
Accuracy: 0.9581
Precision: 0.9617
Recall: 0.9957
F1-Score: 0.9784
ROC-AUC: 0.9112
Confusion Matrix:
[[  947  3956]
 [  428 99208]]


In [None]:
from sklearn.model_selection import RandomizedSearchCV
import xgboost as xgb
import numpy as np

# Define the parameter grid for RandomizedSearchCV (fine-tuned space)
param_dist = {
    'learning_rate': [0.01, 0.05, 0.1, 0.15],
    'n_estimators': [100, 200, 300, 500, 1000],
    'max_depth': [3, 4, 5, 6, 7],
    'min_child_weight': [1, 2, 3, 5],
    'subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0],
    'gamma': [0, 0.1, 0.2, 0.3],
    'reg_alpha': [0, 0.1, 0.5, 1],
    'reg_lambda': [0, 0.1, 0.5, 1]
}

# Initialize the XGBoost model
xgb_model = xgb.XGBClassifier(random_state=42)

# Perform RandomizedSearchCV with 5-fold cross-validation
random_search = RandomizedSearchCV(xgb_model, param_distributions=param_dist,
                                   n_iter=20, scoring='roc_auc', cv=5,
                                   random_state=42, n_jobs=-1, verbose=1)

# Fit the RandomizedSearchCV to the training data
random_search.fit(X_train, y_train)

# Get the best hyperparameters found by RandomizedSearchCV
best_params_xgb = random_search.best_params_
print(f"Best hyperparameters for XGBoost: {best_params_xgb}")

# Train the optimized XGBoost model using the best hyperparameters
optimized_xgb_model = xgb.XGBClassifier(**best_params_xgb, random_state=42)
optimized_xgb_model.fit(X_train, y_train)

# Step 5: Make predictions on the test set
y_pred_optimized_xgb = optimized_xgb_model.predict(X_test)

# Step 6: Evaluate the optimized model on the test set
accuracy_optimized_xgb = accuracy_score(y_test, y_pred_optimized_xgb)
precision_optimized_xgb = precision_score(y_test, y_pred_optimized_xgb)
recall_optimized_xgb = recall_score(y_test, y_pred_optimized_xgb)
f1_optimized_xgb = f1_score(y_test, y_pred_optimized_xgb)
roc_auc_optimized_xgb = roc_auc_score(y_test, optimized_xgb_model.predict_proba(X_test)[:, 1])

# Confusion matrix for the test data
cm_optimized_xgb = confusion_matrix(y_test, y_pred_optimized_xgb)

# Output the evaluation metrics for the optimized XGBoost model
print(f"Optimized XGBoost Model Evaluation:")
print(f"Accuracy: {accuracy_optimized_xgb:.4f}")
print(f"Precision: {precision_optimized_xgb:.4f}")
print(f"Recall: {recall_optimized_xgb:.4f}")
print(f"F1-Score: {f1_optimized_xgb:.4f}")
print(f"ROC-AUC: {roc_auc_optimized_xgb:.4f}")
print(f"Confusion Matrix:\n{cm_optimized_xgb}")


Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best hyperparameters for XGBoost: {'subsample': 0.9, 'reg_lambda': 1, 'reg_alpha': 0.5, 'n_estimators': 500, 'min_child_weight': 3, 'max_depth': 7, 'learning_rate': 0.1, 'gamma': 0.2, 'colsample_bytree': 0.6}
Optimized XGBoost Model Evaluation:
Accuracy: 0.9585
Precision: 0.9628
Recall: 0.9949
F1-Score: 0.9786
ROC-AUC: 0.9136
Confusion Matrix:
[[ 1078  3825]
 [  512 99124]]


In [None]:
# Step 1: Use the optimized model to make predictions on the test set
y_pred_test = optimized_xgb_model.predict(X_test)

# Step 2: Evaluate the optimized model on the test set
accuracy_test = accuracy_score(y_test, y_pred_test)
precision_test = precision_score(y_test, y_pred_test)
recall_test = recall_score(y_test, y_pred_test)
f1_test = f1_score(y_test, y_pred_test)
roc_auc_test = roc_auc_score(y_test, optimized_xgb_model.predict_proba(X_test)[:, 1])  # For ROC-AUC, we need probabilities

# Confusion matrix for test data
cm_test = confusion_matrix(y_test, y_pred_test)

# Step 3: Output the evaluation metrics
print(f"Test Set Evaluation - Optimized XGBoost Model:")
print(f"Accuracy: {accuracy_test:.4f}")
print(f"Precision: {precision_test:.4f}")
print(f"Recall: {recall_test:.4f}")
print(f"F1-Score: {f1_test:.4f}")
print(f"ROC-AUC: {roc_auc_test:.4f}")
print(f"Confusion Matrix:\n{cm_test}")


Test Set Evaluation - Optimized XGBoost Model:
Accuracy: 0.9585
Precision: 0.9628
Recall: 0.9949
F1-Score: 0.9786
ROC-AUC: 0.9136
Confusion Matrix:
[[ 1078  3825]
 [  512 99124]]


In [None]:
import joblib

# Path to save the model in Google Drive
model_path = '/content/drive/MyDrive/ECommerce Analysis/XGBCustomerChurn.pkl'

# Save the optimized XGBoost model
joblib.dump(optimized_xgb_model, model_path)

print(f"Model saved as {model_path}")


Model saved as /content/drive/MyDrive/ECommerce Analysis/XGBCustomerChurn.pkl
