In [17]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss
from sklearn.model_selection import cross_val_score, KFold, RepeatedKFold, GridSearchCV
import numpy as np
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
import icecream as ic
from sklearn.model_selection import train_test_split


# Label Encoded Dataset

In [3]:
# Dataset 1: the label-encoded master dataset, read from a path relative to the working directory
df_copy = pd.read_csv("master_dataset_pca+DTfeatures_LabelEncoded.csv")

In [5]:

# Tested out a memory function given on kaggle to optimize speeds for modelling and stop crashing
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` in place to the smallest dtype that fits.

    For every column whose dtype is one of the listed integer/float dtypes, the
    column's min and max are compared against the representable range of each
    candidate dtype (smallest first) and the column is cast to the first one
    that can hold both. Range limits are inclusive, so a column spanning exactly
    [-128, 127] becomes int8.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        When true, print the final memory footprint and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Notes
    -----
    * Float columns are chosen by range only, so a downcast to float16/float32
      can round values; precision is not checked.
    * Columns whose min/max is NaN (e.g. empty or all-NaN) fail every range
      comparison: integer columns are left untouched, float columns end up float64.
    * This notebook defines ``reduce_mem_usage`` in more than one cell; whichever
      definition ran last is the one in effect.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue  # non-numeric (or already minimal/unsigned) columns are left alone

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: values equal to the dtype limits still fit.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Nothing smaller fits (out of range, inf, or NaN min/max).
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the ratio so a zero-byte frame cannot raise ZeroDivisionError.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [6]:
# Downcasts df_copy's numeric columns in place; the returned frame is the cell's
# last expression, so the notebook renders it below.
reduce_mem_usage(df_copy)

Mem. usage decreased to 144.17 Mb (78.7% reduction)


Unnamed: 0,isFraud,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,...,PCA_47,PCA_48,PCA_49,hour,dayofweek,quarter,month,dayofyear,dayofmonth,weekofyear
0,0,68.500000,4,10095,0,43,2,39,2,167,...,0.496338,0.335938,-0.236694,0,4,4,12,335,1,48
1,0,29.000000,4,1372,304,43,3,3,2,174,...,0.348389,0.067017,-0.252441,0,4,4,12,335,1,48
2,0,59.000000,4,2833,390,43,4,59,3,179,...,0.007191,0.027664,-0.111511,0,4,4,12,335,1,48
3,0,50.000000,4,13341,467,43,3,15,3,283,...,-0.052826,-0.920898,-0.141479,0,4,4,12,335,1,48
4,0,50.000000,1,2712,414,43,3,3,2,242,...,0.120850,0.103882,0.091370,0,4,4,12,335,1,48
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
590535,0,49.000000,4,4305,0,43,4,109,3,133,...,-0.892578,-0.385498,0.210449,23,3,2,5,151,31,22
590536,0,39.500000,4,7354,125,43,3,107,3,79,...,0.007191,0.027664,-0.111511,23,3,2,5,151,31,22
590537,0,30.953125,4,8621,495,43,3,107,3,99,...,-0.128418,0.087952,-0.087708,23,3,2,5,151,31,22
590538,0,117.000000,4,5297,381,43,3,107,3,220,...,0.816895,1.522461,-0.203613,23,3,2,5,151,31,22


In [7]:
# data splitting function 

def datasplit(dataframe, oversample = False, train_months=(12, 1, 2, 3),
              val_months=(4,), test_months=(5,), random_state=42):
    """Split ``dataframe`` into train/validation/test sets by its ``month`` column.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain a ``month`` column and the ``isFraud`` target column.
    oversample : bool, default False
        When true, the training set (only) is rebalanced with
        ``RandomOverSampler(sampling_strategy=0.8)``.
    train_months, val_months, test_months : int or iterable of int
        Values of ``month`` assigned to each split. The defaults reproduce the
        original hard-coded split: 12, 1, 2, 3 -> train, 4 -> validation, 5 -> test.
        Rows whose month is in none of the three groups are dropped.
    random_state : int or None, default 42
        Seed for the oversampler so repeated runs give the same training set.
        Pass ``None`` for an unseeded resample.

    Returns
    -------
    tuple
        ``(X_train, X_val, X_test, y_train, y_val, y_test)``.

    Raises
    ------
    ValueError
        If the same month is assigned to more than one split, which would leak
        rows between splits.

    Notes
    -----
    NOTE(review): only ``isFraud`` is removed from the features. The rendered
    frame shows a leading ``Unnamed: 0`` header; if that is a real column (a
    saved CSV index) rather than the displayed index, it is kept as a feature
    here - confirm.
    """
    def _month_set(months):
        # Accept a single month as well as any iterable of months.
        return set([months] if np.isscalar(months) else list(months))

    train_set = _month_set(train_months)
    val_set = _month_set(val_months)
    test_set = _month_set(test_months)
    if (train_set & val_set) or (train_set & test_set) or (val_set & test_set):
        raise ValueError("train_months, val_months and test_months must not overlap")

    # Partition rows by calendar month.
    month = dataframe["month"]
    train_df = dataframe[month.isin(train_set)]
    val_df = dataframe[month.isin(val_set)]
    test_df = dataframe[month.isin(test_set)]

    # Separate the features from the target in each split.
    X_train = train_df.drop(columns=["isFraud"])
    y_train = train_df["isFraud"]
    X_val = val_df.drop(columns=["isFraud"])
    y_val = val_df["isFraud"]
    X_test = test_df.drop(columns=["isFraud"])
    y_test = test_df["isFraud"]

    # In case of class imbalance, rebalance the training data only, so the
    # validation and test sets keep their original class distribution.
    if oversample:
        ros = RandomOverSampler(sampling_strategy=0.8, random_state=random_state)
        X_train, y_train = ros.fit_resample(X_train, y_train)

    return X_train, X_val, X_test, y_train, y_val, y_test

# Month-based train/validation/test split of dataset 1; oversample=True rebalances
# the training portion only.
X_train, X_val, X_test, y_train, y_val, y_test = datasplit(df_copy, oversample=True)



In [40]:
# Random forest model; the fixed seed makes the fitted trees reproducible.
RF_Model = RandomForestClassifier(random_state=42)

# Parameter grid searched: 2 x 3 = 6 candidate settings.
param_grid1 = {'n_estimators': [10, 15],
               'max_depth': [1, 3, 5]
              }

# Grid search on the label-encoded training data, scored by ROC-AUC.
# RepeatedKFold(n_splits=3) repeats the 3-fold split n_repeats times (scikit-learn's
# default at the time of writing is 10); seeding it makes the folds, and therefore
# the selected model, reproducible between runs.
# NOTE(review): X_train was oversampled before this search, so copies of the same
# minority row can land in both the fitting and the held-out fold; the
# cross-validated scores are likely optimistic.
pcasearch = GridSearchCV(RF_Model, param_grid1, scoring="roc_auc",
                         cv=RepeatedKFold(n_splits=3, random_state=42))
rfbest_model = pcasearch.fit(X_train, y_train).best_estimator_

In [36]:
# Show the best estimator selected by the grid search on the label-encoded dataset
display(rfbest_model)

In [23]:
# Testing model performance using Log Loss, Accuracy, and ROC-AUC Score

# Class-probability predictions, computed once per split and reused by every metric
# (the model is already fitted, so a second predict_proba call returns the same values).
y_pred_proba = rfbest_model.predict_proba(X_val)
y_train_pred_proba_all = rfbest_model.predict_proba(X_train)

# Log loss uses the full probability matrix.
loss_corr = log_loss(y_val, y_pred_proba)
loss_train = log_loss(y_train, y_train_pred_proba_all)

# ROC-AUC uses column 1 of the probabilities (the second entry of classes_,
# presumably isFraud == 1).
y_val_pred_proba = y_pred_proba[:, 1]
y_train_pred_proba = y_train_pred_proba_all[:, 1]

auc_val = roc_auc_score(y_val, y_val_pred_proba)
auc_train = roc_auc_score(y_train, y_train_pred_proba)

# Displaying results. The training-set numbers are measured on X_train as it was
# passed to fit (i.e. after any oversampling).
print(f"Log Loss on Label Encoded Training Dataset: {loss_train}")
print(f"Log Loss on Label Encoded Validation Dataset: {loss_corr}")

print("\n")

print(f"AUC-ROC Score on Label Encoded Training Dataset : {auc_train:.4f}")
print(f"AUC-ROC Score on Label Encoded Validation Dataset: {auc_val:.4f}")

print("\n")

# This accuracy is measured on the validation split (X_val / y_val), so it is labelled as such.
print(f"Accuracy Score on Label Encoded Validation Dataset: {accuracy_score(y_val, rfbest_model.predict(X_val)):.4f}")

training Log Loss: 0.4823285013725436
validation Log Loss: 0.4221110370887576


Training AUC-ROC: 0.8536
Validation AUC-ROC: 0.8434


# PCA Feature Dataset

In [42]:
# Dataset 2: the PCA-feature master dataset, read from a path relative to the working directory
df_copy2 = pd.read_csv("master_dataset_pca_features (4).csv")

In [43]:
# Tested out a memory function given on kaggle to optimize speeds for modelling and stop crashing
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` in place to the smallest dtype that fits.

    For every column whose dtype is one of the listed integer/float dtypes, the
    column's min and max are compared against the representable range of each
    candidate dtype (smallest first) and the column is cast to the first one
    that can hold both. Range limits are inclusive, so a column spanning exactly
    [-128, 127] becomes int8.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        When true, print the final memory footprint and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Notes
    -----
    * Float columns are chosen by range only, so a downcast to float16/float32
      can round values; precision is not checked.
    * Columns whose min/max is NaN (e.g. empty or all-NaN) fail every range
      comparison: integer columns are left untouched, float columns end up float64.
    * This notebook defines ``reduce_mem_usage`` in more than one cell; whichever
      definition ran last is the one in effect.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue  # non-numeric (or already minimal/unsigned) columns are left alone

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: values equal to the dtype limits still fit.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Nothing smaller fits (out of range, inf, or NaN min/max).
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the ratio so a zero-byte frame cannot raise ZeroDivisionError.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [44]:
# Downcasts df_copy2's numeric columns in place; the returned frame is the cell's
# last expression, so the notebook renders it below.
reduce_mem_usage(df_copy2)

Mem. usage decreased to 141.92 Mb (78.1% reduction)


Unnamed: 0,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,PCA_40,PCA_41,PCA_42,PCA_43,PCA_44,PCA_45,PCA_46,PCA_47,PCA_48,PCA_49
0,0,86400,68.500000,4,10095,0,43,2,39,2,...,-0.045013,0.048309,0.056854,0.012627,0.181519,-0.167969,0.126465,0.496338,0.335938,-0.236694
1,0,86401,29.000000,4,1372,304,43,3,3,2,...,0.095764,0.173218,0.199951,0.074402,0.161865,-0.218872,0.091309,0.348389,0.067017,-0.252441
2,0,86469,59.000000,4,2833,390,43,4,59,3,...,-0.099609,-0.064087,-0.061768,0.032715,0.100342,-0.132324,0.072205,0.007191,0.027664,-0.111511
3,0,86499,50.000000,4,13341,467,43,3,15,3,...,-0.740234,0.085693,-0.648926,-1.026367,1.054688,0.582031,1.188477,-0.052826,-0.920898,-0.141479
4,0,86506,50.000000,1,2712,414,43,3,3,2,...,-0.070068,-0.030930,-0.020248,0.107117,-0.027588,-0.133545,0.024139,0.120850,0.103882,0.091370
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
590535,0,15811047,49.000000,4,4305,0,43,4,109,3,...,-0.160278,-0.443848,-0.505371,0.175537,-0.573730,0.323242,-0.120544,-0.892578,-0.385498,0.210449
590536,0,15811049,39.500000,4,7354,125,43,3,107,3,...,-0.099609,-0.064087,-0.061768,0.032715,0.100342,-0.132324,0.072205,0.007191,0.027664,-0.111511
590537,0,15811079,30.953125,4,8621,495,43,3,107,3,...,-0.077454,-0.185913,0.024063,-0.036041,0.170654,-0.082336,0.099182,-0.128418,0.087952,-0.087708
590538,0,15811088,117.000000,4,5297,381,43,3,107,3,...,0.299561,-0.224976,0.923340,-0.368408,1.805664,-1.478516,-1.812500,0.816895,1.522461,-0.203613


In [45]:
# second data splitting function, for data that does not include time-series features

def datasplit2(dataframe, oversample = False, random_state=42):
    """Randomly split ``dataframe`` into train, validation and test arrays.

    The target is the ``isFraud`` column when the frame has one, otherwise the
    first column (the original positional behaviour). The features are every
    column except the first column, the target and ``TransactionDT``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain a ``TransactionDT`` column.
    oversample : bool, default False
        When true, the training set (only) is rebalanced with ``RandomOverSampler``.
    random_state : int or None, default 42
        Seed shared by both ``train_test_split`` calls and the oversampler so the
        split is reproducible. Pass ``None`` for an unseeded split.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X_train, X_test, y_train, y_test, X_val, y_val)`` - note that the
        validation pair comes last.
    """
    first_col = dataframe.columns[0]
    # Prefer the named target; fall back to the first column when it is absent.
    target_col = "isFraud" if "isFraud" in dataframe.columns else first_col

    # dict.fromkeys de-duplicates while keeping order (first_col is usually the target).
    drop_cols = list(dict.fromkeys([first_col, target_col, "TransactionDT"]))
    X = dataframe.drop(columns=drop_cols).values
    y = dataframe[target_col].values

    # Initial split: hold out 20% of all rows as the test set.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=random_state)

    # Second split: 25% of the remaining 80% (20% of all rows) becomes validation.
    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, test_size=0.25, random_state=random_state)

    # In case of class imbalance in the target, rebalance the training data only,
    # so the validation and test sets keep their original class distribution.
    if oversample:
        ros = RandomOverSampler(random_state=random_state)
        X_train, y_train = ros.fit_resample(X_train, y_train)

    return X_train, X_test, y_train, y_test, X_val, y_val

In [46]:
# Random train/validation/test split of dataset 2; oversample=True rebalances the
# training portion only. Note the return order: the validation pair comes last.
X_train2, X_test2, y_train2, y_test2, X_val2, y_val2 = datasplit2(df_copy2, oversample=True)



In [55]:
# Random forest model; the fixed seed makes the fitted trees reproducible.
RF_Model = RandomForestClassifier(random_state=42)

# Parameter grid used in the grid search: 2 x 3 = 6 candidate settings.
param_grid3 = {'n_estimators': [10, 15],
               'max_depth': [1, 5, 10]
              }

# Grid search over the parameters above, scored by ROC-AUC.
# RepeatedKFold(n_splits=3) repeats the 3-fold split n_repeats times (scikit-learn's
# default at the time of writing is 10); seeding it makes the folds, and therefore
# the selected model, reproducible between runs.
# NOTE(review): X_train2 was oversampled before this search, so copies of the same
# minority row can land in both the fitting and the held-out fold; the
# cross-validated scores are likely optimistic.
pcafsearch = GridSearchCV(RF_Model, param_grid3, scoring="roc_auc",
                          cv=RepeatedKFold(n_splits=3, random_state=42))

# Fitting the search to the PCA-feature data and keeping the best estimator.
rfbest_model3 = pcafsearch.fit(X_train2, y_train2).best_estimator_


In [56]:
# Show the best estimator selected by the grid search on the PCA-feature dataset
display(rfbest_model3)

In [57]:
# Testing model performance using Log Loss, Accuracy, and ROC-AUC Score

# Class-probability predictions, computed once per split and reused by every metric
# (the model is already fitted, so a second predict_proba call returns the same values).
y_pred_proba2 = rfbest_model3.predict_proba(X_val2)
y_train_pred_proba2_all = rfbest_model3.predict_proba(X_train2)

# Log loss uses the full probability matrix.
loss_corr2 = log_loss(y_val2, y_pred_proba2)
loss_train2 = log_loss(y_train2, y_train_pred_proba2_all)

# ROC-AUC uses column 1 of the probabilities (the second entry of classes_,
# presumably isFraud == 1).
y_val_pred_proba2 = y_pred_proba2[:, 1]
y_train_pred_proba2 = y_train_pred_proba2_all[:, 1]

auc_val2 = roc_auc_score(y_val2, y_val_pred_proba2)
auc_train2 = roc_auc_score(y_train2, y_train_pred_proba2)

# Displaying results. The training-set numbers are measured on X_train2 as it was
# passed to fit (i.e. after any oversampling).
print(f"Log Loss on PCA Feature Training Dataset: {loss_train2}")
print(f"Log Loss on PCA Feature Validation: {loss_corr2}")

print("\n")
print(f"AUC-ROC Score on PCA Feature Training Dataset: {auc_train2:.4f}")
print(f"AUC-ROC Score on PCA Feature Validation Dataset: {auc_val2:.4f}")

print("\n")
# Accuracy is reported on the held-out test split (X_test2 / y_test2).
print(f"Accuracy Score on PCA-Feature Test Dataset: {accuracy_score(y_test2, rfbest_model3.predict(X_test2)):.4f}")

training Log Loss: 0.37858435781977695
validation Log Loss: 0.367166891290995


Training PCA - Feature AUC-ROC: 0.9216
Validation PCA-Feature AUC-ROC: 0.8879


0.8857994378026891

# Correlation Dataset

In [1]:
# Dataset 3: the correlation-feature master dataset, read from a path relative to the working directory
df_copy3 = pd.read_csv("master_dataset_corr_features.csv")

In [2]:
# Tested out a memory function given on kaggle to optimize speeds for modelling and stop crashing

def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` in place to the smallest dtype that fits.

    For every column whose dtype is one of the listed integer/float dtypes, the
    column's min and max are compared against the representable range of each
    candidate dtype (smallest first) and the column is cast to the first one
    that can hold both. Range limits are inclusive, so a column spanning exactly
    [-128, 127] becomes int8.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        When true, print the final memory footprint and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Notes
    -----
    * Float columns are chosen by range only, so a downcast to float16/float32
      can round values; precision is not checked.
    * Columns whose min/max is NaN (e.g. empty or all-NaN) fail every range
      comparison: integer columns are left untouched, float columns end up float64.
    * This notebook defines ``reduce_mem_usage`` in more than one cell; whichever
      definition ran last is the one in effect.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue  # non-numeric (or already minimal/unsigned) columns are left alone

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: values equal to the dtype limits still fit.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Nothing smaller fits (out of range, inf, or NaN min/max).
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the ratio so a zero-byte frame cannot raise ZeroDivisionError.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [3]:
# Downcasts df_copy3's numeric columns in place; the returned frame is the cell's
# last expression, so the notebook renders it below.
reduce_mem_usage(df_copy3)

Mem. usage decreased to 319.89 Mb (75.1% reduction)


Unnamed: 0,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,0,86400,68.500000,4,10095,500,42,1,38,1,...,130,4,260,4,2,2,2,2,2,1786
1,0,86401,29.000000,4,1372,303,42,2,2,1,...,130,4,260,4,2,2,2,2,2,1786
2,0,86469,59.000000,4,2833,389,42,3,58,2,...,130,4,260,4,2,2,2,2,2,1786
3,0,86499,50.000000,4,13341,466,42,2,14,2,...,130,4,260,4,2,2,2,2,2,1786
4,0,86506,50.000000,1,2712,413,42,2,2,1,...,123,3,164,3,1,0,1,1,1,954
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
590535,0,15811047,49.000000,4,4305,500,42,3,108,2,...,130,4,260,4,2,2,2,2,2,1786
590536,0,15811049,39.500000,4,7354,124,42,2,106,2,...,130,4,260,4,2,2,2,2,2,1786
590537,0,15811079,30.953125,4,8621,494,42,2,106,2,...,130,4,260,4,2,2,2,2,2,1786
590538,0,15811088,117.000000,4,5297,380,42,2,106,2,...,130,4,260,4,2,2,2,2,2,1786


In [4]:
# data splitting function 
def datasplit2(dataframe, oversample = False, random_state=42):
    """Randomly split ``dataframe`` into train, validation and test arrays.

    The target is the ``isFraud`` column when the frame has one, otherwise the
    first column (the original positional behaviour). The features are every
    column except the first column, the target and ``TransactionDT``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain a ``TransactionDT`` column.
    oversample : bool, default False
        When true, the training set (only) is rebalanced with ``RandomOverSampler``.
    random_state : int or None, default 42
        Seed shared by both ``train_test_split`` calls and the oversampler so the
        split is reproducible. Pass ``None`` for an unseeded split.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X_train, X_test, y_train, y_test, X_val, y_val)`` - note that the
        validation pair comes last.
    """
    first_col = dataframe.columns[0]
    # Prefer the named target; fall back to the first column when it is absent.
    target_col = "isFraud" if "isFraud" in dataframe.columns else first_col

    # dict.fromkeys de-duplicates while keeping order (first_col is usually the target).
    drop_cols = list(dict.fromkeys([first_col, target_col, "TransactionDT"]))
    X = dataframe.drop(columns=drop_cols).values
    y = dataframe[target_col].values

    # Initial split: hold out 20% of all rows as the test set.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=random_state)

    # Second split: 25% of the remaining 80% (20% of all rows) becomes validation.
    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, test_size=0.25, random_state=random_state)

    # In case of class imbalance, rebalance the training data only, so the
    # validation and test sets keep their original class distribution.
    if oversample:
        ros = RandomOverSampler(random_state=random_state)
        X_train, y_train = ros.fit_resample(X_train, y_train)

    return X_train, X_test, y_train, y_test, X_val, y_val

In [5]:
# Random train/validation/test split of dataset 3; oversample=True rebalances the
# training portion only. Note the return order: the validation pair comes last.
X_train3, X_test3, y_train3, y_test3, X_val3, y_val3 = datasplit2(df_copy3, oversample=True)



In [17]:
# Random forest model; the fixed seed makes the fitted trees reproducible.
RF_Model = RandomForestClassifier(random_state=42)

# Parameter grid used for the search: 2 x 3 = 6 candidate settings.
# (The original literal was missing the comma between the two entries, which is
# a SyntaxError.)
param_grid4 = {'n_estimators': [5, 10],
               'max_depth': [1, 5, 10]
              }

# Grid search on the correlation-feature data, scored by ROC-AUC.
# RepeatedKFold(n_splits=3) repeats the 3-fold split n_repeats times (scikit-learn's
# default at the time of writing is 10); seeding it makes the folds, and therefore
# the selected model, reproducible between runs.
# NOTE(review): X_train3 was oversampled before this search, so copies of the same
# minority row can land in both the fitting and the held-out fold; the
# cross-validated scores are likely optimistic.
pcafsearch = GridSearchCV(RF_Model, param_grid4, scoring="roc_auc",
                          cv=RepeatedKFold(n_splits=3, random_state=42))

rfbest_model4 = pcafsearch.fit(X_train3, y_train3).best_estimator_

# Displaying the selected model.
display(rfbest_model4)

In [18]:
# Testing model performance using Log Loss, Accuracy, and ROC-AUC Score

# Class-probability predictions, computed once per split and reused by every metric
# (the model is already fitted, so a second predict_proba call returns the same values).
y_pred_proba3 = rfbest_model4.predict_proba(X_val3)
y_train_pred_proba3_all = rfbest_model4.predict_proba(X_train3)

# Log loss uses the full probability matrix.
loss_corr3 = log_loss(y_val3, y_pred_proba3)
loss_train3 = log_loss(y_train3, y_train_pred_proba3_all)

# ROC-AUC uses column 1 of the probabilities (the second entry of classes_,
# presumably isFraud == 1).
y_val_pred_proba3 = y_pred_proba3[:, 1]
y_train_pred_proba3 = y_train_pred_proba3_all[:, 1]

auc_val3 = roc_auc_score(y_val3, y_val_pred_proba3)
auc_train3 = roc_auc_score(y_train3, y_train_pred_proba3)

# Displaying results. The training-set numbers are measured on X_train3 as it was
# passed to fit (i.e. after any oversampling).
print(f"Log Loss Score for Correlation Training Dataset: {loss_train3}")
print(f"Log Loss Score for Correlation Validation Dataset: {loss_corr3}")

print("\n")

print(f"Training Cor Data AUC-ROC: {auc_train3:.4f}")
print(f"Validation Cor Data AUC-ROC: {auc_val3:.4f}")

print("\n")

# This accuracy is measured on the held-out test split (X_test3 / y_test3), so it is labelled as such.
print(f"Test Accuracy Score: {accuracy_score(y_test3, rfbest_model4.predict(X_test3)):.4f}")

training Cor Data Log Loss: 0.005865255804610744
validation Cor Data Log Loss: 0.24701562908744246


Training Cor Data AUC-ROC: 1.0000
Validation Cor Data AUC-ROC: 0.8937


0.9804585633487994