In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import roc_curve, roc_auc_score



In [2]:
df = pd.read_csv("/kaggle/input/dataset04/CreditCard4.csv")

# Feature selection

In [3]:
# Categorical columns
cat_col = [col for col in df.columns if df[col].dtype == 'object']
print('Categorical columns :',cat_col)
# Numerical columns
num_col = [col for col in df.columns if df[col].dtype != 'object']
print('Numerical columns :',num_col)

Categorical columns : ['Gender', 'MaritalStatus', 'CardColour', 'CardType', 'Domain', 'Customer_City_Address']
Numerical columns : ['AcountNumber', 'CVV', 'CustomerAge', 'Amount', 'AverageIncomeExpendicture', 'Outcome']


In [4]:
from scipy.stats import chi2_contingency

def drop_non_significant_cols(df, target_column, significance_level=0.05):
    dropped_columns = []
    
    for column in df.select_dtypes(include='object'):
        contingency_table = pd.crosstab(df[column], df[target_column])
        chi2, p, _, _ = chi2_contingency(contingency_table)
        
        if p > significance_level:
            df.drop(column, axis=1, inplace=True)
            dropped_columns.append(column)
    
    return df, dropped_columns


df, dropped_columns = drop_non_significant_cols(df, 'Outcome')

print("Dropped Columns:", dropped_columns)

Dropped Columns: ['Gender', 'MaritalStatus', 'CardColour', 'Customer_City_Address']


In [5]:
df.head()

Unnamed: 0,AcountNumber,CVV,CustomerAge,CardType,Domain,Amount,AverageIncomeExpendicture,Outcome
0,1275734409,364,28.0,Verve,Local,129282,170919,0
1,1271246193,401,25.0,Verve,International,574384,329353,1
2,1242290165,266,21.0,Visa,International,190766,292922,0
3,1245478185,402,26.0,Visa,Local,130395,145444,0
4,1258212072,334,28.0,Verve,International,685145,295990,1


# fill null values

In [6]:
df.fillna(method='pad',inplace=True)

  df.fillna(method='pad',inplace=True)


In [7]:
df.isnull().sum()

AcountNumber                 0
CVV                          0
CustomerAge                  0
CardType                     0
Domain                       0
Amount                       0
AverageIncomeExpendicture    0
Outcome                      0
dtype: int64

# one hot encoding

In [8]:
import warnings
warnings.filterwarnings('ignore')

In [9]:
print(df.CardType.unique())
print(df.Domain.unique())

['Verve' 'Visa' 'MasterCard']
['Local' 'International']


In [10]:
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder()

In [11]:
encoder_df = pd.DataFrame(encoder.fit_transform(df[['CardType']]).toarray())
df = df.join(encoder_df)
df.rename(columns={0: 'Type_MasterCard', 1: 'Type_Verve',2: 'Type_Visa'}, inplace=True)
df.drop('CardType', axis=1, inplace=True)

In [12]:
encoder_df = pd.DataFrame(encoder.fit_transform(df[['Domain']]).toarray())
df = df.join(encoder_df)
df.rename(columns={0: 'Domain_International', 1: 'Domain_Local'}, inplace=True)
df.drop('Domain', axis=1, inplace=True)

In [13]:
df.head()

Unnamed: 0,AcountNumber,CVV,CustomerAge,Amount,AverageIncomeExpendicture,Outcome,Type_MasterCard,Type_Verve,Type_Visa,Domain_International,Domain_Local
0,1275734409,364,28.0,129282,170919,0,0.0,1.0,0.0,0.0,1.0
1,1271246193,401,25.0,574384,329353,1,0.0,1.0,0.0,1.0,0.0
2,1242290165,266,21.0,190766,292922,0,0.0,0.0,1.0,1.0,0.0
3,1245478185,402,26.0,130395,145444,0,0.0,0.0,1.0,0.0,1.0
4,1258212072,334,28.0,685145,295990,1,0.0,1.0,0.0,1.0,0.0


In [14]:
X_train, X_test, y_train, y_test = train_test_split(
    df.drop('Outcome', axis=1), df['Outcome'],
    test_size=0.20, random_state=42)

In [15]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

# Smote

In [16]:
from imblearn.over_sampling import SMOTE
# Display class distribution before SMOTE
print("Class distribution in y_train before SMOTE:\n", y_train.value_counts())
# Apply SMOTE to the training data
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Display class distribution after SMOTE
print("Class distribution in y_train after SMOTE:\n", y_train_resampled.value_counts())

Class distribution in y_train before SMOTE:
 Outcome
1    21971
0     7706
Name: count, dtype: int64
Class distribution in y_train after SMOTE:
 Outcome
1    21971
0    21971
Name: count, dtype: int64


In [17]:
model =  DecisionTreeClassifier()
model.fit(X_train_resampled, y_train_resampled)

In [18]:
# Make predictions on test set
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy   :", accuracy)
precision = precision_score(y_test, y_pred)
print("Precision :", precision)
recall = recall_score(y_test, y_pred)
print("Recall    :", recall)
F1_score = f1_score(y_test, y_pred)
print("F1-score  :", F1_score)

Accuracy   : 0.8495956873315363
Precision : 0.9009548773637895
Recall    : 0.8912761622522689
F1-score  : 0.8960893854748603


In [19]:
model =  RandomForestClassifier()
model.fit(X_train_resampled, y_train_resampled)

In [20]:
# Make predictions on test set
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy   :", accuracy)
precision = precision_score(y_test, y_pred)
print("Precision :", precision)
recall = recall_score(y_test, y_pred)
print("Recall    :", recall)
F1_score = f1_score(y_test, y_pred)
print("F1-score  :", F1_score)

Accuracy   : 0.8628032345013477
Precision : 0.9419003429493645
Recall    : 0.8647897758844231
F1-score  : 0.9016994978756276


In [21]:
model = AdaBoostClassifier()
model.fit(X_train_resampled, y_train_resampled)

In [22]:
# Make predictions on test set
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy   :", accuracy)
precision = precision_score(y_test, y_pred)
print("Precision :", precision)
recall = recall_score(y_test, y_pred)
print("Recall    :", recall)
F1_score = f1_score(y_test, y_pred)
print("F1-score  :", F1_score)

Accuracy   : 0.8544474393530997
Precision : 0.9532004197271773
Recall    : 0.8412669012780144
F1-score  : 0.8937426210153482


In [23]:
model = GradientBoostingClassifier()
model.fit(X_train_resampled, y_train_resampled)

In [24]:
# Make predictions on test set
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy   :", accuracy)
precision = precision_score(y_test, y_pred)
print("Precision :", precision)
recall = recall_score(y_test, y_pred)
print("Recall    :", recall)
F1_score = f1_score(y_test, y_pred)
print("F1-score  :", F1_score)

Accuracy   : 0.8632075471698113
Precision : 0.9917003140421714
Recall    : 0.8188553435821448
F1-score  : 0.8970274931520746


# Random Oversampling

In [25]:
from imblearn.over_sampling import RandomOverSampler

# Display class distribution before Random Oversampling
print("Class distribution in y_train before Random Oversampling:\n", y_train.value_counts())

# Apply Random Oversampling to the training data
ros = RandomOverSampler(random_state=42)
X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)

# Display class distribution after Random Oversampling
print("Class distribution in y_train after Random Oversampling:\n", y_train_resampled.value_counts())


Class distribution in y_train before Random Oversampling:
 Outcome
1    21971
0     7706
Name: count, dtype: int64
Class distribution in y_train after Random Oversampling:
 Outcome
1    21971
0    21971
Name: count, dtype: int64


In [26]:
model =  DecisionTreeClassifier()
model.fit(X_train_resampled, y_train_resampled)

In [27]:
# Make predictions on test set
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy   :", accuracy)
precision = precision_score(y_test, y_pred)
print("Precision :", precision)
recall = recall_score(y_test, y_pred)
print("Recall    :", recall)
F1_score = f1_score(y_test, y_pred)
print("F1-score  :", F1_score)

Accuracy   : 0.8439353099730458
Precision : 0.890731527547448
Recall    : 0.8953509909242452
F1-score  : 0.8930352854239794


In [28]:
model =  RandomForestClassifier()
model.fit(X_train_resampled, y_train_resampled)

In [29]:
# Make predictions on test set
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy   :", accuracy)
precision = precision_score(y_test, y_pred)
print("Precision :", precision)
recall = recall_score(y_test, y_pred)
print("Recall    :", recall)
F1_score = f1_score(y_test, y_pred)
print("F1-score  :", F1_score)

Accuracy   : 0.8622641509433963
Precision : 0.9556527170518426
Recall    : 0.8501574365623263
F1-score  : 0.8998235640070574


In [30]:
model = AdaBoostClassifier()
model.fit(X_train_resampled, y_train_resampled)

In [31]:
# Make predictions on test set
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy   :", accuracy)
precision = precision_score(y_test, y_pred)
print("Precision :", precision)
recall = recall_score(y_test, y_pred)
print("Recall    :", recall)
F1_score = f1_score(y_test, y_pred)
print("F1-score  :", F1_score)

Accuracy   : 0.8574123989218329
Precision : 0.9773477017813943
Recall    : 0.823115391739211
F1-score  : 0.8936255781218582


In [32]:
model = GradientBoostingClassifier()
model.fit(X_train_resampled, y_train_resampled)

In [33]:
# Make predictions on test set
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy   :", accuracy)
precision = precision_score(y_test, y_pred)
print("Precision :", precision)
recall = recall_score(y_test, y_pred)
print("Recall    :", recall)
F1_score = f1_score(y_test, y_pred)
print("F1-score  :", F1_score)

Accuracy   : 0.863477088948787
Precision : 0.9997721057429353
Recall    : 0.8125578810890906
F1-score  : 0.8964953509757841


# ADASYN

In [34]:
from imblearn.over_sampling import ADASYN

# Display class distribution before ADASYN
print("Class distribution in y_train before ADASYN:\n", y_train.value_counts())

# Apply ADASYN to the training data
adasyn = ADASYN(random_state=42)
X_train_resampled, y_train_resampled = adasyn.fit_resample(X_train, y_train)

# Display class distribution after ADASYN
print("Class distribution in y_train after ADASYN:\n", y_train_resampled.value_counts())

Class distribution in y_train before ADASYN:
 Outcome
1    21971
0     7706
Name: count, dtype: int64
Class distribution in y_train after ADASYN:
 Outcome
1    21971
0    21965
Name: count, dtype: int64


In [35]:
model =  DecisionTreeClassifier()
model.fit(X_train_resampled, y_train_resampled)

In [36]:
# Make predictions on test set
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy   :", accuracy)
precision = precision_score(y_test, y_pred)
print("Precision :", precision)
recall = recall_score(y_test, y_pred)
print("Recall    :", recall)
F1_score = f1_score(y_test, y_pred)
print("F1-score  :", F1_score)

Accuracy   : 0.8486522911051213
Precision : 0.8991784914115011
Recall    : 0.8920170401926283
F1-score  : 0.895583449558345


In [37]:
model =  RandomForestClassifier()
model.fit(X_train_resampled, y_train_resampled)

In [38]:
# Make predictions on test set
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy   :", accuracy)
precision = precision_score(y_test, y_pred)
print("Precision :", precision)
recall = recall_score(y_test, y_pred)
print("Recall    :", recall)
F1_score = f1_score(y_test, y_pred)
print("F1-score  :", F1_score)

Accuracy   : 0.863477088948787
Precision : 0.9484662576687116
Recall    : 0.8590479718466383
F1-score  : 0.9015453396831569


In [39]:
model = AdaBoostClassifier()
model.fit(X_train_resampled, y_train_resampled)

In [40]:
# Make predictions on test set
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy   :", accuracy)
precision = precision_score(y_test, y_pred)
print("Precision :", precision)
recall = recall_score(y_test, y_pred)
print("Recall    :", recall)
F1_score = f1_score(y_test, y_pred)
print("F1-score  :", F1_score)

Accuracy   : 0.8505390835579515
Precision : 0.9532967032967034
Recall    : 0.8355250972402297
F1-score  : 0.8905340045405191


In [41]:
model = GradientBoostingClassifier()
model.fit(X_train_resampled, y_train_resampled)

In [42]:
# Make predictions on test set
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy   :", accuracy)
precision = precision_score(y_test, y_pred)
print("Precision :", precision)
recall = recall_score(y_test, y_pred)
print("Recall    :", recall)
F1_score = f1_score(y_test, y_pred)
print("F1-score  :", F1_score)

Accuracy   : 0.8630727762803234
Precision : 0.9914779098452567
Recall    : 0.8188553435821448
F1-score  : 0.8969364982755123


# borderline smote

In [43]:
from imblearn.over_sampling import BorderlineSMOTE
# Display class distribution before BorderlineSMOTE
print("Class distribution in y_train before BorderlineSMOTE:\n", y_train.value_counts())

# Apply BorderlineSMOTE to the training data
borderline_smote = BorderlineSMOTE(random_state=42)
X_train_resampled, y_train_resampled = borderline_smote.fit_resample(X_train, y_train)

# Display class distribution after BorderlineSMOTE
print("Class distribution in y_train after BorderlineSMOTE:\n", y_train_resampled.value_counts())

Class distribution in y_train before BorderlineSMOTE:
 Outcome
1    21971
0     7706
Name: count, dtype: int64
Class distribution in y_train after BorderlineSMOTE:
 Outcome
1    21971
0    21971
Name: count, dtype: int64


In [44]:
model =  DecisionTreeClassifier()
model.fit(X_train_resampled, y_train_resampled)

In [45]:
# Make predictions on test set
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy   :", accuracy)
precision = precision_score(y_test, y_pred)
print("Precision :", precision)
recall = recall_score(y_test, y_pred)
print("Recall    :", recall)
F1_score = f1_score(y_test, y_pred)
print("F1-score  :", F1_score)

Accuracy   : 0.8514824797843665
Precision : 0.9021149167134569
Recall    : 0.8927579181329875
F1-score  : 0.8974120275553901


In [46]:
model =  RandomForestClassifier()
model.fit(X_train_resampled, y_train_resampled)

In [47]:
# Make predictions on test set
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy   :", accuracy)
precision = precision_score(y_test, y_pred)
print("Precision :", precision)
recall = recall_score(y_test, y_pred)
print("Recall    :", recall)
F1_score = f1_score(y_test, y_pred)
print("F1-score  :", F1_score)

Accuracy   : 0.8575471698113207
Precision : 0.9428804569563444
Recall    : 0.856084460085201
F1-score  : 0.8973886030482477


In [48]:
model = AdaBoostClassifier()
model.fit(X_train_resampled, y_train_resampled)

In [49]:
# Make predictions on test set
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy   :", accuracy)
precision = precision_score(y_test, y_pred)
print("Precision :", precision)
recall = recall_score(y_test, y_pred)
print("Recall    :", recall)
F1_score = f1_score(y_test, y_pred)
print("F1-score  :", F1_score)

Accuracy   : 0.8452830188679246
Precision : 0.9536819637139808
Recall    : 0.8275606593813669
F1-score  : 0.886156287187624


In [50]:
model = GradientBoostingClassifier()
model.fit(X_train_resampled, y_train_resampled)

In [51]:
# Make predictions on test set
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy   :", accuracy)
precision = precision_score(y_test, y_pred)
print("Precision :", precision)
recall = recall_score(y_test, y_pred)
print("Recall    :", recall)
F1_score = f1_score(y_test, y_pred)
print("F1-score  :", F1_score)

Accuracy   : 0.8619946091644205
Precision : 0.987736900780379
Recall    : 0.8205223189479534
F1-score  : 0.8963982193443949
