In [1]:
import pandas as pd
import os
from os import path
import pickle
import json
import numpy as np
from tqdm import tqdm
import math

In [2]:
df = pd.read_csv("Dataset/datasetWithTarget.csv")

In [3]:
train_cols = df.columns.difference(['customer_types', 'conversion'])
X = df[train_cols] 
y = df['customer_types']

In [4]:
df.head()

Unnamed: 0,recency,history,used_discount,used_bogo,zip_code,is_referral,channel,offer,conversion,customer_types
0,10,142.44,1,0,1,0,2,1,0,2
1,6,329.08,1,1,3,1,1,0,0,0
2,7,180.65,0,1,1,1,1,1,0,2
3,9,675.83,1,0,3,1,1,-1,0,2
4,2,45.34,1,0,2,0,1,1,0,2


In [5]:
targetCol = "customer_types"

In [6]:
#undersampling imports
from imblearn.under_sampling import RandomUnderSampler
from imblearn.under_sampling import CondensedNearestNeighbour
from imblearn.under_sampling import TomekLinks
from imblearn.under_sampling import EditedNearestNeighbours

In [7]:
#other imports
#data partitioning
from sklearn.model_selection import train_test_split, cross_val_score

#classification
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA


In [8]:
X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.2, random_state=42, stratify=y)


In [9]:
from collections import Counter

In [10]:
Counter(y)

Counter({2: 35562, 0: 19044, 3: 7132, 1: 2262})

In [11]:
from utilsBalancing import makeAndSaveToFileDecisionTreeResults

base dataset classification

In [12]:
clfs, results = makeAndSaveToFileDecisionTreeResults (X_train, y_train, X_test, y_test, "resampled/based")

unbalanced classes above

In [13]:
rus = RandomUnderSampler(random_state=42)
X_res, y_res = rus.fit_resample(X_train, y_train)
print('Resampled dataset shape %s' % Counter(y_res))

Resampled dataset shape Counter({0: 1810, 1: 1810, 2: 1810, 3: 1810})


In [14]:
clfs, results = makeAndSaveToFileDecisionTreeResults (X_res, y_res, X_test, y_test, "resampled/RandomUnderSampler")

In [15]:
from sklearn.cluster import MiniBatchKMeans
from imblearn.under_sampling import ClusterCentroids

In [16]:
cc = ClusterCentroids(
    estimator=MiniBatchKMeans(
        n_init=1, random_state=42
    ), 
    random_state=42
)
X_res, y_res = cc.fit_resample(X_train, y_train)
print('Resampled dataset shape %s' % Counter(y_res))

Resampled dataset shape Counter({0: 1810, 1: 1810, 2: 1810, 3: 1810})


In [17]:
clfs, results = makeAndSaveToFileDecisionTreeResults (X_res, y_res, X_test, y_test, "resampled/ClusterCentroids")

In [18]:
cnn = CondensedNearestNeighbour(random_state=42, n_jobs=-1)
X_res, y_res = cnn.fit_resample(X_train, y_train)
print('Resampled dataset shape %s' % Counter(y_res))

KeyboardInterrupt: 

In [None]:
clfs, results = makeAndSaveToFileDecisionTreeResults (X_res, y_res, X_test, y_test, "resampled/CondensedNearestNeighbour")


In [19]:
tl = TomekLinks(sampling_strategy='not minority', n_jobs=-1)# default value 'not minority': resample all classes but the minority class;
X_res, y_res = tl.fit_resample(X_train, y_train)
print('Resampled dataset shape %s' % Counter(y_res))

Resampled dataset shape Counter({2: 28144, 0: 15121, 3: 5474, 1: 1810})


In [20]:
clfs, results = makeAndSaveToFileDecisionTreeResults (X_res, y_res, X_test, y_test, "resampled/TomekLinks")


In [21]:
enn = EditedNearestNeighbours()
X_res, y_res = enn.fit_resample(X_train, y_train)
print('Resampled dataset shape %s' % Counter(y_res))

Resampled dataset shape Counter({2: 25692, 0: 14012, 1: 1810, 3: 1369})


In [22]:
clfs, results = makeAndSaveToFileDecisionTreeResults (X_res, y_res, X_test, y_test, "resampled/EditedNearestNeighbours")


In [23]:
from imblearn.over_sampling import SMOTE


In [24]:
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X_train, y_train)
print('Resampled dataset shape %s' % Counter(y_res))

Resampled dataset shape Counter({2: 28449, 0: 28449, 3: 28449, 1: 28449})


In [25]:
clfs, results = makeAndSaveToFileDecisionTreeResults (X_res, y_res, X_test, y_test, "resampled/smote")


In [26]:
# Concatenate the features and target into a single DataFrame
resampled_df = pd.concat([X_res, y_res], axis=1)

# Save the DataFrame to a CSV file
resampled_df.to_csv('resampled_dataset.csv', index=False)