In [1]:
import pandas as pd
import numpy as np
import os
from faker import Faker
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

In [None]:
# Project base directory: two levels above the notebook's current working directory.
# NOTE(review): BASE_DIR is not referenced by any other cell visible here — confirm it is still needed.
working_dir = os.getcwd()
BASE_DIR = os.path.dirname(os.path.dirname(working_dir))

In [2]:
# Folder where the CSV files are stored (relative to the notebook's working directory)
media_folder = '../media/'

# Collect the names of all CSV files in that folder
csv_files = [f for f in os.listdir(media_folder) if f.endswith('.csv')]

# Fail early with an explicit message: max() on an empty list would otherwise
# raise an opaque "max() arg is an empty sequence" ValueError.
if not csv_files:
    raise FileNotFoundError(f"No CSV files found in {media_folder!r}")

# Pick the newest file according to os.path.getctime (a filesystem timestamp;
# the file name itself is NOT parsed).
# NOTE(review): on Unix getctime is the last metadata-change time, not the
# creation time — confirm this is the intended notion of "latest".
latest_file = max(csv_files, key=lambda x: os.path.getctime(os.path.join(media_folder, x)))

# Read the newest CSV file with pandas
df = pd.read_csv(os.path.join(media_folder, latest_file))

# Preview the dataframe
df.head()

In [3]:
# One-hot encode the categorical columns, append the dummy columns to the frame,
# then drop the original categorical columns together with the CustomerId identifier.
categorical_cols = ['Geography', 'Gender', 'Card Type']
encoder = pd.get_dummies(df[categorical_cols])
df = pd.concat([df, encoder], axis=1).drop(categorical_cols + ['CustomerId'], axis=1)

# Quick sanity check of the resulting shape and column names
print(df.shape)
print(df.columns)

(10000, 22)
Index(['Surname', 'CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts',
       'HasCrCard', 'IsActiveMember', 'EstimatedSalary', 'Exited', 'Complain',
       'Satisfaction Score', 'Point Earned', 'Geography_France',
       'Geography_Germany', 'Geography_Spain', 'Gender_Female', 'Gender_Male',
       'Card Type_DIAMOND', 'Card Type_GOLD', 'Card Type_PLATINUM',
       'Card Type_SILVER'],
      dtype='object')


In [4]:
# Keep the surnames in their own Series (Z) so they can travel through the
# train/test split alongside the features, then remove 'Surname' and 'Complain'
# from the working frame.
Z = df['Surname']
df = df.drop(columns=['Surname', 'Complain'])
Z.head(3)

0    Hargrave
1        Hill
2        Onio
Name: Surname, dtype: object

In [5]:
# Separate the feature matrix X from the target y ('Exited').
# Both stay pandas objects; no conversion to numpy happens here.
X = df.drop(columns=['Exited'])
y = df['Exited']

print(X.shape)
X.head(3)

(10000, 19)


Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Satisfaction Score,Point Earned,Geography_France,Geography_Germany,Geography_Spain,Gender_Female,Gender_Male,Card Type_DIAMOND,Card Type_GOLD,Card Type_PLATINUM,Card Type_SILVER
0,619,42,2,0.0,1,1,1,101348.88,2,464,True,False,False,True,False,True,False,False,False
1,608,41,1,83807.86,1,0,1,112542.58,3,456,False,False,True,True,False,True,False,False,False
2,502,42,8,159660.8,3,1,0,113931.57,3,377,True,False,False,True,False,True,False,False,False


In [6]:
# 70/30 train/test split; X, y and the surnames Z are split together so their
# rows stay aligned. random_state fixes the shuffle for reproducibility.
X_train, X_test, y_train, y_test, Z_train, Z_test = train_test_split(
    X,
    y,
    Z,
    test_size=0.3,
    random_state=42,
)

print(X_train.shape, X_test.shape, sep='\n')

(7000, 19)
(3000, 19)


In [7]:
# Oversample the training split only (the test split is left untouched).
# NOTE(review): X_train contains one-hot/boolean and integer columns; plain SMOTE
# interpolates between neighbours as if every feature were continuous, so
# synthetic rows may not hold valid category combinations — consider SMOTENC
# applied before encoding, and confirm the synthetic rows are acceptable.
smote = SMOTE(random_state=42)
resampled = smote.fit_resample(X_train, y_train)
X_resampled, y_resampled = resampled
print(X_resampled.shape)

(11092, 19)


In [8]:
# Use Faker to generate placeholder surnames for the synthetic rows in the resampled data
fake = Faker()
# Seed Faker's shared random generator so the generated surnames (and therefore
# the exported training CSV) are identical on every run, consistent with the
# random_state=42 used for the split and for SMOTE.
Faker.seed(42)

original_len = len(X_train)
synthetic_len = len(X_resampled) - original_len

# Surnames of the original training rows, re-indexed from 0.
# NOTE(review): this relies on fit_resample returning the original rows first,
# in their original order, with the synthetic rows appended after them — confirm.
surname_resampled = Z_train.reset_index(drop=True).copy()
surname_synthetic = pd.Series([fake.last_name() for _ in range(synthetic_len)], name='Surname')
Z_final = pd.concat([surname_resampled, surname_synthetic], ignore_index=True)

# Every resampled row must get exactly one surname; a mismatch would silently
# introduce NaN rows when the two are concatenated column-wise later.
assert len(Z_final) == len(X_resampled), "surname count does not match resampled row count"
Z_final.head(3)

0     Gordon
1    Wallace
2     McLean
Name: Surname, dtype: object

In [9]:
# Re-attach the surname column to each feature set (column-wise concat, aligned on index).
# NOTE(review): the "_scaled_" names are historical — no scaling is applied in the cells visible here.
X_resampled_scaled_df = pd.concat(
    [X_resampled, Z_final],
    axis=1,
)
X_df = pd.concat(
    [X_train, Z_train],
    axis=1,
)
X_test_scaled_df = pd.concat(
    [X_test, Z_test],
    axis=1,
)

In [11]:
# Append the target column to each feature set ...
train_data_optimasi = pd.concat([X_resampled_scaled_df, y_resampled], axis=1)  # SMOTE-resampled training data
train_data_default = pd.concat([X_df, y_train], axis=1)                        # original training data
test_data = pd.concat([X_test_scaled_df, y_test], axis=1)                      # held-out test data

# ... and write each one to ../data/ without the index column.
train_data_optimasi.to_csv('../data/data_train_optimasi.csv', index=False)
train_data_default.to_csv('../data/data_train_default.csv', index=False)
test_data.to_csv('../data/data_test.csv', index=False)

In [12]:
# Preview the first three rows of the resampled training features (with surnames attached)
X_resampled_scaled_df.head(3)

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Satisfaction Score,Point Earned,Geography_France,Geography_Germany,Geography_Spain,Gender_Female,Gender_Male,Card Type_DIAMOND,Card Type_GOLD,Card Type_PLATINUM,Card Type_SILVER,Surname
0,619,32,4,175406.13,2,1,1,172792.43,1,707,False,False,True,True,False,True,False,False,False,Gordon
1,643,34,7,160426.07,1,0,1,188533.11,3,806,False,True,False,True,False,False,True,False,False,Wallace
2,561,33,6,0.0,2,0,0,173680.39,3,220,True,False,False,False,True,True,False,False,False,McLean


In [13]:
# Preview the first three rows of the test features (with surnames attached)
X_test_scaled_df.head(3)

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Satisfaction Score,Point Earned,Geography_France,Geography_Germany,Geography_Spain,Gender_Female,Gender_Male,Card Type_DIAMOND,Card Type_GOLD,Card Type_PLATINUM,Card Type_SILVER,Surname
6252,596,32,3,96709.07,2,0,0,41788.37,1,709,False,True,False,False,True,False,True,False,False,Anderson
4684,623,43,1,0.0,2,1,1,146379.3,2,508,True,False,False,False,True,False,False,False,True,Herring
1731,601,44,4,0.0,2,1,0,58561.31,1,281,False,False,True,True,False,False,True,False,False,Amechi
