In [148]:
import pandas as pd
import numpy as np

# Import libraries for tools
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder,OneHotEncoder,LabelEncoder
from sklearn.preprocessing import PowerTransformer
from sklearn.model_selection import train_test_split

# pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Model Training 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

In [103]:
data = pd.read_csv(r'F:\Machine_learning\Credit_Card_Default_Prediction_and_Model_Deployment_Project\notebooks\data\UCI_Credit_Card_updated.csv',index_col="ID")
data_train,data_test = train_test_split(data,random_state=42,test_size=.15)
df= data.copy()
data.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 30000 entries, 1 to 30000
Data columns (total 24 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   LIMIT_BAL                   30000 non-null  float64
 1   SEX                         30000 non-null  object 
 2   EDUCATION                   30000 non-null  object 
 3   MARRIAGE                    30000 non-null  object 
 4   AGE                         30000 non-null  int64  
 5   PAY_0                       30000 non-null  int64  
 6   PAY_2                       30000 non-null  int64  
 7   PAY_3                       30000 non-null  int64  
 8   PAY_4                       30000 non-null  int64  
 9   PAY_5                       30000 non-null  int64  
 10  PAY_6                       30000 non-null  int64  
 11  BILL_AMT1                   30000 non-null  float64
 12  BILL_AMT2                   30000 non-null  float64
 13  BILL_AMT3                   300

In [104]:
df.columns

Index(['LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0', 'PAY_2',
       'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2',
       'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1',
       'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6',
       'default.payment.next.month'],
      dtype='object')

In [105]:
BILL = ['BILL_AMT1', 'BILL_AMT2','BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6']
for i in BILL:
    lower_bound = df[i].quantile(0.05)  
    upper_bound = df[i].quantile(0.96)
    df_filtered = df[(df[i] >= lower_bound) & (df[i] <= upper_bound)]

PAY_AMT = ['PAY_AMT1','PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6'] 
for i in PAY_AMT:
    lower_bound = df_filtered[i].quantile(0.000)  
    upper_bound = df_filtered[i].quantile(0.985)
    df_filtered = df_filtered[(df_filtered[i] >= lower_bound) & (df_filtered[i] <= upper_bound)]

In [112]:
X_train,X_test= train_test_split(df_filtered,test_size=.2,random_state=42)

In [125]:
#imputation transformer
trf1 = ColumnTransformer(
    [
        ('impute_numerical_columns1',SimpleImputer(strategy='median'),[0]),
        ('impute_categorical_columns1',SimpleImputer(strategy='most_frequent'),[1,2,3,5,6,7,8,9,10]),
        ('impute_numerical_columns2',SimpleImputer(strategy='median'),[4]),
        ('impute_categorical_columns2',SimpleImputer(strategy='most_frequent'),[5,6,7,8,9,10]),
        ('impute_numerical_columns3',SimpleImputer(strategy='median'),[11,12,13,14,15,16,17,18,19,20,21,22])
    ]
                         ,remainder='passthrough')

In [131]:
trf2 = ColumnTransformer(
    [
        ('one_hot_encoding1',OneHotEncoder(sparse=False,handle_unknown='ignore'),[1,2,3]),
        ('one_hot_encoding2',OneHotEncoder(sparse=False,handle_unknown='ignore'),[5,6,7,8,9,10])
        
    ]
                         ,remainder='passthrough')

In [132]:
trf3 = ColumnTransformer(
    [
        ('yao_jhonson_transformation1',PowerTransformer(),[0,4,11,12,13,14,15,16,17,18,19,20,21,22])
],remainder="passthrough")

In [133]:
pipe = Pipeline(
    [
        ('trf1',trf1),
        ('trf2',trf2),
        ('trf3',trf3)
    ]
    )

In [134]:
X_train_transformed = pipe.fit_transform(X_train)
X_test_transformed = pipe.transform(X_test)



In [144]:
X_train = X_train_transformed[:,:-1]
Y_train = X_train_transformed[:,-1]
X_test = X_test_transformed[:,:-1]
Y_test = X_test_transformed[:,-1]

In [145]:
RFC = RandomForestClassifier()
RFC.fit(X_train,Y_train)

In [150]:
Y_predict = RFC.predict(X_test)

In [151]:
accuracy_score(Y_test,Y_predict)

0.8046738072054528