In [1]:
import pandas as pd
import numpy as np

# Import libraries for tools
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder,OneHotEncoder,LabelEncoder
from sklearn.preprocessing import PowerTransformer
from sklearn.model_selection import train_test_split

# pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Model Training 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import StackingClassifier

# For hyper Parameter tunning
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
data = pd.read_csv(r'D:\Machine_learning_projects\notebooks\data\UCI_Credit_Card_updated.csv',index_col="ID")
data_train,data_test = train_test_split(data,random_state=42,test_size=.15)
df= data.copy()
data.info()


<class 'pandas.core.frame.DataFrame'>
Index: 30000 entries, 1 to 30000
Data columns (total 24 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   LIMIT_BAL                   30000 non-null  float64
 1   SEX                         30000 non-null  object 
 2   EDUCATION                   30000 non-null  object 
 3   MARRIAGE                    30000 non-null  object 
 4   AGE                         30000 non-null  int64  
 5   PAY_0                       30000 non-null  int64  
 6   PAY_2                       30000 non-null  int64  
 7   PAY_3                       30000 non-null  int64  
 8   PAY_4                       30000 non-null  int64  
 9   PAY_5                       30000 non-null  int64  
 10  PAY_6                       30000 non-null  int64  
 11  BILL_AMT1                   30000 non-null  float64
 12  BILL_AMT2                   30000 non-null  float64
 13  BILL_AMT3                   30000 no

In [3]:
df.columns

Index(['LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0', 'PAY_2',
       'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2',
       'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1',
       'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6',
       'default.payment.next.month'],
      dtype='object')

In [4]:
BILL = ['BILL_AMT1', 'BILL_AMT2','BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6']
for i in BILL:
    lower_bound = df[i].quantile(0.0)  
    upper_bound = df[i].quantile(1)
    df_filtered = df[(df[i] >= lower_bound) & (df[i] <= upper_bound)]

PAY_AMT = ['PAY_AMT1','PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6'] 
for i in PAY_AMT:
    lower_bound = df_filtered[i].quantile(0.0)  
    upper_bound = df_filtered[i].quantile(1)
    df_filtered = df_filtered[(df_filtered[i] >= lower_bound) & (df_filtered[i] <= upper_bound)]

In [5]:
X_train,X_test,Y_train,Y_test= train_test_split(df_filtered.drop(columns=['default.payment.next.month']),df_filtered['default.payment.next.month'],test_size=.2,random_state=42)

In [6]:
df = pd.DataFrame(X_train)
df.head().T

ID,21754,252,22942,619,17091
LIMIT_BAL,80000.0,30000.0,180000.0,60000.0,130000.0
SEX,Female,Male,Female,Male,Female
EDUCATION,University,University,Other,Graduate School,University
MARRIAGE,Single,Single,Married,Single,Single
AGE,24,28,44,25,25
PAY_0,0,0,0,0,0
PAY_2,0,0,0,0,0
PAY_3,0,0,-1,0,0
PAY_4,0,0,-1,0,0
PAY_5,0,0,-1,0,0


In [7]:
#imputation transformer
trf1 = ColumnTransformer(
    [
        ('impute_numerical_columns1',SimpleImputer(strategy='median'),[0]),
        ('impute_categorical_columns1',SimpleImputer(strategy='most_frequent'),[1,2,3]),
        ('impute_numerical_columns2',SimpleImputer(strategy='median'),[4]),
        ('impute_categorical_columns2',SimpleImputer(strategy='most_frequent'),[5,6,7,8,9,10]),
        ('impute_numerical_columns3',SimpleImputer(strategy='median'),[11,12,13,14,15,16,17,18,19,20,21,22])
    ]
                         ,remainder='passthrough')

In [8]:
trf2 = ColumnTransformer(
    [
        ('yao_jhonson_transformation1',PowerTransformer(),[0,4,11,12,13,14,15,16,17,18,19,20,21,22])
        ],remainder="passthrough")

In [9]:
trf3 = ColumnTransformer([
    ('one_hot_encoding1',OneHotEncoder(sparse_output=False,handle_unknown='ignore'),[14,15,16,17,18,19,20,21,22])
        ]
    ,remainder='passthrough')

In [10]:
pipe = Pipeline(
    [
        ('trf1',trf1),
        ('trf2',trf2),
        ('trf3',trf3)
    ]
    )

In [11]:
X_train_transformed = pipe.fit_transform(X_train)
X_test_transformed = pipe.transform(X_test)

In [12]:
output = LabelEncoder()
Y_train_transformed =output.fit_transform(Y_train)
Y_test_transformed = output.transform(Y_test)

In [24]:
Y_train_transformed

array([0, 0, 0, ..., 0, 1, 1])

In [13]:
RFC = RandomForestClassifier(n_jobs=-1)
RFC.fit(X_train_transformed,Y_train_transformed)

In [14]:
Y_predict = RFC.predict(X_test_transformed)

In [15]:
accuracy_score(Y_test_transformed,Y_predict)

0.8153333333333334

In [16]:
logitic = LogisticRegression(n_jobs= -1)
logitic.fit(X_train_transformed,Y_train_transformed)
Y_predict_logistic = logitic.predict(X_test_transformed)

In [17]:
accuracy_score(Y_test_transformed,Y_predict_logistic)

0.82

**Hyperparameter tunning**

In [18]:
classifier2 = LogisticRegression()
param_grid_lr = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],  
    'penalty': ['l1', 'l2','elasticnet'],  
    'max_iter': [100, 200, 300], 
    'solver':['lbfgs','saga']
}

model2 = GridSearchCV(estimator=classifier2, param_grid=param_grid_lr, n_jobs=-1, verbose=3, cv=3)

model2.fit(X_train_transformed, Y_train_transformed)

print("Best Parameters:", model2.best_params_)
print("Best Accuracy:", model2.best_score_)


Fitting 3 folds for each of 108 candidates, totalling 324 fits


162 fits failed out of a total of 324.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
54 fits failed with the following error:
Traceback (most recent call last):
  File "d:\Statistics\States\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\Statistics\States\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\Statistics\States\Lib\site-packages\sklearn\linear_model\_logistic.py", line 54, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' pe

Best Parameters: {'C': 1, 'max_iter': 300, 'penalty': 'l1', 'solver': 'saga'}
Best Accuracy: 0.8209583333333333




In [19]:
# classifier = RandomForestClassifier()
# param_drid = {'min_samples_leaf':[15,20,30], 'n_estimators':[75,100,150,200],'max_depth':[None,5,8], 'criterion':['gini','entropy']}
# model = RandomizedSearchCV(estimator=classifier, param_distributions=param_drid,n_jobs=-1,verbose=10)
# model.fit(X_train,Y_train)

In [20]:
classifier1 = RandomForestClassifier()
param_drid = {'min_samples_leaf':[15,20,30], 'n_estimators':[100,200,300,400],'max_depth':[None,5,8], 'criterion':['gini','entropy']}
model1 = GridSearchCV(estimator=classifier1, param_grid=param_drid,n_jobs=-1,verbose=3,cv = 3)
model1.fit(X_train_transformed,Y_train_transformed)

Fitting 3 folds for each of 72 candidates, totalling 216 fits


In [21]:
model1.best_params_

{'criterion': 'gini',
 'max_depth': None,
 'min_samples_leaf': 15,
 'n_estimators': 300}

In [22]:
RFC2 = RandomForestClassifier(n_estimators=300,min_samples_leaf=15,max_depth= None,criterion= 'gini',n_jobs=-1)
RFC2.fit(X_train_transformed,Y_train_transformed)

In [23]:
Y_predict = RFC2.predict(X_test_transformed)
print(accuracy_score(Y_test_transformed,Y_predict))
print(f1_score(Y_test_transformed,Y_predict))
print(recall_score(Y_test_transformed,Y_predict))

0.821
0.4575757575757576
0.345011424219345


In [21]:
random_forest_classifier = RandomForestClassifier(n_estimators=300,min_samples_leaf=15,max_depth= None,criterion= 'gini',n_jobs=-1)
cv_scores = cross_val_score(random_forest_classifier, X_train_transformed, Y_train_transformed, cv=5)  # Adjust cv as needed

print("Cross-Validation Scores:", cv_scores)
print("Mean Accuracy:", cv_scores.mean())
print(f1_score(Y_test_transformed,Y_predict))

Cross-Validation Scores: [0.818125   0.82333333 0.81458333 0.81916667 0.81708333]
Mean Accuracy: 0.8184583333333334


In [24]:
xgb = XGBClassifier(n_jobs = -1)
xgb.fit(X_train_transformed,Y_train_transformed)

In [25]:
Y_predict = xgb.predict(X_test_transformed)
accuracy_score(Y_test_transformed,Y_predict)

0.817

In [26]:
classifier2 = XGBClassifier()
param_grid_xgb = {
    'min_child_weight': [1, 5, 10],
    'gamma': [0.5, 1, 1.5, 2, 5],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'max_depth': [3, 4, 5]
}
model2 = GridSearchCV(estimator=classifier2, param_grid=param_grid_xgb, n_jobs=-1, verbose=3, cv=3)
model2.fit(X_train_transformed, Y_train_transformed)


Fitting 3 folds for each of 405 candidates, totalling 1215 fits


In [27]:
model2.best_params_

{'colsample_bytree': 0.8,
 'gamma': 5,
 'max_depth': 4,
 'min_child_weight': 5,
 'subsample': 1.0}

In [28]:
xgb = XGBClassifier(colsample_bytree = 0.8,gamma= 5,max_depth= 4,min_child_weight= 5,subsample= 1.0,n_jobs = -1)
xgb.fit(X_train_transformed,Y_train_transformed)
Y_predict = xgb.predict(X_test_transformed)
accuracy_score(Y_test_transformed,Y_predict)

0.8193333333333334

In [20]:
xgb_classifier = XGBClassifier(colsample_bytree = 0.8,gamma= 5,max_depth= 4,min_child_weight= 5,subsample= 1.0,n_jobs = -1)
cv_scores = cross_val_score(xgb_classifier, X_train_transformed, Y_train_transformed, cv=5)  # Adjust cv as needed

print("Cross-Validation Scores:", cv_scores)
print("Mean Accuracy:", cv_scores.mean())

Cross-Validation Scores: [0.81875    0.82395833 0.81666667 0.81708333 0.82041667]
Mean Accuracy: 0.819375


In [14]:


# Assuming you have X_train_transformed and Y_train_transformed defined

# Specify the GPU device ID based on the information obtained
gpu_device_id = 0  # GPU ID is 0 in this case

# Create XGBoost classifier with GPU support
xgb_classifier = XGBClassifier(
    colsample_bytree=0.8,
    gamma=5,
    max_depth=4,
    min_child_weight=5,
    subsample=1.0,
    n_jobs=-1,
    tree_method='gpu_hist',
    gpu_id=gpu_device_id
)

# Perform cross-validation
cv_scores = cross_val_score(xgb_classifier, X_train_transformed, Y_train_transformed, cv=5)

# Print the results
print("Cross-Validation Scores:", cv_scores)
print("Mean Accuracy:", cv_scores.mean())




    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"



Cross-Validation Scores: [0.81833333 0.82416667 0.8175     0.81729167 0.81875   ]
Mean Accuracy: 0.8192083333333333



    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"



In [20]:
xgb_classifier1 = XGBClassifier(
    colsample_bytree=0.8,
    gamma=5,
    max_depth=4,
    min_child_weight=5,
    subsample=1.0,
    n_jobs=-1,
    tree_method='gpu_hist',
    gpu_id=gpu_device_id
)

xgb_classifier1.fit(X_train_transformed,Y_train_transformed)
Y_predict = xgb_classifier1.predict(X_test_transformed)
accuracy_score(Y_test_transformed,Y_predict)


    E.g. tree_method = "hist", device = "cuda"



0.821

In [21]:
from sklearn.metrics import f1_score,recall_score
print(f1_score(Y_test_transformed,Y_predict))
print(recall_score(Y_test_transformed,Y_predict))

0.4586693548387097
0.3465346534653465


In [29]:
from sklearn.ensemble import StackingClassifier
base_classifiers = [
    ('xgb', XGBClassifier(colsample_bytree = 0.8,gamma= 5,max_depth= 4,min_child_weight= 5,subsample= 1.0,n_jobs = -1,random_state=42)),
    ('rf', RandomForestClassifier(n_estimators=300,min_samples_leaf=15,max_depth= None,criterion= 'gini',n_jobs=-1,random_state=42)),
    ('lr', LogisticRegression(C=1, max_iter= 500,random_state=42,solver = 'saga',penalty='l1'))
]


stacking_classifier = StackingClassifier(estimators=base_classifiers, final_estimator=LogisticRegression(max_iter=500))
stacking_classifier.fit(X_train_transformed, Y_train_transformed)
stacking_predictions = stacking_classifier.predict(X_test_transformed)
accuracy = accuracy_score(Y_test_transformed, stacking_predictions)
print(f"Stacking Classifier Accuracy: {accuracy}")



Stacking Classifier Accuracy: 0.8201666666666667




In [None]:
stacking_classifier = StackingClassifier(estimators=base_classifiers, final_estimator=LogisticRegression(max_iter=500))
cv_scores = cross_val_score(stacking_classifier, X_train_transformed, Y_train_transformed, cv=3)  # Adjust cv as needed

print("Cross-Validation Scores:", cv_scores)
print("Mean Accuracy:", cv_scores.mean())




Finally XGBosst Classifier is relavently giving good result.