In [238]:
import pandas as pd
from ydata_profiling import ProfileReport

In [258]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [259]:
train.drop(columns=['ID', 'Customer_ID', 'Month', 'Name', 'SSN'], inplace=True)
test.drop(columns=['ID', 'Customer_ID', 'Month', 'Name', 'SSN'], inplace=True)
train.head(2)

Unnamed: 0,Age,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Type_of_Loan,Delay_from_due_date,...,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score
0,23.0,Scientist,19114.12,1824.843333,3.0,4.0,3.0,4.0,"auto loan,credit-builder loan,personal loan,ho...",3.0,...,Good,809.98,26.82262,265.0,No,49.574949,21.46538,High_spent_Small_value_payments,312.494089,Good
1,23.0,Scientist,19114.12,1824.843333,3.0,4.0,3.0,4.0,"auto loan,credit-builder loan,personal loan,ho...",3.0,...,Good,809.98,31.94496,266.0,No,49.574949,21.46538,Low_spent_Large_value_payments,284.629162,Good


In [241]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

X_train.shape, y_train.shape, X_val.shape, y_val.shape, X_test.shape, 

((80000, 22), (80000,), (20000, 22), (20000,), (50000, 22))

In [242]:
from sklearn.base import BaseEstimator, TransformerMixin

class GetDummies(BaseEstimator, TransformerMixin): 
    def __init__(self, data_sep=',', col_name_sep='_'):
        """
        Transformer that creates dummy variables from categorical columns with a separator.
        Parameters:
            - data_sep (str): Separator used to split categorical values into multiple dummy variables.
            - col_name_sep (str): Separator used to separate the column name from the prefix in the output column names.
        """
        self.data_sep     = data_sep
        self.col_name_sep = col_name_sep
        
    # Return self nothing else to do here
    def fit(self, X, y  = None): 
        """
        Fit the transformer to the data.
        Parameters:
            - X (pandas.DataFrame): Input data with categorical columns.
            - y (array-like): Target variable (ignored).
        Returns:
            - self: Returns the transformer object.
        """
        object_cols       = X.select_dtypes(include="object").columns
        self.dummy_cols   = [col for col in object_cols if X[col].str.contains(self.data_sep, regex=True).any()]
        self.dummy_prefix = [''.join(map(lambda x: x[0], col.split(self.col_name_sep)))  if self.col_name_sep in col else col[:2]   for col in self.dummy_cols]
        
        for col, pre in zip(self.dummy_cols, self.dummy_prefix):
            dummy_X = X.join(X[col].str.get_dummies(sep=self.data_sep).add_prefix(pre+self.col_name_sep))            
            
        dummy_X.drop(columns = self.dummy_cols, inplace=True)
        self.columns = dummy_X.columns
        return self
    
    # Transformer method we wrote for this transformer
    def transform(self, X, y = None):
        """
        Transform the input data by creating dummy variables.
        Parameters:
            - X (pandas.DataFrame): Input data with categorical columns.
            - y (array-like): Target variable (ignored).
        Returns:
            - X_transformed (pandas.DataFrame): Transformed data with dummy variables.
        """
        for col, pre in zip(self.dummy_cols, self.dummy_prefix):
            X_transformed = X.join(X[col].str.get_dummies(sep=self.data_sep).add_prefix(pre+self.col_name_sep))   

        X_transformed = X_transformed.reindex(columns=self.columns, fill_value=0)          
        return X_transformed
        
    # to get feature names    
    def get_feature_names_out(self, input_features=None):
        """
        Get the names of the transformed features.
        Parameters:
            - input_features (array-like): Names of the input features (ignored).
        Returns:
            - output_features (list): Names of the transformed features.
        """
        return self.columns.tolist()

In [243]:
dummy = GetDummies()

X_train_dummy = dummy.fit_transform(X_train)
X_val_dummy   = dummy.transform(X_val)

X_train_dummy.shape, X_val_dummy.shape

((80000, 31), (20000, 31))

In [293]:
cat_columns = X_train_dummy.select_dtypes(include="object").columns.tolist()

# Create dummy variables for train and validation sets
X_train_dummies = pd.get_dummies(X_train_dummy[cat_columns], drop_first=True)
X_val_dummies = pd.get_dummies(X_val_dummy[cat_columns], drop_first=True)

# Concatenate dummy variables with numeric columns
X_train_final = pd.concat([X_train_dummies, X_train_dummy.select_dtypes("number")], axis=1)
X_val_final = pd.concat([X_val_dummies, X_val_dummy.select_dtypes("number")], axis=1)

# Shape of the resulting DataFrames
print("Shape of X_train_final:", X_train_final.shape)
print("Shape of X_val_final:", X_val_final.shape)

Shape of X_train_final: (80000, 50)
Shape of X_val_final: (20000, 50)


In [245]:
import numpy as np
from sklearn.preprocessing import LabelEncoder

le      = LabelEncoder()
print(np.unique(y_train))
y_train = le.fit_transform(y_train)
y_val   = le.transform(y_val)

pd.DataFrame(y_train).value_counts().sort_index()

['Good' 'Poor' 'Standard']


0    14301
1    23124
2    42575
Name: count, dtype: int64

In [288]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

# Initialize the scaler
scaler = MinMaxScaler()

# Fit and transform the scaler on training data
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train_final), columns=X_train_final.columns)

# Transform validation data using the same scaler
X_val_scaled = pd.DataFrame(scaler.transform(X_val_final), columns=X_val_final.columns)

In [257]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
import numpy as np

# Initial XGBoost Training
xgb = XGBClassifier(random_state=42)
xgb.fit(X_train_scaled, y_train)

# Hyperparameter Tuning using GridSearchCV
model = XGBClassifier(random_state=42)
param_grid = {
    'n_estimators': [50, 100, 150],
    'learning_rate': np.linspace(0.01, 0.3, 2).round(3),
    'max_depth': [5, 6],
}

grid_model_xgb = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='accuracy',
    error_score='raise',
    n_jobs=-1,
    cv=5,
)

# Fit the grid search model to find the best hyperparameters
grid_model_xgb.fit(X_train_scaled, y_train)

# Get the best hyperparameters and best estimator
best_params = grid_model_xgb.best_params_
best_xgb_model = grid_model_xgb.best_estimator_

# Evaluate the best model on the validation set (X_val_scaled and y_val)
y_val_pred = best_xgb_model.predict(X_val_scaled)

# Print best hyperparameters and evaluate model performance
print("Best Hyperparameters:", best_params)
print("Confusion Matrix:")
print(confusion_matrix(y_val, y_val_pred))
print("\nClassification Report:")
print(classification_report(y_val, y_val_pred))

Best Hyperparameters: {'learning_rate': 0.3, 'max_depth': 6, 'n_estimators': 150}
Confusion Matrix:
[[2701   27  799]
 [ 105 4678 1091]
 [ 798 1263 8538]]

Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.77      0.76      3527
           1       0.78      0.80      0.79      5874
           2       0.82      0.81      0.81     10599

    accuracy                           0.80     20000
   macro avg       0.78      0.79      0.79     20000
weighted avg       0.80      0.80      0.80     20000



In [292]:
import pandas as pd

# Define all possible dummy variable column names
dummy_columns = [
    'Occupation_Architect', 'Occupation_Developer', 'Occupation_Doctor',
    'Occupation_Engineer', 'Occupation_Entrepreneur', 'Occupation_Journalist',
    'Occupation_Lawyer', 'Occupation_Manager', 'Occupation_Mechanic',
    'Occupation_Media_Manager', 'Occupation_Musician', 'Occupation_Scientist',
    'Occupation_Teacher', 'Occupation_Writer', 'Credit_Mix_Good',
    'Credit_Mix_Standard', 'Payment_of_Min_Amount_No', 'Payment_of_Min_Amount_Yes',
    'Payment_Behaviour_High_spent_Medium_value_payments',
    'Payment_Behaviour_High_spent_Small_value_payments',
    'Payment_Behaviour_Low_spent_Large_value_payments',
    'Payment_Behaviour_Low_spent_Medium_value_payments',
    'Payment_Behaviour_Low_spent_Small_value_payments',
    'ToL_No Data', 'ToL_auto loan', 'ToL_credit-builder loan',
    'ToL_debt consolidation loan', 'ToL_home equity loan', 'ToL_mortgage loan',
    'ToL_not specified', 'ToL_payday loan', 'ToL_personal loan', 'ToL_student loan'
]

# Initialize user input dictionary with all columns set to 0
user_input = {column: 0 for column in dummy_columns}

# Collect user inputs for numeric features
numeric_features = [
    'Age', 'Annual_Income', 'Monthly_Inhand_Salary', 'Num_Bank_Accounts', 'Num_Credit_Card',
    'Interest_Rate', 'Num_of_Loan', 'Delay_from_due_date', 'Num_of_Delayed_Payment',
    'Changed_Credit_Limit', 'Num_Credit_Inquiries', 'Outstanding_Debt',
    'Credit_Utilization_Ratio', 'Credit_History_Age', 'Total_EMI_per_month',
    'Amount_invested_monthly', 'Monthly_Balance'
]

print("Please enter numeric feature values:")
for feature in numeric_features:
    value = float(input(f"Enter value for '{feature}': "))
    user_input[feature] = value

# Collect user input for Occupation (choose one)
print("Please select your occupation:")
occupations = [
    'Occupation_Architect', 'Occupation_Developer', 'Occupation_Doctor',
    'Occupation_Engineer', 'Occupation_Entrepreneur', 'Occupation_Journalist',
    'Occupation_Lawyer', 'Occupation_Manager', 'Occupation_Mechanic',
    'Occupation_Media_Manager', 'Occupation_Musician', 'Occupation_Scientist',
    'Occupation_Teacher', 'Occupation_Writer'
]

for i, occupation in enumerate(occupations):
    print(f"{i + 1}: {occupation}")
occupation_choice = int(input("Select your occupation (enter the corresponding number): "))
selected_occupation = occupations[occupation_choice - 1]

# Set selected occupation to 1 in user input
user_input[selected_occupation] = 1

# Collect user input for Type_of_Loan (choose multiple)
print("Please select your loan types (choose multiple):")
loan_types = [
    'ToL_No Data', 'ToL_auto loan', 'ToL_credit-builder loan',
    'ToL_debt consolidation loan', 'ToL_home equity loan', 'ToL_mortgage loan',
    'ToL_not specified', 'ToL_payday loan', 'ToL_personal loan', 'ToL_student loan'
]

selected_loan_types = []
while True:
    print("Select a loan type:")
    for i, loan_type in enumerate(loan_types):
        print(f"{i + 1}: {loan_type}")
    loan_choice = input("Enter the corresponding number or 'done' to finish: ")
    if loan_choice == 'done':
        break
    else:
        loan_choice = int(loan_choice)
        selected_loan_type = loan_types[loan_choice - 1]
        selected_loan_types.append(selected_loan_type)

# Set selected loan types to 1 in user input
for loan_type in selected_loan_types:
    user_input[loan_type] = 1

# Create a DataFrame from user input
user_df = pd.DataFrame(user_input, index=[0])

correct_column_order = [
    'Occupation_Architect', 'Occupation_Developer', 'Occupation_Doctor',
    'Occupation_Engineer', 'Occupation_Entrepreneur', 'Occupation_Journalist',
    'Occupation_Lawyer', 'Occupation_Manager', 'Occupation_Mechanic',
    'Occupation_Media_Manager', 'Occupation_Musician', 'Occupation_Scientist',
    'Occupation_Teacher', 'Occupation_Writer', 'Credit_Mix_Good',
    'Credit_Mix_Standard', 'Payment_of_Min_Amount_No', 'Payment_of_Min_Amount_Yes',
    'Payment_Behaviour_High_spent_Medium_value_payments',
    'Payment_Behaviour_High_spent_Small_value_payments',
    'Payment_Behaviour_Low_spent_Large_value_payments',
    'Payment_Behaviour_Low_spent_Medium_value_payments',
    'Payment_Behaviour_Low_spent_Small_value_payments', 'Age', 'Annual_Income',
    'Monthly_Inhand_Salary', 'Num_Bank_Accounts', 'Num_Credit_Card',
    'Interest_Rate', 'Num_of_Loan', 'Delay_from_due_date', 'Num_of_Delayed_Payment',
    'Changed_Credit_Limit', 'Num_Credit_Inquiries', 'Outstanding_Debt',
    'Credit_Utilization_Ratio', 'Credit_History_Age', 'Total_EMI_per_month',
    'Amount_invested_monthly', 'Monthly_Balance', 'ToL_No Data', 'ToL_auto loan',
    'ToL_credit-builder loan', 'ToL_debt consolidation loan', 'ToL_home equity loan',
    'ToL_mortgage loan', 'ToL_not specified', 'ToL_payday loan', 'ToL_personal loan',
    'ToL_student loan'
]
# Reorder the columns in the user_df DataFrame
user_df = user_df[correct_column_order]
# Display the user's input DataFrame

scaler = MinMaxScaler()

# Fit and transform the user's input DataFrame
user_df_scaled = pd.DataFrame(scaler.fit_transform(user_df), columns=user_df.columns)

user_prediction = best_xgb_model.predict(user_df_scaled)

# Assuming user_prediction contains the predicted credit score (0, 1, or 2)
original_labels = ['Good', 'Poor', 'Standard']
predicted_labels = [original_labels[prediction] for prediction in user_prediction]

# Display the predicted credit score labels
print("Predicted Credit Score Labels:", predicted_labels[0])

Please enter numeric feature values:


Enter value for 'Age':  30
Enter value for 'Annual_Income':  100000
Enter value for 'Monthly_Inhand_Salary':  200000
Enter value for 'Num_Bank_Accounts':  2
Enter value for 'Num_Credit_Card':  2
Enter value for 'Interest_Rate':  1
Enter value for 'Num_of_Loan':  0
Enter value for 'Delay_from_due_date':  0
Enter value for 'Num_of_Delayed_Payment':  0
Enter value for 'Changed_Credit_Limit':  0
Enter value for 'Num_Credit_Inquiries':  2
Enter value for 'Outstanding_Debt':  0
Enter value for 'Credit_Utilization_Ratio':  0
Enter value for 'Credit_History_Age':  0
Enter value for 'Total_EMI_per_month':  0
Enter value for 'Amount_invested_monthly':  100000
Enter value for 'Monthly_Balance':  10000


Please select your occupation:
1: Occupation_Architect
2: Occupation_Developer
3: Occupation_Doctor
4: Occupation_Engineer
5: Occupation_Entrepreneur
6: Occupation_Journalist
7: Occupation_Lawyer
8: Occupation_Manager
9: Occupation_Mechanic
10: Occupation_Media_Manager
11: Occupation_Musician
12: Occupation_Scientist
13: Occupation_Teacher
14: Occupation_Writer


Select your occupation (enter the corresponding number):  7


Please select your loan types (choose multiple):
Select a loan type:
1: ToL_No Data
2: ToL_auto loan
3: ToL_credit-builder loan
4: ToL_debt consolidation loan
5: ToL_home equity loan
6: ToL_mortgage loan
7: ToL_not specified
8: ToL_payday loan
9: ToL_personal loan
10: ToL_student loan


Enter the corresponding number or 'done' to finish:  done


Predicted Credit Score Labels: Standard


In [294]:
import pickle

# Save the trained model to a pickle file
with open('xgb_model.pkl', 'wb') as file:
    pickle.dump(best_xgb_model, file)