In [228]:
import numpy as np
import pandas as pd 
import re
from sklearn.model_selection import train_test_split,StratifiedKFold, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [229]:
df_train = pd.read_csv("../input/credit-dset/train.csv")

  df_train = pd.read_csv("../input/credit-dset/train.csv")


In [230]:
columns_to_convert = [
    'Income_Annual', 'Base_Salary_PerMonth', 'Rate_Of_Interest', 
    'Credit_Limit', 'Current_Debt_Outstanding', 'Ratio_Credit_Utilization', 
    'Per_Month_EMI', 'Monthly_Investment', 'Monthly_Balance'
]

df_train[columns_to_convert] = df_train[columns_to_convert].apply(pd.to_numeric, errors='coerce')

df_train.dtypes

ID                           object
Customer_ID                  object
Month                        object
Name                         object
Age                          object
Number                       object
Profession                   object
Income_Annual               float64
Base_Salary_PerMonth        float64
Total_Bank_Accounts           int64
Total_Credit_Cards            int64
Rate_Of_Interest              int64
Total_Current_Loans          object
Loan_Type                    object
Delay_from_due_date           int64
Total_Delayed_Payments       object
Credit_Limit                float64
Total_Credit_Enquiries      float64
Credit_Mix                   object
Current_Debt_Outstanding    float64
Ratio_Credit_Utilization    float64
Credit_History_Age           object
Payment_of_Min_Amount        object
Per_Month_EMI               float64
Monthly_Investment          float64
Payment_Behaviour            object
Monthly_Balance             float64
Credit_Score                

In [231]:
int_columns_to_convert = [
    'Age', 'Total_Bank_Accounts', 'Rate_Of_Interest', 
    'Total_Current_Loans', 'Delay_from_due_date', 'Total_Delayed_Payments', 
]

df_train[int_columns_to_convert] = df_train[int_columns_to_convert].apply(pd.to_numeric, errors='coerce')

df_train[int_columns_to_convert] = df_train[int_columns_to_convert].astype('Int64')

df_train.dtypes

ID                           object
Customer_ID                  object
Month                        object
Name                         object
Age                           Int64
Number                       object
Profession                   object
Income_Annual               float64
Base_Salary_PerMonth        float64
Total_Bank_Accounts           Int64
Total_Credit_Cards            int64
Rate_Of_Interest              Int64
Total_Current_Loans           Int64
Loan_Type                    object
Delay_from_due_date           Int64
Total_Delayed_Payments        Int64
Credit_Limit                float64
Total_Credit_Enquiries      float64
Credit_Mix                   object
Current_Debt_Outstanding    float64
Ratio_Credit_Utilization    float64
Credit_History_Age           object
Payment_of_Min_Amount        object
Per_Month_EMI               float64
Monthly_Investment          float64
Payment_Behaviour            object
Monthly_Balance             float64
Credit_Score                

In [232]:
df_train['Age'] = df_train['Age'].apply(lambda x: x if 0 <= x <= 100 else np.nan)

df_train['Number'] = df_train['Number'].apply(lambda x: x if re.match(r'^\d{3}-\d{2}-\d{4}$', str(x)) else np.nan)

df_train['Total_Bank_Accounts'] = df_train['Total_Bank_Accounts'].apply(lambda x: x if x > 0 else np.nan)

df_train['Total_Credit_Cards'] = df_train['Total_Credit_Cards'].apply(lambda x: x if x > 0 else np.nan)

df_train['Rate_Of_Interest'] = df_train['Rate_Of_Interest'].apply(lambda x: x if x >= 0 else np.nan)

df_train.shape

(80000, 28)

In [233]:
null_value_percentages=(df_train.isna().sum()/df_train.shape[0])*100
null_value_percentages

ID                           0.00000
Customer_ID                  0.00000
Month                        0.00000
Name                        10.03625
Age                          7.64625
Number                       5.55375
Profession                   0.00000
Income_Annual                6.94375
Base_Salary_PerMonth        15.04000
Total_Bank_Accounts          4.33750
Total_Credit_Cards           0.01375
Rate_Of_Interest             0.00000
Total_Current_Loans          4.77000
Loan_Type                   11.44625
Delay_from_due_date          0.00000
Total_Delayed_Payments       9.74375
Credit_Limit                 2.11375
Total_Credit_Enquiries       1.93625
Credit_Mix                   0.00000
Current_Debt_Outstanding     1.01000
Ratio_Credit_Utilization     0.00000
Credit_History_Age           9.05000
Payment_of_Min_Amount        0.00000
Per_Month_EMI                0.00000
Monthly_Investment           8.85500
Payment_Behaviour            0.00000
Monthly_Balance              1.19625
C

In [234]:
rows_to_drop = null_value_percentages[(null_value_percentages > 0) & (null_value_percentages < 5)].index
rows_to_drop

Index(['Total_Bank_Accounts', 'Total_Credit_Cards', 'Total_Current_Loans',
       'Credit_Limit', 'Total_Credit_Enquiries', 'Current_Debt_Outstanding',
       'Monthly_Balance'],
      dtype='object')

In [235]:
for row in rows_to_drop:
	if(null_value_percentages[row]<5):
		df_train.drop(labels=df_train.index[df_train[row].isna()],inplace=True)

df_train.shape

(68396, 28)

In [236]:
df_test = pd.read_csv("../input/credit-dset/test.csv")
df_train.fillna(df_train.median(numeric_only=True), inplace=True)
null_value_percentages=(df_train.isna().sum()/df_train.shape[0])*100
null_value_percentages

ID                           0.000000
Customer_ID                  0.000000
Month                        0.000000
Name                        10.022516
Age                          0.000000
Number                       5.589508
Profession                   0.000000
Income_Annual                0.000000
Base_Salary_PerMonth         0.000000
Total_Bank_Accounts          0.000000
Total_Credit_Cards           0.000000
Rate_Of_Interest             0.000000
Total_Current_Loans          0.000000
Loan_Type                   11.110299
Delay_from_due_date          0.000000
Total_Delayed_Payments       0.000000
Credit_Limit                 0.000000
Total_Credit_Enquiries       0.000000
Credit_Mix                   0.000000
Current_Debt_Outstanding     0.000000
Ratio_Credit_Utilization     0.000000
Credit_History_Age           9.013685
Payment_of_Min_Amount        0.000000
Per_Month_EMI                0.000000
Monthly_Investment           0.000000
Payment_Behaviour            0.000000
Monthly_Bala

In [237]:
test_ids = df_test['ID'].copy() 

In [238]:
df_test[columns_to_convert] = df_test[columns_to_convert].apply(pd.to_numeric, errors='coerce')

df_test[int_columns_to_convert] = df_test[int_columns_to_convert].apply(pd.to_numeric, errors='coerce')

df_test[int_columns_to_convert] = df_test[int_columns_to_convert].astype('Int64')


In [239]:
for dataset in [df_train, df_test]:
    dataset['Debt_Income_Ratio'] = dataset['Current_Debt_Outstanding'] / dataset['Income_Annual']
    dataset['Income_Credit_Limit_Ratio'] = dataset['Income_Annual'] / dataset['Credit_Limit']
    dataset['Debt_Credit_Limit_Ratio'] = dataset['Current_Debt_Outstanding'] / dataset['Credit_Limit']

    dataset.replace([np.inf, -np.inf], np.nan, inplace=True)
    dataset.fillna(df_train.median(numeric_only=True), inplace=True)

In [240]:
label_encoder = LabelEncoder()
df_train['Credit_Score'] = label_encoder.fit_transform(df_train['Credit_Score'])

In [241]:
df_test['Age'] = df_test['Age'].apply(lambda x: x if 0 <= x <= 100 else np.nan)

df_test['Number'] = df_test['Number'].apply(lambda x: x if re.match(r'^\d{3}-\d{2}-\d{4}$', str(x)) else np.nan)

df_test['Total_Bank_Accounts'] = df_test['Total_Bank_Accounts'].apply(lambda x: x if x > 0 else np.nan)

df_test['Total_Credit_Cards'] = df_test['Total_Credit_Cards'].apply(lambda x: x if x > 0 else np.nan)

df_test['Rate_Of_Interest'] = df_test['Rate_Of_Interest'].apply(lambda x: x if x >= 0 else np.nan)

In [242]:
X_train = df_train.drop(columns='Credit_Score')
y_train = df_train['Credit_Score']

numerical_features = X_train.select_dtypes(include=['float64', 'int64']).columns
categorical_features = X_train.select_dtypes(include=['object']).columns

numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer([
    ('num', numerical_pipeline, numerical_features),
    ('cat', categorical_pipeline, categorical_features)
])

model = XGBClassifier(learning_rate=0.05, max_depth=6, n_estimators=300, random_state=42, use_label_encoder=False, eval_metric='mlogloss')
pipeline = Pipeline([('preprocessor', preprocessor), ('classifier', model)])

# Parameter grid for hyperparameter tuning
param_grid = {
    'classifier__learning_rate': [0.05, 0.1],
    'classifier__max_depth': [6],
    'classifier__n_estimators': [100, 300]
}

# Rerun the grid search with the simpler parameter grid
try:
    grid_search = GridSearchCV(pipeline, param_grid, cv=cv, scoring='accuracy', n_jobs=-1, verbose=1)
    grid_search.fit(X_train, y_train)
    best_pipeline = grid_search.best_estimator_
    print("Best parameters found:", grid_search.best_params_)
    print("Best cross-validation accuracy:", grid_search.best_score_)
except Exception as e:
    print("An error occurred during grid search:", e)


# Make predictions on the test set using the best pipeline
test_predictions = best_pipeline.predict(df_test)
test_predictions_labels = label_encoder.inverse_transform(test_predictions)

# Prepare the submission file
submission = pd.DataFrame({'ID': test_ids, 'Credit_Score': test_predictions_labels})
submission.to_csv('submission.csv', index=False)

print("Submission file 'submission.csv' created successfully!") 

Fitting 5 folds for each of 4 candidates, totalling 20 fits


Parameters: { "use_label_encoder" } are not used.



Best parameters found: {'classifier__learning_rate': 0.1, 'classifier__max_depth': 6, 'classifier__n_estimators': 300}
Best cross-validation accuracy: 0.7299988510582448
Submission file 'submission.csv' created successfully!


In [3]:
# string -> no of months
def convert_to_months(age_str):
    if pd.isna(age_str):
        return np.nan
    else : 
        parts = age_str.split(' and ')
        years = int(parts[0].split()[0])
        months = int(parts[1].split()[0])
        total_months = (years * 12) + months
        return total_months

In [47]:
for col in ['Current_Debt_Outstanding', 'Income_Annual', 'Credit_Limit', 'Age']:
    train_data[col] = pd.to_numeric(train_data[col].astype(str).str.replace('_', '', regex=False), errors='coerce')
    test_data[col] = pd.to_numeric(test_data[col].astype(str).str.replace('_', '', regex=False), errors='coerce')

NameError: name 'train_data' is not defined

In [4]:
loan_type_col = df_train['Loan_Type']
# dropping columns
df_train = df_train.drop(['Name','Loan_Type'], axis=1)
# base salary -> number
df_train['Base_Salary_PerMonth'] = pd.to_numeric(df_train['Base_Salary_PerMonth'],downcast = 'float',errors = 'coerce')
# Total delayed payments (removing underscores if present) -> number
df_train['Total_Delayed_Payments'] = df_train['Total_Delayed_Payments'].str.replace(r'[^-0-9]', '', regex=True)
df_train['Total_Delayed_Payments'] = pd.to_numeric(df_train['Total_Delayed_Payments'],downcast = 'float',errors = 'coerce')
# credit history age -> number (to no of months)
df_train['Credit_History_Age'] = df_train['Credit_History_Age'].apply(convert_to_months)
df_train['Credit_History_Age'] = pd.to_numeric(df_train['Credit_History_Age'],downcast = 'float',errors = 'coerce')

In [5]:
print(df_train.info(),end = "\n\n")
print(loan_type_col)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80000 entries, 0 to 79999
Data columns (total 26 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   ID                        80000 non-null  object 
 1   Customer_ID               80000 non-null  object 
 2   Month                     80000 non-null  object 
 3   Age                       80000 non-null  object 
 4   Number                    80000 non-null  object 
 5   Profession                80000 non-null  object 
 6   Income_Annual             80000 non-null  object 
 7   Base_Salary_PerMonth      67968 non-null  float32
 8   Total_Bank_Accounts       80000 non-null  int64  
 9   Total_Credit_Cards        80000 non-null  int64  
 10  Rate_Of_Interest          80000 non-null  int64  
 11  Total_Current_Loans       80000 non-null  object 
 12  Delay_from_due_date       80000 non-null  int64  
 13  Total_Delayed_Payments    74405 non-null  float32
 14  Credit

In [6]:
df_train.drop_duplicates(inplace=True)
print(df_train.isna().sum().to_string())
print(df_train.shape)

ID                              0
Customer_ID                     0
Month                           0
Age                             0
Number                          0
Profession                      0
Income_Annual                   0
Base_Salary_PerMonth        12032
Total_Bank_Accounts             0
Total_Credit_Cards              0
Rate_Of_Interest                0
Total_Current_Loans             0
Delay_from_due_date             0
Total_Delayed_Payments       5595
Credit_Limit                    0
Total_Credit_Enquiries       1549
Credit_Mix                      0
Current_Debt_Outstanding        0
Ratio_Credit_Utilization        0
Credit_History_Age           7240
Payment_of_Min_Amount           0
Per_Month_EMI                   0
Monthly_Investment           3605
Payment_Behaviour               0
Monthly_Balance               950
Credit_Score                    0
(80000, 26)


In [7]:
null_percentages=(df_train.isna().sum()/df_train.shape[0])*100
null_cols = null_percentages.loc[null_percentages > 0]
null_cols

Base_Salary_PerMonth      15.04000
Total_Delayed_Payments     6.99375
Total_Credit_Enquiries     1.93625
Credit_History_Age         9.05000
Monthly_Investment         4.50625
Monthly_Balance            1.18750
dtype: float64

In [8]:
rows_to_drop = null_cols.loc[null_cols < 5]
df_train.dropna(subset = rows_to_drop.keys(),inplace=True,how='any',axis=0)
print(df_train.isna().sum().to_string())
print(df_train.shape)

ID                              0
Customer_ID                     0
Month                           0
Age                             0
Number                          0
Profession                      0
Income_Annual                   0
Base_Salary_PerMonth        11110
Total_Bank_Accounts             0
Total_Credit_Cards              0
Rate_Of_Interest                0
Total_Current_Loans             0
Delay_from_due_date             0
Total_Delayed_Payments       5184
Credit_Limit                    0
Total_Credit_Enquiries          0
Credit_Mix                      0
Current_Debt_Outstanding        0
Ratio_Credit_Utilization        0
Credit_History_Age           6694
Payment_of_Min_Amount           0
Per_Month_EMI                   0
Monthly_Investment              0
Payment_Behaviour               0
Monthly_Balance                 0
Credit_Score                    0
(74028, 26)


In [9]:
columns_to_drop = null_cols.loc[null_cols > 40]
df_train.drop(columns = columns_to_drop.keys(),inplace = True)
df_train.drop_duplicates(inplace=True)
print(df_train.shape)

(74028, 26)


In [10]:
null_percentages=(df_train.isna().sum()/df_train.shape[0])*100
null_cols = null_percentages.loc[null_percentages > 0]
print(null_cols,end = "\n\n")
col_impute = null_cols.loc[(null_cols >= 5) & (null_cols < 40)]
for column in col_impute.keys():
    central_tend = df_train[column].mean()
    df_train[column] = df_train[column].fillna(central_tend)
print(df_train.info())

Base_Salary_PerMonth      15.007835
Total_Delayed_Payments     7.002756
Credit_History_Age         9.042524
dtype: float64

<class 'pandas.core.frame.DataFrame'>
Index: 74028 entries, 0 to 79999
Data columns (total 26 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   ID                        74028 non-null  object 
 1   Customer_ID               74028 non-null  object 
 2   Month                     74028 non-null  object 
 3   Age                       74028 non-null  object 
 4   Number                    74028 non-null  object 
 5   Profession                74028 non-null  object 
 6   Income_Annual             74028 non-null  object 
 7   Base_Salary_PerMonth      74028 non-null  float32
 8   Total_Bank_Accounts       74028 non-null  int64  
 9   Total_Credit_Cards        74028 non-null  int64  
 10  Rate_Of_Interest          74028 non-null  int64  
 11  Total_Current_Loans       74028 non-null  object 
 1