In [16]:
import numpy as np
import pandas as pd 
import re
from sklearn.model_selection import train_test_split,StratifiedKFold, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [17]:
df_train = pd.read_csv("../input/credit-dset/train.csv")

  df_train = pd.read_csv("../input/credit-dset/train.csv")


In [18]:
columns_to_convert = [
    'Income_Annual', 'Base_Salary_PerMonth', 'Rate_Of_Interest', 
    'Credit_Limit', 'Current_Debt_Outstanding', 'Ratio_Credit_Utilization', 
    'Per_Month_EMI', 'Monthly_Investment', 'Monthly_Balance'
]

df_train[columns_to_convert] = df_train[columns_to_convert].apply(pd.to_numeric, errors='coerce')

df_train.dtypes

ID                           object
Customer_ID                  object
Month                        object
Name                         object
Age                          object
Number                       object
Profession                   object
Income_Annual               float64
Base_Salary_PerMonth        float64
Total_Bank_Accounts           int64
Total_Credit_Cards            int64
Rate_Of_Interest              int64
Total_Current_Loans          object
Loan_Type                    object
Delay_from_due_date           int64
Total_Delayed_Payments       object
Credit_Limit                float64
Total_Credit_Enquiries      float64
Credit_Mix                   object
Current_Debt_Outstanding    float64
Ratio_Credit_Utilization    float64
Credit_History_Age           object
Payment_of_Min_Amount        object
Per_Month_EMI               float64
Monthly_Investment          float64
Payment_Behaviour            object
Monthly_Balance             float64
Credit_Score                

In [19]:
int_columns_to_convert = [
    'Age', 'Total_Bank_Accounts', 'Rate_Of_Interest', 
    'Total_Current_Loans', 'Delay_from_due_date', 'Total_Delayed_Payments', 
]

df_train[int_columns_to_convert] = df_train[int_columns_to_convert].apply(pd.to_numeric, errors='coerce')

df_train[int_columns_to_convert] = df_train[int_columns_to_convert].astype('Int64')

df_train.dtypes

ID                           object
Customer_ID                  object
Month                        object
Name                         object
Age                           Int64
Number                       object
Profession                   object
Income_Annual               float64
Base_Salary_PerMonth        float64
Total_Bank_Accounts           Int64
Total_Credit_Cards            int64
Rate_Of_Interest              Int64
Total_Current_Loans           Int64
Loan_Type                    object
Delay_from_due_date           Int64
Total_Delayed_Payments        Int64
Credit_Limit                float64
Total_Credit_Enquiries      float64
Credit_Mix                   object
Current_Debt_Outstanding    float64
Ratio_Credit_Utilization    float64
Credit_History_Age           object
Payment_of_Min_Amount        object
Per_Month_EMI               float64
Monthly_Investment          float64
Payment_Behaviour            object
Monthly_Balance             float64
Credit_Score                

In [20]:
df_train['Age'] = df_train['Age'].apply(lambda x: x if 0 <= x <= 100 else np.nan)

df_train['Number'] = df_train['Number'].apply(lambda x: x if re.match(r'^\d{3}-\d{2}-\d{4}$', str(x)) else np.nan)

df_train['Total_Bank_Accounts'] = df_train['Total_Bank_Accounts'].apply(lambda x: x if x > 0 else np.nan)

df_train['Total_Credit_Cards'] = df_train['Total_Credit_Cards'].apply(lambda x: x if x > 0 else np.nan)

df_train['Rate_Of_Interest'] = df_train['Rate_Of_Interest'].apply(lambda x: x if x >= 0 else np.nan)

df_train.shape

(80000, 28)

In [21]:
null_value_percentages=(df_train.isna().sum()/df_train.shape[0])*100
null_value_percentages

ID                           0.00000
Customer_ID                  0.00000
Month                        0.00000
Name                        10.03625
Age                          7.64625
Number                       5.55375
Profession                   0.00000
Income_Annual                6.94375
Base_Salary_PerMonth        15.04000
Total_Bank_Accounts          4.33750
Total_Credit_Cards           0.01375
Rate_Of_Interest             0.00000
Total_Current_Loans          4.77000
Loan_Type                   11.44625
Delay_from_due_date          0.00000
Total_Delayed_Payments       9.74375
Credit_Limit                 2.11375
Total_Credit_Enquiries       1.93625
Credit_Mix                   0.00000
Current_Debt_Outstanding     1.01000
Ratio_Credit_Utilization     0.00000
Credit_History_Age           9.05000
Payment_of_Min_Amount        0.00000
Per_Month_EMI                0.00000
Monthly_Investment           8.85500
Payment_Behaviour            0.00000
Monthly_Balance              1.19625
C

In [22]:
rows_to_drop = null_value_percentages[(null_value_percentages > 0) & (null_value_percentages < 5)].index
rows_to_drop

Index(['Total_Bank_Accounts', 'Total_Credit_Cards', 'Total_Current_Loans',
       'Credit_Limit', 'Total_Credit_Enquiries', 'Current_Debt_Outstanding',
       'Monthly_Balance'],
      dtype='object')

In [23]:
for row in rows_to_drop:
	if(null_value_percentages[row]<5):
		df_train.drop(labels=df_train.index[df_train[row].isna()],inplace=True)

df_train.shape

(68396, 28)

In [24]:
df_test = pd.read_csv("../input/credit-dset/test.csv")
df_train.fillna(df_train.median(numeric_only=True), inplace=True)
null_value_percentages=(df_train.isna().sum()/df_train.shape[0])*100
null_value_percentages

ID                           0.000000
Customer_ID                  0.000000
Month                        0.000000
Name                        10.022516
Age                          0.000000
Number                       5.589508
Profession                   0.000000
Income_Annual                0.000000
Base_Salary_PerMonth         0.000000
Total_Bank_Accounts          0.000000
Total_Credit_Cards           0.000000
Rate_Of_Interest             0.000000
Total_Current_Loans          0.000000
Loan_Type                   11.110299
Delay_from_due_date          0.000000
Total_Delayed_Payments       0.000000
Credit_Limit                 0.000000
Total_Credit_Enquiries       0.000000
Credit_Mix                   0.000000
Current_Debt_Outstanding     0.000000
Ratio_Credit_Utilization     0.000000
Credit_History_Age           9.013685
Payment_of_Min_Amount        0.000000
Per_Month_EMI                0.000000
Monthly_Investment           0.000000
Payment_Behaviour            0.000000
Monthly_Bala

In [25]:
test_ids = df_test['ID'].copy() 

In [26]:
df_test[columns_to_convert] = df_test[columns_to_convert].apply(pd.to_numeric, errors='coerce')

df_test[int_columns_to_convert] = df_test[int_columns_to_convert].apply(pd.to_numeric, errors='coerce')

df_test[int_columns_to_convert] = df_test[int_columns_to_convert].astype('Int64')


In [27]:
for dataset in [df_train, df_test]:
    dataset['Debt_Income_Ratio'] = dataset['Current_Debt_Outstanding'] / dataset['Income_Annual']
    dataset['Income_Credit_Limit_Ratio'] = dataset['Income_Annual'] / dataset['Credit_Limit']
    dataset['Debt_Credit_Limit_Ratio'] = dataset['Current_Debt_Outstanding'] / dataset['Credit_Limit']

    dataset.replace([np.inf, -np.inf], np.nan, inplace=True)
    dataset.fillna(df_train.median(numeric_only=True), inplace=True)

In [28]:
label_encoder = LabelEncoder()
df_train['Credit_Score'] = label_encoder.fit_transform(df_train['Credit_Score'])

In [29]:
df_test['Age'] = df_test['Age'].apply(lambda x: x if 0 <= x <= 100 else np.nan)

df_test['Number'] = df_test['Number'].apply(lambda x: x if re.match(r'^\d{3}-\d{2}-\d{4}$', str(x)) else np.nan)

df_test['Total_Bank_Accounts'] = df_test['Total_Bank_Accounts'].apply(lambda x: x if x > 0 else np.nan)

df_test['Total_Credit_Cards'] = df_test['Total_Credit_Cards'].apply(lambda x: x if x > 0 else np.nan)

df_test['Rate_Of_Interest'] = df_test['Rate_Of_Interest'].apply(lambda x: x if x >= 0 else np.nan)

In [None]:
X_train = df_train.drop(columns='Credit_Score')
y_train = df_train['Credit_Score']

numerical_features = X_train.select_dtypes(include=['float64', 'int64']).columns
categorical_features = X_train.select_dtypes(include=['object']).columns

numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer([
    ('num', numerical_pipeline, numerical_features),
    ('cat', categorical_pipeline, categorical_features)
])

model = XGBClassifier(learning_rate=0.05, max_depth=6, n_estimators=300, random_state=42, use_label_encoder=False, eval_metric='mlogloss')
pipeline = Pipeline([('preprocessor', preprocessor), ('classifier', model)])

# Parameter grid for hyperparameter tuning
param_grid = {
    'classifier__learning_rate': [0.05, 0.1],
    'classifier__max_depth': [6],
    'classifier__n_estimators': [100, 300]
}

grid_search = GridSearchCV(pipeline, param_grid, , scoring='accuracy', n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

# Rerun the grid search with the simpler parameter grid
best_pipeline =grid_search.best_estimator_

print("Best parameters found:", grid_search.best_params_)
print("Best cross-validation accuracy:", grid_search.best_score_)


# Make predictions on the test set using the best pipeline
test_predictions = best_pipeline.predict(df_test)
test_predictions_labels = label_encoder.inverse_transform(test_predictions)

# Prepare the submission file
submission = pd.DataFrame({'ID': test_ids, 'Credit_Score': test_predictions_labels})
submission.to_csv('submission.csv', index=False)

print("Submission file 'submission.csv' created successfully!") 

An error occurred during grid search: name 'cv' is not defined


AttributeError: 'NoneType' object has no attribute 'predict'