In [23]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

In [24]:
# Locations of the pre-cleaned train / test CSV files.
train_path = '../input/credit-dset/clean_trained_outlier.csv'
test_path = '../input/credit-dset/test_cleaned_outlier.csv'

# Load both files (train first, then test).
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

# The alternate test file carries an extra 'Number' column; it is removed
# only when test_path is switched to that file.
if test_path == '../input/credit-dset/test_cleaned.csv':
    test_df = test_df.drop(columns=['Number'])

In [25]:
# Sort every column except the target into categorical (object dtype)
# or numerical (anything else).
categorical_cols, numerical_cols = [], []

for col in train_df.columns:
    if col == 'Credit_Score':
        print('Skipping Credit_Score column')
        continue
    bucket = categorical_cols if train_df[col].dtype == 'object' else numerical_cols
    bucket.append(col)

print(categorical_cols)
print(numerical_cols)

Skipping Credit_Score column
['Month', 'Profession', 'Credit_Mix', 'Payment_of_Min_Amount', 'Payment_Behaviour']
['Age', 'Income_Annual', 'Base_Salary_PerMonth', 'Total_Bank_Accounts', 'Total_Credit_Cards', 'Rate_Of_Interest', 'Delay_from_due_date', 'Total_Delayed_Payments', 'Credit_Limit', 'Total_Credit_Enquiries', 'Current_Debt_Outstanding', 'Ratio_Credit_Utilization', 'Credit_History_Age', 'Per_Month_EMI', 'Monthly_Investment', 'Monthly_Balance', 'Payday Loan', 'Mortgage Loan', 'Debt Consolidation Loan', 'Student Loan', 'Not Specified', 'Auto Loan', 'Credit-Builder Loan', 'Personal Loan', 'Home Equity Loan', 'Total_Current_Loans']


In [26]:
# '<spent>_spent_<payment>_payments' -> (spent, payment)
def convert_to_2_cols(s):
    """Split one string into the parts before and after '_spent_'.

    The text before the first '_spent_' becomes the first element. The text
    between that separator and the next '_spent_' (or the end of the string),
    cut off at the first '_payments' if present, becomes the second.

    Parameters
    ----------
    s : str
        String containing the '_spent_' separator.

    Returns
    -------
    pd.Series
        Two-element Series ``[spent, payment]``.

    Raises
    ------
    IndexError
        If ``s`` does not contain '_spent_'.
    """
    pieces = s.split('_spent_')
    spend_level = pieces[0]
    # partition(...)[0] keeps everything before the first '_payments'
    # (or the whole remainder when the suffix is absent).
    payment_size = pieces[1].partition('_payments')[0]
    return pd.Series([spend_level, payment_size])

# Replace Payment_Behaviour with its two components ('Spent' and
# 'Value_Payments') in both frames, mutating each frame in place.
for frame in (train_df, test_df):
    frame[['Spent', 'Value_Payments']] = frame['Payment_Behaviour'].apply(convert_to_2_cols)
    frame.drop(columns=['Payment_Behaviour'], inplace=True)

In [27]:
# --- One-hot encoding of the nominal columns -------------------------------
# The same OneHotEncoder instance is fitted twice, so after this cell it only
# holds the categories of the second fit (Profession / Payment_of_Min_Amount).
onehot_cols = ['Profession', 'Payment_of_Min_Amount']
encoder = OneHotEncoder(sparse_output=False)

month_encoded = encoder.fit_transform(train_df[['Month']])
month_df = pd.DataFrame(month_encoded, columns=encoder.get_feature_names_out(['Month']))

encoded_columns = encoder.fit_transform(train_df[onehot_cols])
encoded_df = pd.DataFrame(encoded_columns, columns=encoder.get_feature_names_out(onehot_cols))

# Append the indicator columns, then remove the raw string columns.
train_df = pd.concat([train_df, encoded_df, month_df], axis=1).drop(
    columns=['Month', *onehot_cols]
)

# --- Integer codes for the remaining string columns ------------------------
# label_encoder is re-fitted per column; it ends up fitted on 'Value_Payments'.
label_encoder = LabelEncoder()
for label_col in ('Credit_Mix', 'Spent', 'Value_Payments'):
    train_df[label_col] = label_encoder.fit_transform(train_df[label_col])

In [28]:
# Separate the target column from the feature matrix.
y = train_df['Credit_Score']
X = train_df.drop(columns='Credit_Score')

# Display the dtype of every feature column.
X.dtypes

Age                          float64
Income_Annual                float64
Base_Salary_PerMonth         float64
Total_Bank_Accounts          float64
Total_Credit_Cards           float64
Rate_Of_Interest             float64
Delay_from_due_date          float64
Total_Delayed_Payments       float64
Credit_Limit                 float64
Total_Credit_Enquiries       float64
Credit_Mix                     int32
Current_Debt_Outstanding     float64
Ratio_Credit_Utilization     float64
Credit_History_Age           float64
Per_Month_EMI                float64
Monthly_Investment           float64
Monthly_Balance              float64
Payday Loan                    int64
Mortgage Loan                  int64
Debt Consolidation Loan        int64
Student Loan                   int64
Not Specified                  int64
Auto Loan                      int64
Credit-Builder Loan            int64
Personal Loan                  int64
Home Equity Loan               int64
Total_Current_Loans            int64
S

In [29]:
# Hold out 25% of the labelled rows for validation (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [30]:
from sklearn.preprocessing import StandardScaler

# Standardise the features: the scaler is fitted on the training split only
# and then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Logistic regression with scikit-learn's default settings.
model = LogisticRegression()

In [31]:

# Train on the scaled training split and predict the held-out split
# (fit() returns the estimator itself, so the calls can be chained).
y_pred = model.fit(X_train, y_train).predict(X_test)

# Score the held-out predictions.
conf_matrix = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 0.64155
Confusion Matrix:
 [[2035   73 1471]
 [ 318 2965 2518]
 [1261 1528 7831]]


In [32]:
# Encode the test set with the *training* category vocabularies.
#
# Fitting the encoders on test_df itself only reproduces the layout the model
# was trained on when both files contain exactly the same category values;
# otherwise one-hot columns or integer codes shift. The vocabularies are
# therefore re-derived from the training file and test_df is only transformed.
train_categories = pd.read_csv(
    train_path,
    usecols=['Month', 'Profession', 'Payment_of_Min_Amount', 'Credit_Mix', 'Payment_Behaviour'],
)
# Only the distinct labels matter for fitting, so each unique
# Payment_Behaviour value is split once (column 0 = spent, column 1 = payment).
train_behaviour = train_categories['Payment_Behaviour'].drop_duplicates().apply(convert_to_2_cols)

# --- One-hot columns --------------------------------------------------------
# With the default handle_unknown='error', a category that never occurred in
# training raises here instead of producing an extra, unexpected column.
onehot_cols = ['Profession', 'Payment_of_Min_Amount']

month_encoder = OneHotEncoder(sparse_output=False).fit(train_categories[['Month']])
month_encoded = month_encoder.transform(test_df[['Month']])
month_df = pd.DataFrame(
    month_encoded,
    columns=month_encoder.get_feature_names_out(['Month']),
    index=test_df.index,  # keep rows aligned with test_df in the concat below
)

category_encoder = OneHotEncoder(sparse_output=False).fit(train_categories[onehot_cols])
encoded_columns = category_encoder.transform(test_df[onehot_cols])
encoded_df = pd.DataFrame(
    encoded_columns,
    columns=category_encoder.get_feature_names_out(onehot_cols),
    index=test_df.index,
)

# Same column order as the training frame: original columns, then the
# Profession / Payment_of_Min_Amount indicators, then the Month indicators.
test_df = pd.concat([test_df, encoded_df, month_df], axis=1).drop(
    columns=['Month', *onehot_cols]
)

# --- Integer-coded columns --------------------------------------------------
# One encoder per column, fitted on the training labels; transform() raises
# ValueError if the test data contains a label that was not seen in training.
label_vocabularies = {
    'Credit_Mix': train_categories['Credit_Mix'],
    'Spent': train_behaviour[0],
    'Value_Payments': train_behaviour[1],
}
for label_col, train_labels in label_vocabularies.items():
    test_df[label_col] = LabelEncoder().fit(train_labels).transform(test_df[label_col])

In [33]:
# Output file for the predictions; also used in the confirmation message so
# the message always names the file that was actually written.
submission_path = 'logistic_regression_submission.csv'

# Scale the test features with the scaler fitted on the training split
# ('ID' is an identifier, not a feature) and predict.
X_test_final = scaler.transform(test_df.drop(columns=['ID']))
test_preds = model.predict(X_test_final)

submission = pd.DataFrame({
    'ID': test_df['ID'],
    'Credit_Score': test_preds,
})

submission.to_csv(submission_path, index=False)

print(f"Submission file '{submission_path}' created successfully.")

Submission file 'submission.csv' created successfully.
