
Problem 1: Confirmation of competition contents
What to learn and what to predict?
The goal is to predict the probability of a customer defaulting on a loan. The dataset includes various features related to the client's background, financial history, and other relevant information.

What kind of file to create and submit to Kaggle?
You need to submit a CSV file with two columns: SK_ID_CURR (the unique identifier for each loan application) and TARGET (the predicted probability of default).

What kind of index value will be used to evaluate the submissions?
The submissions will be evaluated based on the area under the ROC curve (AUC-ROC). The AUC-ROC score measures the model's ability to distinguish between the positive and negative classes.

Problem 2: Creating a baseline model


In [3]:
pip install scikit-learn pandas


Note: you may need to restart the kernel to use updated packages.


In [5]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

# Load the training data
train_data = pd.read_csv("application_train.csv")

# Basic analysis
print(train_data.shape)
print(train_data.info())

# Select features and target variable
X = train_data.drop(['TARGET'], axis=1)
y = train_data['TARGET']

# Identify categorical and numerical features
categorical_features = X.select_dtypes(include=['object']).columns
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns

# Create preprocessing pipelines for numerical and categorical features
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine the transformers using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Combine preprocessing with the logistic regression model
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', LogisticRegression())])

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Make predictions on the validation set
y_pred = model.predict_proba(X_val)[:, 1]

# Evaluate using AUC-ROC
auc_roc = roc_auc_score(y_val, y_pred)
print(f"AUC-ROC Score: {auc_roc}")


(307511, 122)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 307511 entries, 0 to 307510
Columns: 122 entries, SK_ID_CURR to AMT_REQ_CREDIT_BUREAU_YEAR
dtypes: float64(65), int64(41), object(16)
memory usage: 286.2+ MB
None


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


AUC-ROC Score: 0.7453642994738289


In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Load the training data
train_data = pd.read_csv("application_train.csv")

# Select features and target variable
X = train_data.drop(['TARGET'], axis=1)
y = train_data['TARGET']

# Identify categorical and numerical features
categorical_features = X.select_dtypes(include=['object']).columns
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns

# Create preprocessing pipelines for numerical and categorical features
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine the transformers using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Combine preprocessing with the random forest classifier
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', RandomForestClassifier(random_state=42))])

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Make predictions on the validation set
y_pred = model.predict_proba(X_val)[:, 1]

# Evaluate using AUC-ROC
auc_roc = roc_auc_score(y_val, y_pred)
print(f"AUC-ROC Score on Validation Data: {auc_roc}")


AUC-ROC Score on Validation Data: 0.7091623165404071


In [9]:
# Load the test data
test_data = pd.read_csv("./application_test.csv")

# Make predictions on the test set
test_predictions = model.predict_proba(test_data)[:, 1]

# Create a submission dataframe
submission_df = pd.DataFrame({
    'SK_ID_CURR': test_data['SK_ID_CURR'],
    'TARGET': test_predictions
})

# Save the submission file
submission_df.to_csv("credit_default_submission.csv", index=False)

# Display a message indicating successful submission
print("Submission file created successfully.")


Submission file created successfully.


In [11]:
import pandas as pd
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Load the training data
train_data = pd.read_csv("application_train.csv")

# Feature Engineering - Pattern 1: Adding interaction terms
train_data['EXT_SOURCE_3 * DAYS_BIRTH'] = train_data['EXT_SOURCE_3'] * train_data['DAYS_BIRTH']

# Feature Engineering - Pattern 2: Creating a debt-to-income ratio feature
train_data['debt_to_income_ratio'] = train_data['AMT_CREDIT'] / train_data['AMT_INCOME_TOTAL']

# Feature Engineering - Pattern 3: Handling family size outliers
train_data['CNT_FAM_MEMBERS'] = train_data['CNT_FAM_MEMBERS'].clip(1, 5)

# Feature Engineering - Pattern 4: Creating a binary flag for missing values in EXT_SOURCE_3
train_data['EXT_SOURCE_3_missing'] = train_data['EXT_SOURCE_3'].isnull()

# Feature Engineering - Pattern 5: Log-transforming skewed features
skewed_features = ['AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY']
for feature in skewed_features:
    train_data[feature + '_log'] = np.log1p(train_data[feature])

# Select features and target variable
X = train_data.drop(['TARGET'], axis=1)
y = train_data['TARGET']

# Identify categorical and numerical features
categorical_features = X.select_dtypes(include=['object']).columns
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns

# Create preprocessing pipelines for numerical and categorical features
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine the transformers using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Combine preprocessing with the random forest classifier
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', RandomForestClassifier(random_state=42))])

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Make predictions on the validation set
y_pred = model.predict_proba(X_val)[:, 1]

# Evaluate using AUC-ROC
auc_roc = roc_auc_score(y_val, y_pred)
print(f"AUC-ROC Score on Validation Data (Feature Engineering): {auc_roc}")


AUC-ROC Score on Validation Data (Feature Engineering): 0.7080162649654906
