1. What to learn and predict?

Learn patterns from historical loan application data to predict whether applicants will repay loans or default

Predict, for each applicant, the probability (0-1) of repayment difficulty, i.e. of default (binary classification)

2. Submission file requirements:

CSV file with exactly two columns:

SK_ID_CURR (application ID)

TARGET (predicted probability of default)

File name should be "submission.csv"

3. Evaluation metric:

AUC-ROC (Area Under the Receiver Operating Characteristic Curve)

Measures how well model distinguishes defaulters from non-defaulters

Range: 0.0 to 1.0, where 0.5 corresponds to random guessing and 1.0 to perfect prediction (scores below 0.5 mean the ranking is worse than random)

Higher values indicate better performance

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Read the application tables: `train` carries the TARGET label, `test` is
# the set that is scored for the submission.
train = pd.read_csv(r"C:\Users\Admin\Downloads\application_train.csv.zip")
test = pd.read_csv(r"C:\Users\Admin\Downloads\application_test.csv.zip")

# Quick sanity check: table sizes and the class balance of the label
print("Train shape:", train.shape)
print("Test shape:", test.shape)
print(
    "\nTarget distribution (1=Default, 0=Repaid):",
    train['TARGET'].value_counts(normalize=True),
    sep="\n",
)

# Selecting numerical features
numerical_features = train.select_dtypes(include=['int64', 'float64']).columns
numerical_features = numerical_features.drop(['SK_ID_CURR', 'TARGET'])  # Remove ID & target
y_train = train['TARGET']

# Splitting into train/validation (80-20 split) BEFORE fitting any
# preprocessing step. Fitting the imputer/scaler on all rows first would let
# the validation rows influence the medians and scaling statistics and make
# the validation AUC optimistic.
X_train_raw, X_val_raw, y_train_split, y_val_split = train_test_split(
    train[numerical_features], y_train, test_size=0.2, random_state=42
)

# Imputing missing values with the median, then scaling; both transformers
# are fitted on the training fold only.
imputer = SimpleImputer(strategy='median')
scaler = StandardScaler()
X_train_split = scaler.fit_transform(imputer.fit_transform(X_train_raw))

# Validation and test data only reuse the statistics learned above
X_val_split = scaler.transform(imputer.transform(X_val_raw))
X_test = scaler.transform(imputer.transform(test[numerical_features]))

# Full preprocessed training matrix, kept under its original name
X_train = scaler.transform(imputer.transform(train[numerical_features]))

# Training Logistic Regression
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train_split, y_train_split)

# Prediction on validation set
y_val_pred = model.predict_proba(X_val_split)[:, 1]  # Probabilities for class 1 (default)

# Evaluating using AUC-ROC
auc_score = roc_auc_score(y_val_split, y_val_pred)
print(f"Validation AUC: {auc_score:.4f}")

# Score the test set; column 1 of predict_proba is the class-1 (default)
# probability.
test_preds = model.predict_proba(X_test)[:, 1]

# Build the two-column submission (application ID + predicted probability)
# and write it without the index column.
submission = test[['SK_ID_CURR']].assign(TARGET=test_preds)
submission.to_csv('submission_baseline.csv', index=False)




import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Read the application tables again so this cell can run on its own
train = pd.read_csv(r"C:\Users\Admin\Downloads\application_train.csv.zip")
test = pd.read_csv(r"C:\Users\Admin\Downloads\application_test.csv.zip")

# Feature set: every int64/float64 column except the ID and the label
numerical_cols = train.select_dtypes(include=['int64', 'float64']).columns
numerical_features = numerical_cols[
    ~numerical_cols.isin(['SK_ID_CURR', 'TARGET'])
].tolist()
y_train = train['TARGET']

# Median imputation followed by standardisation. Both transformers are
# fitted on the training table and then applied unchanged to the test table.
imputer = SimpleImputer(strategy='median')
scaler = StandardScaler()
X_train = scaler.fit_transform(imputer.fit_transform(train[numerical_features]))
X_test = scaler.transform(imputer.transform(test[numerical_features]))

# Final logistic regression, fitted on every training row (no hold-out here)
model = LogisticRegression(max_iter=1000, random_state=42).fit(X_train, y_train)

# Probability of class 1 for each test application
test_preds = model.predict_proba(X_test)[:, 1]

# Submission table: application ID plus predicted probability
submission = pd.DataFrame({'SK_ID_CURR': test['SK_ID_CURR']})
submission['TARGET'] = test_preds

# Written without the index so the file has exactly two columns
submission.to_csv('submission_logreg_baseline.csv', index=False)



# Import all required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from scipy.sparse import hstack, csr_matrix

# Read the application tables used by all experiments below
train = pd.read_csv(r"C:\Users\Admin\Downloads\application_train.csv.zip")
test = pd.read_csv(r"C:\Users\Admin\Downloads\application_test.csv.zip")

# Label and column groups shared by the experiments:
# numeric predictors (ID and label removed) and object-typed categoricals.
y = train['TARGET']
cat_features = train.select_dtypes(include=['object']).columns
num_features = (
    train.select_dtypes(include=['int64','float64'])
    .columns
    .drop(['SK_ID_CURR','TARGET'])
)

# Transformer instances that the experiments re-fit as needed
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=True)
scaler = StandardScaler()
imputer = SimpleImputer(strategy='median')

# Experiment 1 - baseline: numeric features + logistic regression.
# The hold-out split is made on the raw features first, and the imputer and
# scaler are fitted on the training fold only; fitting them on all rows
# before splitting leaks validation statistics into the preprocessing.
X = train[num_features]
X_train_raw, X_val_raw, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
X_train = scaler.fit_transform(imputer.fit_transform(X_train_raw))
X_val = scaler.transform(imputer.transform(X_val_raw))

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
val_pred = model.predict_proba(X_val)[:,1]  # probability of class 1
print(f"Baseline AUC: {roc_auc_score(y_val, val_pred):.4f}")

# Experiment 2 - ratio features added to both tables (in place).
ratio_features = [
    'INCOME_CREDIT_RATIO',
    'ANNUITY_INCOME_RATIO',
    'CREDIT_TO_ANNUITY_RATIO',
    'EMPLOYED_TO_BIRTH_RATIO',
]
for df in [train, test]:
    df['INCOME_CREDIT_RATIO'] = df['AMT_INCOME_TOTAL'] / df['AMT_CREDIT']
    df['ANNUITY_INCOME_RATIO'] = df['AMT_ANNUITY'] / df['AMT_INCOME_TOTAL']
    df['CREDIT_TO_ANNUITY_RATIO'] = df['AMT_CREDIT'] / df['AMT_ANNUITY']
    df['EMPLOYED_TO_BIRTH_RATIO'] = df['DAYS_EMPLOYED'] / df['DAYS_BIRTH']
    # A zero denominator produces +/-inf, which the imputer rejects as
    # invalid input; mark those cells as missing so they get imputed instead.
    df[ratio_features] = df[ratio_features].replace([np.inf, -np.inf], np.nan)

features = num_features.tolist() + ratio_features

# Split first, then fit the imputer/scaler on the training fold only so the
# validation rows do not contribute to the preprocessing statistics.
X = train[features]
X_train_raw, X_val_raw, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
X_train = scaler.fit_transform(imputer.fit_transform(X_train_raw))
X_val = scaler.transform(imputer.transform(X_val_raw))

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
val_pred = model.predict_proba(X_val)[:,1]  # probability of class 1
print(f"Ratio Features AUC: {roc_auc_score(y_val, val_pred):.4f}")

# Experiment 3 - numeric features plus one-hot encoded categoricals (LightGBM).
# Split the raw numeric and categorical columns together (same rows in each
# fold), then fit imputer, scaler and encoder on the training fold only so
# that no validation information reaches the preprocessing.
num_train, num_val, cat_train, cat_val, y_train, y_val = train_test_split(
    train[num_features], train[cat_features], y, test_size=0.2, random_state=42
)

# Preprocessing numerical and categorical separately
X_num = scaler.fit_transform(imputer.fit_transform(num_train))
X_cat = encoder.fit_transform(cat_train)
X_num_val = scaler.transform(imputer.transform(num_val))
# Categories unseen in the training fold encode as all-zero rows
# (handle_unknown='ignore' on the encoder).
X_cat_val = encoder.transform(cat_val)

# Combine numerical and categorical into one sparse CSR matrix per fold
X_train = hstack([csr_matrix(X_num), X_cat], format='csr')
X_val = hstack([csr_matrix(X_num_val), X_cat_val], format='csr')

model = LGBMClassifier()
model.fit(X_train, y_train)
val_pred = model.predict_proba(X_val)[:,1]  # probability of class 1
print(f"Categorical Encoding AUC: {roc_auc_score(y_val, val_pred):.4f}")

# Experiment 4 - merging bureau data: one row of aggregates per SK_ID_CURR
bureau = pd.read_csv(r"C:\Users\Admin\Downloads\bureau.csv.zip")
bureau_agg = bureau.groupby('SK_ID_CURR').agg({
    'DAYS_CREDIT': ['min','max','mean'],
    'AMT_CREDIT_SUM': ['sum']
})
# Flatten the (column, statistic) MultiIndex into names like BURO_DAYS_CREDIT_MIN
bureau_agg.columns = ['BURO_'+'_'.join(col).upper() for col in bureau_agg.columns]

# Left merge keeps every application; applications without bureau rows get
# NaN aggregates, which the imputer fills below.
train = train.merge(bureau_agg, on='SK_ID_CURR', how='left')
test = test.merge(bureau_agg, on='SK_ID_CURR', how='left')

#new features
features += bureau_agg.columns.tolist()

# Split first, then fit the imputer/scaler on the training fold only so the
# validation rows do not contribute to the preprocessing statistics.
X = train[features]
X_train_raw, X_val_raw, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
X_train = scaler.fit_transform(imputer.fit_transform(X_train_raw))
X_val = scaler.transform(imputer.transform(X_val_raw))

model = LGBMClassifier()
model.fit(X_train, y_train)
val_pred = model.predict_proba(X_val)[:,1]  # probability of class 1
print(f"External Data AUC: {roc_auc_score(y_val, val_pred):.4f}")

# Experiment 5 - hand-picked feature subset with XGBoost, then submission
final_features = [
    'EXT_SOURCE_1','EXT_SOURCE_2','EXT_SOURCE_3',
    'AMT_INCOME_TOTAL','AMT_CREDIT','AMT_ANNUITY',
    'CREDIT_TO_ANNUITY_RATIO','EMPLOYED_TO_BIRTH_RATIO',
    'BURO_DAYS_CREDIT_MEAN','BURO_AMT_CREDIT_SUM_SUM'
]

# Split first, then fit the imputer/scaler on the training fold only. The
# validation AUC is then free of preprocessing leakage, and the test set is
# transformed with statistics from the same fold the model was trained on.
X = train[final_features]
X_train_raw, X_val_raw, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
X_train = scaler.fit_transform(imputer.fit_transform(X_train_raw))
X_val = scaler.transform(imputer.transform(X_val_raw))

model = XGBClassifier()
model.fit(X_train, y_train)
val_pred = model.predict_proba(X_val)[:,1]  # probability of class 1
print(f"XGBoost AUC: {roc_auc_score(y_val, val_pred):.4f}")

#submission: apply the already-fitted transformers to the test features
test_X = imputer.transform(test[final_features])
test_X = scaler.transform(test_X)
test_preds = model.predict_proba(test_X)[:,1]

submission = pd.DataFrame({
    'SK_ID_CURR': test['SK_ID_CURR'],
    'TARGET': test_preds
})
submission.to_csv('best_submission.csv', index=False)


