# Baseline Logistic Regression Model
This notebook builds a baseline Logistic Regression model on the synthetic dataset for credit scoring of thin-file consumers.

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Read the synthetic credit-score CSV into a DataFrame.
# The path is relative, so it is resolved against the current working directory.
dataset_path = '../data/synthetic-dataset/500Credit_Score_Dataset.csv'
df = pd.read_csv(dataset_path)

# Preview the first rows as the cell output
df.head()


In [None]:
# Drop Profile_ID if present (presumably a row identifier -- confirm against the dataset)
if 'Profile_ID' in df.columns:
    df = df.drop(columns=['Profile_ID'])

# Encode every text column as integer codes.
# Text columns are detected through both the object dtype and pandas' dedicated
# string dtypes, so they are still encoded when pandas does not store them as
# plain ``object``.
# One fitted encoder is kept per column in `label_encoders`, so the integer
# codes can be mapped back to the original labels later with
# `label_encoders[col].inverse_transform(...)`.
label_encoders = {}
le = LabelEncoder()  # remains unfitted when there is no text column to encode
for col in df.columns:
    if pd.api.types.is_object_dtype(df[col]) or pd.api.types.is_string_dtype(df[col]):
        # A fresh encoder per column: re-fitting a shared one would discard
        # the mapping learned for the previous column.
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
        label_encoders[col] = le

# Preview the encoded frame as the cell output
df.head()


In [None]:
# Pick the label: the 'Credit_Score' column when it exists, otherwise the last
# column of the frame. All remaining columns become the features.
if 'Credit_Score' in df.columns:
    y = df['Credit_Score']
    X = df.drop(columns=['Credit_Score'])
else:
    y = df.iloc[:, -1]
    X = df.iloc[:, :-1]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Train the baseline classifier on the training split.
# max_iter=1000 raises the solver's iteration cap (scikit-learn's default is 100).
baseline_settings = {'max_iter': 1000}
model = LogisticRegression(**baseline_settings)
model.fit(X=X_train, y=y_train)


In [None]:
# Predict and evaluate on the held-out test split
y_pred = model.predict(X_test)
class_probs = model.predict_proba(X_test)  # one column per entry of model.classes_

if class_probs.shape[1] == 2:
    # Binary target: score with the probability of the second class in model.classes_
    y_prob = class_probs[:, 1]
    auc_kwargs = {}
else:
    # Multiclass target: roc_auc_score needs the full probability matrix and an
    # explicit averaging scheme; `labels` ties the columns to their classes.
    y_prob = class_probs
    auc_kwargs = {'multi_class': 'ovr', 'labels': model.classes_}

print("Accuracy:", accuracy_score(y_test, y_pred))
print("AUC-ROC:", roc_auc_score(y_test, y_prob, **auc_kwargs))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Confusion matrix drawn as an annotated heatmap of integer counts
sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()
