In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [6]:
#Import data
path = "/Users/lilyhuang/Desktop/loan_level_500k.csv"
df = pd.read_csv(path)

In [7]:
# Replace missing values with the median
numeric_features = df.select_dtypes(include=['float64', 'int64']).columns
imputer = SimpleImputer(strategy='median')
df[numeric_features] = imputer.fit_transform(df[numeric_features])

In [8]:
#Feature Selection
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [9]:
# Perform Backward Elimination
import statsmodels.api as sm

X = df[numeric_features]
y = df['DELINQUENT']

# Add a constant column to the feature matrix
X = sm.add_constant(X)

# Create a model using OLS (Ordinary Least Squares)
model = sm.OLS(y, X)

# Fit the model
results = model.fit()

# Perform backward elimination until there are exactly 6 features left
while len(X.columns) > 7:  # Including the constant column
    # Get the p-values for each feature
    p_values = results.pvalues[1:]  # Exclude the constant column

    # Identify the feature with the highest p-value
    feature_to_remove = p_values.idxmax()

    if p_values[feature_to_remove] > 0.05 or len(X.columns) > 7:
        # Remove the feature if its p-value is greater than the significance level (e.g., 0.05)
        X = X.drop(feature_to_remove, axis=1)
        # Update the model
        model = sm.OLS(y, X)
        results = model.fit()
    else:
        break

# Print the remaining features after backward elimination
selected_features = X.columns[1:]  # Exclude the constant column
print("Selected Features:")
print(selected_features)

Selected Features:
Index(['CREDIT_SCORE', 'MORTGAGE_INSURANCE_PERCENTAGE', 'ORIGINAL_UPB',
       'ORIGINAL_LOAN_TO_VALUE', 'ORIGINAL_INTEREST_RATE',
       'NUMBER_OF_BORROWERS'],
      dtype='object')


In [10]:
#Forward Feature Selection
X = df[numeric_features]
y = df['DELINQUENT']

# Initialize an empty list to store selected features
selected_features = []

# Add a constant column (intercept) for the regression
X_with_constant = sm.add_constant(X)

# Start forward feature selection process
while len(selected_features) < 6:  # Stop when 6 features are selected
    remaining_features = [col for col in X.columns if col not in selected_features]
    
    # Create an empty dictionary to store p-values for the current iteration
    p_values = {}

    # Loop through remaining features and evaluate them
    for feature in remaining_features:
        # Fit model with the selected features plus the current feature
        X_selected = X_with_constant[selected_features + [feature]]
        model = sm.OLS(y, X_selected).fit()
        
        # Record the p-value for the current feature
        p_values[feature] = model.pvalues[feature]
    
    # Find the feature with the lowest p-value
    best_feature = min(p_values, key=p_values.get)
    
    # Add this feature to the selected features list
    selected_features.append(best_feature)

# Final model with selected features
X_selected = X_with_constant[selected_features]
final_model = sm.OLS(y, X_selected).fit()

# Print the selected features
print("Selected Features:")
print(selected_features)

Selected Features:
['CREDIT_SCORE', 'FIRST_PAYMENT_DATE', 'MORTGAGE_INSURANCE_PERCENTAGE', 'ORIGINAL_UPB', 'NUMBER_OF_BORROWERS', 'ORIGINAL_INTEREST_RATE']


In [13]:
#LASSO
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split

# Load the dataset
df = pd.read_csv("/Users/lilyhuang/Desktop/loan_level_500k.csv").head(1000)

# Select only numeric features
numeric_features = df.select_dtypes(include=['float64', 'int64']).columns
X = df[numeric_features]
y = df['DELINQUENT']

# Replace missing values with the median
X = X.fillna(X.median())

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize numeric features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create and fit the LASSO model
lasso = Lasso(alpha=0.01, random_state=42)
lasso.fit(X_train_scaled, y_train)

# Calculate feature importance
feature_importance = pd.Series(abs(lasso.coef_), index=X.columns)
feature_importance = feature_importance.sort_values(ascending=False)

# Select top 6 features
top_6_features = feature_importance.head(6)

print("Top 6 Features Selected by LASSO:")
for feature, importance in top_6_features.items():
    print(f"{feature}: {importance:.6f}")

# Create a new dataframe with only the selected features
X_selected = X[top_6_features.index]

print("\nShape of the new dataset with selected features:")
print(X_selected.shape)

print("\nSample of the new dataset:")
print(X_selected.head())

Top 6 Features Selected by LASSO:
CREDIT_SCORE: 0.037052
NUMBER_OF_BORROWERS: 0.018592
MORTGAGE_INSURANCE_PERCENTAGE: 0.011994
ORIGINAL_INTEREST_RATE: 0.005072
ORIGINAL_LOAN_TO_VALUE: 0.004976
ORIGINAL_UPB: 0.004420

Shape of the new dataset with selected features:
(1000, 6)

Sample of the new dataset:
   CREDIT_SCORE  NUMBER_OF_BORROWERS  MORTGAGE_INSURANCE_PERCENTAGE  \
0         669.0                  2.0                            0.0   
1         732.0                  1.0                            0.0   
2         679.0                  1.0                           30.0   
3         721.0                  2.0                            0.0   
4         618.0                  2.0                           25.0   

   ORIGINAL_INTEREST_RATE  ORIGINAL_LOAN_TO_VALUE  ORIGINAL_UPB  
0                   7.120                    80.0        162000  
1                   6.500                    25.0         53000  
2                   6.750                    91.0        133000  
3    