In [38]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import joblib
from sklearn.preprocessing import StandardScaler, PowerTransformer, PolynomialFeatures, FunctionTransformer
from sklearn.decomposition import PCA
from sklearn.decomposition import IncrementalPCA
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_absolute_error, accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.compose import ColumnTransformer
from scipy.stats import skew
from sklearn.base import BaseEstimator, TransformerMixin

In [21]:
# Load the training set. low_memory=False makes pandas infer each column's dtype
# from the whole file instead of chunk by chunk, which avoids the mixed-type
# warning this read emitted (assumed to be a DtypeWarning — TODO confirm).
df = pd.read_csv('train_v2.csv/train_v2.csv', low_memory=False)

  df = pd.read_csv('train_v2.csv/train_v2.csv')


In [22]:
# Print the (rows, columns) shape, then render the first rows as the cell output.
print(df.shape)
df.head()

(105471, 771)


Unnamed: 0,id,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f770,f771,f772,f773,f774,f775,f776,f777,f778,loss
0,1,126,10,0.686842,1100,3,13699,7201.0,4949.0,126.75,...,5,2.14,-1.54,1.18,0.1833,0.7873,1,0,5,0
1,2,121,10,0.782776,1100,3,84645,240.0,1625.0,123.52,...,6,0.54,-0.24,0.13,0.1926,-0.6787,1,0,5,0
2,3,126,10,0.50008,1100,3,83607,1800.0,1527.0,127.76,...,13,2.89,-1.73,1.04,0.2521,0.7258,1,0,5,0
3,4,134,10,0.439874,1100,3,82642,7542.0,1730.0,132.94,...,4,1.29,-0.89,0.66,0.2498,0.7119,1,0,5,0
4,5,109,9,0.502749,2900,4,79124,89.0,491.0,122.72,...,26,6.11,-3.82,2.51,0.2282,-0.5399,0,0,5,0


In [24]:
# The 'loss' column is the regression target; every other column is a feature.
y = df['loss']
X = df.drop(columns=['loss'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [25]:
# Parse numeric strings stored in object-dtype columns
def convert_numeric_objects(X):
    """Return a copy of ``X`` whose object-dtype columns are parsed as floats.

    Values that cannot be parsed as numbers become NaN (``errors='coerce'``).
    Columns of any other dtype are left as they are, and the input frame is
    not modified.
    """
    converted = X.copy()
    text_columns = converted.select_dtypes(include=['object']).columns
    as_numbers = converted[text_columns].apply(pd.to_numeric, errors='coerce')
    converted[text_columns] = as_numbers.astype(float)
    return converted

In [26]:
# 🔥 Custom Transformer for Outlier Removal (IQR Method)
class OutlierRemover(BaseEstimator, TransformerMixin):
    """Replace IQR outliers with NaN so a later imputation step can fill them.

    ``fit`` learns, for each column, the band
    ``[Q1 - factor * IQR, Q3 + factor * IQR]``; ``transform`` blanks every value
    that falls outside its column's band. Rows are never dropped, so the output
    has the same shape as the input.

    Note: when a column's first and third quartiles are equal the band collapses
    to a single value, so every value different from it is turned into NaN.

    Parameters
    ----------
    factor : float, default=1.5
        Multiple of the inter-quartile range allowed beyond the quartiles.

    Attributes
    ----------
    iqr_bounds : tuple of (pandas.Series, pandas.Series)
        Lower and upper limits indexed by column label; set by ``fit``.
    """

    def __init__(self, factor=1.5):
        self.factor = factor

    @staticmethod
    def _as_frame(X):
        """Return ``X`` itself if it is a DataFrame, otherwise wrap it in one."""
        return X if isinstance(X, pd.DataFrame) else pd.DataFrame(X)

    def fit(self, X, y=None):
        """Learn the per-column outlier limits from ``X``; ``y`` is ignored.

        Returns
        -------
        self
        """
        X = self._as_frame(X)
        Q1 = X.quantile(0.25)
        Q3 = X.quantile(0.75)
        spread = self.factor * (Q3 - Q1)
        self.iqr_bounds = (Q1 - spread, Q3 + spread)
        return self

    def transform(self, X):
        """Return a float copy of ``X`` with out-of-band values set to NaN.

        Raises
        ------
        KeyError
            If ``X`` has a column that was not present during ``fit``.
        """
        X = self._as_frame(X)
        # .loc raises KeyError for labels unseen during fit and puts the limits
        # in the same order as X's columns.
        lower = self.iqr_bounds[0].loc[X.columns]
        upper = self.iqr_bounds[1].loc[X.columns]
        # Comparisons involving NaN are False, so missing values and columns
        # whose limits are NaN pass through unchanged.
        outside = X.lt(lower, axis='columns') | X.gt(upper, axis='columns')
        # Cast to float first so every column can hold NaN.
        return X.astype(float).mask(outside)  # Outliers become NaN for imputation

In [27]:
# Preprocessing chain; the steps run in the order listed.
num_transformer = Pipeline(steps=[
    # 1. Parse numeric strings held in object-dtype columns
    ('convert_numeric_objects', FunctionTransformer(func=convert_numeric_objects, validate=False)),
    # 2. Turn IQR outliers into NaN
    ('outlier_remover', OutlierRemover()),
    # 3. Fill missing values with each column's median
    ('imputer', SimpleImputer(strategy='median')),
    # 4. Standardise the columns
    ('scaler', StandardScaler()),
    # 5. Power transform (Yeo-Johnson)
    ('power', PowerTransformer()),
])

In [28]:
# Route every column of X_train through num_transformer. Because the column list
# covers the whole training frame, remainder='passthrough' has nothing left to
# forward for that frame.
preprocessor = ColumnTransformer(
    transformers=[('num', num_transformer, X_train.columns)],
    remainder='passthrough',
)

In [29]:
# Preprocess the features, then project them onto 650 principal components.
pipeline = Pipeline(steps=[
    ('preprocessing', preprocessor),
    ('pca', PCA(n_components=650)),  # component count is a tunable choice
])

In [30]:
# Learn preprocessing + PCA on the training split only, then apply the fitted
# steps to the test split without refitting.
X_train_transformed = pipeline.fit_transform(X_train)
X_test_transformed = pipeline.transform(X_test)

# Wrap the arrays in DataFrames, carrying over the original row labels so the
# features stay index-aligned with y_train / y_test (a bare pd.DataFrame(array)
# would renumber the rows from 0).
X_train_transformed = pd.DataFrame(X_train_transformed, index=X_train.index)
X_test_transformed = pd.DataFrame(X_test_transformed, index=X_test.index)

In [11]:
# Ordinary least-squares baseline fitted on the transformed training features
model = LinearRegression()
model.fit(X_train_transformed, y_train)

# Hold-out evaluation: mean absolute error (MAE) on the test split
y_pred = model.predict(X_test_transformed)
mae = mean_absolute_error(y_test, y_pred)
print("Mean absolute Error (Loss):", mae)

# 5-fold cross-validation on the training split; the scorer returns negated MAE,
# so the sign is flipped before printing
cv_scores = cross_val_score(
    model,
    X_train_transformed,
    y_train,
    cv=5,
    scoring='neg_mean_absolute_error',
)
print("Average MAE:", -cv_scores.mean())

Mean absolute Error (Loss): 1.4646469589413658
Average MAE: 1.4710260413730405


In [12]:
# Ridge regression (alpha=1.0) fitted on the transformed training features
model = Ridge(alpha=1.0, random_state=42)
model.fit(X_train_transformed, y_train)

# Hold-out evaluation: mean absolute error (MAE) on the test split
y_pred = model.predict(X_test_transformed)
mae = mean_absolute_error(y_test, y_pred)
print("Mean absolute Error (Loss):", mae)

# 5-fold cross-validation on the training split; the scorer returns negated MAE,
# so the sign is flipped before printing
cv_scores = cross_val_score(
    model,
    X_train_transformed,
    y_train,
    cv=5,
    scoring='neg_mean_absolute_error',
)
print("Average MAE:", -cv_scores.mean())

Mean absolute Error (Loss): 1.4645556264370474
Average MAE: 1.4708903391060104


In [13]:
# Single regression tree with depth capped at 10
model = DecisionTreeRegressor(max_depth=10, random_state=42)
model.fit(X_train_transformed, y_train)

# Hold-out evaluation: mean absolute error (MAE) on the test split
y_pred = model.predict(X_test_transformed)
mae = mean_absolute_error(y_test, y_pred)
print("Mean absolute Error (Loss):", mae)

# 5-fold cross-validation on the training split; the scorer returns negated MAE,
# so the sign is flipped before printing
cv_scores = cross_val_score(
    model,
    X_train_transformed,
    y_train,
    cv=5,
    scoring='neg_mean_absolute_error',
)
print("Average MAE:", -cv_scores.mean())

Mean absolute Error (Loss): 1.2047485964636773
Average MAE: 1.2302356174211073


In [14]:
# Random forest: 100 trees, each capped at depth 10
model = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)
model.fit(X_train_transformed, y_train)

# Hold-out evaluation: mean absolute error (MAE) on the test split
y_pred = model.predict(X_test_transformed)
mae = mean_absolute_error(y_test, y_pred)
print("Mean absolute Error (Loss):", mae)

# 5-fold cross-validation on the training split; the scorer returns negated MAE,
# so the sign is flipped before printing
cv_scores = cross_val_score(
    model,
    X_train_transformed,
    y_train,
    cv=5,
    scoring='neg_mean_absolute_error',
)
print("Average MAE:", -cv_scores.mean())

Mean absolute Error (Loss): 1.1931848056517738
Average MAE: 1.1977188390840536


In [15]:
# Gradient boosting: 150 stages at learning rate 0.1
model = GradientBoostingRegressor(n_estimators=150, learning_rate=0.1, random_state=42)
model.fit(X_train_transformed, y_train)

# Hold-out evaluation: mean absolute error (MAE) on the test split
y_pred = model.predict(X_test_transformed)
mae = mean_absolute_error(y_test, y_pred)
print("Mean absolute Error (Loss):", mae)

# 5-fold cross-validation on the training split; the scorer returns negated MAE,
# so the sign is flipped before printing
cv_scores = cross_val_score(
    model,
    X_train_transformed,
    y_train,
    cv=5,
    scoring='neg_mean_absolute_error',
)
print("Average MAE:", -cv_scores.mean())

Mean absolute Error (Loss): 1.1639956285275368
Average MAE: 1.187083307538877


In [16]:
# XGBoost regressor with n_estimators=150
model = XGBRegressor(n_estimators=150, random_state=42)
model.fit(X_train_transformed, y_train)

# Hold-out evaluation: mean absolute error (MAE) on the test split
y_pred = model.predict(X_test_transformed)
mae = mean_absolute_error(y_test, y_pred)
print("Mean absolute Error (Loss):", mae)

# 5-fold cross-validation on the training split; the scorer returns negated MAE,
# so the sign is flipped before printing
cv_scores = cross_val_score(
    model,
    X_train_transformed,
    y_train,
    cv=5,
    scoring='neg_mean_absolute_error',
)
print("Average MAE:", -cv_scores.mean())

Mean absolute Error (Loss): 1.364119291305542
Average MAE: 1.3853420257568358


In [31]:
# LightGBM regressor with n_estimators=100
model = LGBMRegressor(n_estimators=100, random_state=42)
model.fit(X_train_transformed, y_train)

# Hold-out evaluation: mean absolute error (MAE) on the test split
y_pred = model.predict(X_test_transformed)
mae = mean_absolute_error(y_test, y_pred)
print("Mean absolute Error (Loss):", mae)

# 5-fold cross-validation on the training split; the scorer returns negated MAE,
# so the sign is flipped before printing
cv_scores = cross_val_score(
    model,
    X_train_transformed,
    y_train,
    cv=5,
    scoring='neg_mean_absolute_error',
)
print("Average MAE:", -cv_scores.mean())

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.679643 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 165750
[LightGBM] [Info] Number of data points in the train set: 84376, number of used features: 650
[LightGBM] [Info] Start training from score 0.798983
Mean absolute Error (Loss): 1.1318150631204968
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.655593 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 165750
[LightGBM] [Info] Number of data points in the train set: 67500, number of used features: 650
[LightGBM] [Info] Start training from score 0.801733
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.643785 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 165750
[LightGBM] [Info] Number of data points in the train set: 67501, 

In [35]:
# End-to-end pipeline: preprocessing -> PCA -> LightGBM, so un-transformed feature
# frames can be passed straight to fit / predict.
pipeline = Pipeline(steps=[
    ('preprocessing', preprocessor),
    ('pca', PCA(n_components=650)),  # component count is a tunable choice
    ('model', LGBMRegressor(n_estimators=100, random_state=42)),  # final estimator
])

In [36]:
# The 'loss' column is the regression target; every other column is a feature.
y = df['loss']
X = df.drop(columns=['loss'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [37]:
# Fit every pipeline step on the training split in a single call.
pipeline.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.281016 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 165750
[LightGBM] [Info] Number of data points in the train set: 84376, number of used features: 650
[LightGBM] [Info] Start training from score 0.798983


In [39]:
# Persist the fitted pipeline to disk with joblib.
# NOTE(review): the pipeline references convert_numeric_objects and OutlierRemover,
# which are defined in this notebook; pickles store such objects by name, so they
# presumably must be defined/importable wherever this file is loaded — confirm
# before using the artefact outside this notebook.
joblib.dump(pipeline, "loan_default_predictor.pkl")

['loan_default_predictor.pkl']

In [58]:
# Score the fitted pipeline on the hold-out split
y_pred = pipeline.predict(X_test)

# Report the mean absolute error (MAE) between the true and predicted loss
print("Mean absolute Error (Loss):", mean_absolute_error(y_test, y_pred))

Mean absolute Error (Loss): 1.1318150631204968




In [60]:
# Reload the fitted pipeline from disk.
# NOTE: joblib.load unpickles the file, and unpickling can execute arbitrary code —
# only load artefacts you produced yourself.
pipeline = joblib.load("loan_default_predictor.pkl")

# Load the data to score. low_memory=False makes pandas infer each column's dtype
# from the whole file, which avoids the mixed-type warning this read emitted
# (assumed to be a DtypeWarning — TODO confirm).
test = pd.read_csv("test_v2.csv/test_v2.csv", low_memory=False)  # Replace with your test file

# Predict the loss for every row
predictions = pipeline.predict(test)

# Keep only the row identifier alongside the predicted loss
output = test[['id']].copy()
output['loss'] = predictions

# Save to CSV without the DataFrame index
output.to_csv("predictions.csv", index=False)

print("Predictions saved in predictions.csv with only id & loss!")

  test = pd.read_csv("test_v2.csv/test_v2.csv")  # Replace with your test file


Predictions saved in predictions.csv with only PassengerId & Survived!
