In [24]:
# Import necessary libraries
# Standard library
import os
import sys

# Third-party
import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# The project modules live in ../src, so that directory has to be on sys.path
# before any of them is imported.
sys.path.append('../src')  # Ensure src directory is added to the system path

# Import custom preprocessing functions
from data_preprocessing import load_data, handle_missing_values
from feature_engineering import encode_categorical_variables, add_total_income_feature
from model_training import train_model
from predictions import make_predictions, save_predictions

# Load the preprocessed data
train_data = pd.read_csv('../data/train_cleaned.csv')
test_data = pd.read_csv('../data/test_cleaned.csv')

# Remember where the training rows end. The combined frame is split back apart
# further down, and `train_data` is rebound there, so the boundary is captured
# once here instead of being re-derived from a name that changes meaning.
n_train = len(train_data)

# Combine train and test data so both go through identical feature engineering
# and categorical encoding.
combined_data = pd.concat([train_data, test_data], axis=0, ignore_index=True)

# Feature Engineering: Add Total Income feature
combined_data = add_total_income_feature(combined_data)

# Encode categorical variables. The helper returns the transformed frame and
# the fitted encoders; the encoders are persisted at the end of this cell.
combined_data, label_encoders = encode_categorical_variables(combined_data)

# Split the combined data back into train and test sets.
# Explicit .copy() makes both frames independent of `combined_data`, so later
# column assignments cannot trigger SettingWithCopyWarning or write through to
# the combined frame.
train_data = combined_data.iloc[:n_train].copy()
test_data = combined_data.iloc[n_train:].copy()

# Separate into features (X) and target variable (y)
X = train_data.drop(columns=['Loan_ID', 'Loan_Status'])  # Exclude the identifier and the target
y = train_data['Loan_Status']

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature Scaling (only scale numeric columns).
# NOTE: this list is taken from X *before* the 'Dependents' conversion below,
# so 'Dependents' is scaled only if it is already int64/float64 at this point.
numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns
scaler = StandardScaler()


def _dependents_as_int(frame):
    """Return a copy of ``frame`` with 'Dependents' cast to int ('3+' becomes 3).

    ``DataFrame.assign`` builds a new frame, so nothing is written into a
    slice of another DataFrame and column order is preserved.
    """
    return frame.assign(Dependents=frame['Dependents'].replace('3+', 3).astype(int))


# Normalise 'Dependents' on every frame that is fed to the scaler / model.
X_train = _dependents_as_int(X_train)
X_val = _dependents_as_int(X_val)
test_data = _dependents_as_int(test_data)

# Scale numeric columns: fit on the training split only, then reuse the fitted
# scaler for validation and test so no statistics leak from those sets.
X_train_scaled = X_train.copy()
X_val_scaled = X_val.copy()
test_data_scaled = test_data.copy()

X_train_scaled[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
X_val_scaled[numeric_cols] = scaler.transform(X_val[numeric_cols])
test_data_scaled[numeric_cols] = scaler.transform(test_data[numeric_cols])

# Model Building: Train the Random Forest Classifier
model = RandomForestClassifier(random_state=42)
model.fit(X_train_scaled, y_train)

# Evaluate the model on the held-out validation split
y_pred = model.predict(X_val_scaled)
accuracy = accuracy_score(y_val, y_pred)
confusion = confusion_matrix(y_val, y_pred)
classification_rep = classification_report(y_val, y_pred)

# Print evaluation metrics
print(f'Accuracy: {accuracy}')
print(f'Confusion Matrix:\n{confusion}')
print(f'Classification Report:\n{classification_rep}')

# Save the trained model. Create the output directory first so a fresh
# checkout without ../model does not fail inside joblib.dump.
os.makedirs('../model', exist_ok=True)
joblib.dump(model, '../model/random_forest_model.pkl')

# Make predictions on the test set. 'Loan_ID' and 'Loan_Status' are dropped so
# the columns passed to the model match the ones it was trained on.
test_predictions = make_predictions(model, test_data_scaled.drop(columns=['Loan_ID', 'Loan_Status']))

# Prepare the submission file
save_predictions(test_predictions, test_data, filename='../data/loan_predictions.csv')

# Persist the encoders returned by encode_categorical_variables alongside the
# model so the same encoding can be re-applied later.
joblib.dump(label_encoders, '../model/label_encoders.pkl')


Accuracy: 0.7723577235772358
Confusion Matrix:
[[18 25]
 [ 3 77]]
Classification Report:
              precision    recall  f1-score   support

           N       0.86      0.42      0.56        43
           Y       0.75      0.96      0.85        80

    accuracy                           0.77       123
   macro avg       0.81      0.69      0.70       123
weighted avg       0.79      0.77      0.75       123



['../model/label_encoders.pkl']

In [15]:
import joblib
import os
from sklearn.ensemble import RandomForestClassifier

# `model` is the RandomForestClassifier already fitted in the training cell.
# It is saved as-is: re-creating and re-fitting it here with the same data and
# random_state would only repeat that work and produce an equivalent model.
model_path = '../model/random_forest_model.pkl'

# joblib.dump does not create missing parent directories, so make sure the
# target folder exists before writing.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(model, model_path)

# Verify that the file actually landed on disk.
if not os.path.exists(model_path):
    raise FileNotFoundError(f"Model not saved at {model_path}")
print(f"Model saved successfully at {model_path}")


Model saved successfully at ../model/random_forest_model.pkl


In [17]:
import joblib
import os

# Destination paths for the two artifacts produced by the training cell.
model_path = '../model/random_forest_model.pkl'
encoders_path = '../model/label_encoders.pkl'

# (label used in messages, object to persist, destination path)
artifacts = [
    ("Model", model, model_path),
    ("Label encoders", label_encoders, encoders_path),
]

# Write every artifact first. joblib.dump does not create missing parent
# directories, so the folder is ensured before each write.
for artifact_label, artifact, artifact_path in artifacts:
    os.makedirs(os.path.dirname(artifact_path), exist_ok=True)
    joblib.dump(artifact, artifact_path)

# Then verify each file exists, failing loudly on the first missing one.
for artifact_label, artifact, artifact_path in artifacts:
    if not os.path.exists(artifact_path):
        raise FileNotFoundError(f"{artifact_label} not saved at {artifact_path}")
    print(f"{artifact_label} saved successfully at {artifact_path}")


Model saved successfully at ../model/random_forest_model.pkl
Label encoders saved successfully at ../model/label_encoders.pkl


In [21]:
import os

# Locations of the persisted model and encoder files.
model_path = '../model/random_forest_model.pkl'
encoders_path = '../model/label_encoders.pkl'

# Report the on-disk size of each artifact as a quick sanity check that the
# files are non-empty. os.path.getsize raises OSError if a file is missing.
model_size_bytes = os.path.getsize(model_path)
print(f"Model file size: {model_size_bytes} bytes")

encoders_size_bytes = os.path.getsize(encoders_path)
print(f"Encoders file size: {encoders_size_bytes} bytes")


Model file size: 1556201 bytes
Encoders file size: 1737 bytes
