In [None]:
import warnings

# Suppress the Deprecation Warnings.
warnings.filterwarnings('ignore', category=DeprecationWarning)

# Load in the necessary libraries.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from IPython.display import display
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler

In [None]:
# Read the training, validation and test splits from the Resources folder.
df_train, df_val, df_test = map(
    pd.read_csv,
    ('Resources/train.csv', 'Resources/valid.csv', 'Resources/X_test.csv'),
)

In [None]:
# Preview the first rows of the training split, followed by its (rows, columns) shape.
display(df_train.head(), df_train.shape)

## Handling Missing Values


In [None]:
# Keep only the columns that are populated in at least half of the rows,
# i.e. discard every column where more than 50% of the values are missing.
min_populated = len(df_train) / 2
df_train.dropna(axis='columns', thresh=min_populated, inplace=True)

print(df_train.shape)

## Detecting and Removing Outliers


In [None]:
# Show the names of the int64/float64 columns of the training split.
numeric_columns = df_train.select_dtypes(include=['int64', 'float64']).columns
display(numeric_columns)

In [None]:
# How outliers are defined and handled is a business decision: what a value
# means in its underlying context matters more than the number on its own.

int_rate = df_train['int_rate']
sns.boxplot(x=int_rate)

## Feature Scaling


In [None]:
from sklearn.preprocessing import StandardScaler, LabelEncoder

def _lowercase_object_columns(frame):
    """Return a copy of ``frame`` with every object-dtype column lower-cased."""
    lowered = frame.copy()
    text_columns = lowered.select_dtypes(include=['object']).columns
    if len(text_columns):
        lowered[text_columns] = lowered[text_columns].apply(lambda col: col.str.lower())
    return lowered


def _encode_with_unknowns(encoder, values):
    """Encode ``values`` with a fitted LabelEncoder, mapping unseen labels to -1."""
    # LabelEncoder.transform raises on labels it was not fitted on, so only the
    # known labels are passed to it; everything else keeps the -1 sentinel.
    known = values.isin(encoder.classes_).to_numpy()
    codes = np.full(len(values), -1, dtype=int)
    if known.any():
        codes[known] = encoder.transform(values[known])
    return codes


def scale_features(df, reference=None):
    """Label-encode the categorical columns and min-max scale the numeric ones.

    Object-dtype columns are lower-cased and label-encoded; afterwards every
    int/float column (the freshly encoded ones included) except ``loan_status``
    is scaled with ``MinMaxScaler``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to transform. It is not modified.
    reference : pandas.DataFrame, optional
        Frame the encoders and the scaler are fitted on. When omitted they are
        fitted on ``df`` itself. Pass the training frame when transforming the
        validation/test frames so that all splits share the same category
        codes and the same scaling ranges.

    Returns
    -------
    pandas.DataFrame
        Transformed copy of ``df``.

    Notes
    -----
    * Categories that do not occur in ``reference`` are encoded as -1 before
      scaling.
    * Columns of ``df`` that have no counterpart of the same kind in
      ``reference`` fall back to being fitted on ``df`` itself.
    * ``reference`` is copied and re-encoded on every call.
    """
    df_scaled = _lowercase_object_columns(df)
    ref_frame = None if reference is None else _lowercase_object_columns(reference)

    # Encode categorical columns using LabelEncoder, fitted on the reference
    # frame whenever it has the same categorical column.
    categorical_columns = df_scaled.select_dtypes(include=['object']).columns
    if ref_frame is None:
        ref_categoricals = []
    else:
        ref_categoricals = ref_frame.select_dtypes(include=['object']).columns
    for column in categorical_columns:
        encoder = LabelEncoder()
        if column in ref_categoricals:
            # The reference column is encoded too so that the scaler below is
            # fitted on the same representation that df_scaled ends up with.
            ref_frame[column] = encoder.fit_transform(ref_frame[column])
            df_scaled[column] = _encode_with_unknowns(encoder, df_scaled[column])
        else:
            df_scaled[column] = encoder.fit_transform(df_scaled[column])

    # Scale numerical columns using MinMaxScaler, excluding the target column if it exists.
    numerical_columns = df_scaled.select_dtypes(include=['int', 'float']).columns
    if 'loan_status' in numerical_columns:
        numerical_columns = numerical_columns.drop('loan_status')

    if ref_frame is None:
        ref_numeric = []
    else:
        ref_numeric = ref_frame.select_dtypes(include=['int', 'float']).columns
    shared_columns = [column for column in numerical_columns if column in ref_numeric]
    own_columns = [column for column in numerical_columns if column not in ref_numeric]

    if shared_columns:
        scaler = MinMaxScaler().fit(ref_frame[shared_columns])
        df_scaled[shared_columns] = scaler.transform(df_scaled[shared_columns])
    if own_columns:
        df_scaled[own_columns] = MinMaxScaler().fit_transform(df_scaled[own_columns])

    return df_scaled

# Apply scale_features to each dataset. The encoders and the scaler are fitted
# on the training split only and reused for the validation and test splits.
df_scaled = scale_features(df_train)
df_scaled_val = scale_features(df_val, reference=df_train)
df_scaled_test = scale_features(df_test, reference=df_train)


In [None]:
from sklearn.feature_selection import SelectFromModel
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

# Fit a default XGBoost model on every available feature so that its
# importance scores can be used to rank the features.
train_features = df_scaled.drop(columns='loan_status')
xgb_classifier = XGBClassifier()
xgb_classifier.fit(train_features, df_scaled['loan_status'])

# Importance score of each feature, in column order.
feature_importances = xgb_classifier.feature_importances_

# Column positions ordered from the most to the least important feature.
sorted_indices = np.argsort(feature_importances)[::-1]

# Keep the top_k highest-ranked features (adjust top_k as needed).
top_k = 4
selected_feature_indices = sorted_indices[:top_k]

# Translate the chosen positions back into column names.
selected_features = train_features.columns[selected_feature_indices]

print("Selected features:", selected_features)





# Restrict the training and validation splits to the selected features.
X_selected_train, y_train = df_scaled[selected_features], df_scaled['loan_status']
X_selected_val, y_val = df_scaled_val[selected_features], df_scaled_val['loan_status']

# Refit a fresh XGBoost model (default hyperparameters) on the selected features only.
xgb_classifier = XGBClassifier()
xgb_classifier.fit(X_selected_train, y_train)

# Score the model on the validation split.
y_pred_val = xgb_classifier.predict(X_selected_val)
accuracy_val = accuracy_score(y_val, y_pred_val)
print("Validation Accuracy:", accuracy_val, end='\n\n\n\n')

# Classification report for the validation split.
print("Validation Classification Report:")
print(classification_report(y_val, y_pred_val))





# # Now, predict the target variable for df_scaled_test
# # Subset the DataFrame with selected features for testing
# X_selected_test = df_scaled_test[selected_features]

# # Predict on the test dataset
# y_pred_test = xgb_classifier.predict(X_selected_test)

# # Create a new DataFrame with selected features and predicted target variable
# df_test_with_predictions = X_selected_test.copy()
# df_test_with_predictions['loan_status'] = y_pred_test  # Add predicted target variable to DataFrame

# # Rearrange columns so that the predicted loan_status is the first column
# df_test_with_predictions = df_test_with_predictions[['loan_status'] + list(X_selected_test.columns)]

# # Save the DataFrame to a new CSV file
# df_test_with_predictions.to_csv('210465P.csv', index=False)
