In [None]:
import warnings

# Ignore DeprecationWarning messages from this point on. The filter is
# registered before the library imports below, so it is already in effect
# while those libraries are being imported.
warnings.filterwarnings('ignore', category=DeprecationWarning)

# Third-party libraries used by the notebook: plotting (matplotlib, seaborn),
# numerical arrays (numpy) and tabular data (pandas).
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# Read the training, validation and test splits from the Resources folder,
# in that order, into their respective DataFrames.
df_train, df_val, df_test = map(
    pd.read_csv,
    ('Resources/train.csv', 'Resources/valid.csv', 'Resources/X_test.csv'),
)

In [None]:
# Quick look at the training data: the first rows, a blank line, then the
# (rows, columns) shape.
print(df_train.head(), df_train.shape, sep='\n\n')

## Data Cleaning


In [None]:
# A column is kept only if at least half of its rows hold a non-missing
# value; columns that are more than 50% empty are removed from df_train
# in place.
min_non_missing = len(df_train) / 2
df_train.dropna(axis=1, thresh=min_non_missing, inplace=True)

print(df_train.shape)

## Removing Outliers


## Feature Scaling

- If a feature is not Gaussian-like (for example, it has a skewed distribution or contains outliers), Normalization - Standardization is not a good choice, as it will compress most of the data into a narrow range.

- However, we can first transform the feature so that it is Gaussian-like and then apply Normalization - Standardization.

- When performing distance or covariance calculations (in algorithms such as Clustering, PCA and LDA), it is better to use Normalization - Standardization, as it removes the effect of scale on variance and covariance.

- Min-Max scaling has the same drawbacks as Normalization - Standardization, and new data may not be bounded to [0,1], as it can fall outside the original range. Some algorithms, for example some deep learning networks, prefer input on a 0-1 scale, so Min-Max scaling is a good choice for them.


In [None]:
# plt.figure(figsize=(8, 6))

# sns.kdeplot(df_train['loan_amnt'], color='skyblue', fill=True)

# plt.xlabel('Loan Amount')
# plt.ylabel('Frequency')

# plt.show()

In [None]:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# All preprocessing is learned from the training split ONLY and then
# re-applied, unchanged, to the validation and test splits. Fitting a new
# encoder/scaler per split (as was done before) gives the same category a
# different integer code in each split and scales every split by its own
# min/max, so a model trained on df_scaled would see differently-encoded
# inputs at validation and prediction time.

# Name of the label column. It is excluded from min-max scaling so that the
# same scaler can be applied to the test split, which has no label column.
# NOTE(review): assumes a numeric loan_status is already coded 0..n-1, as
# XGBClassifier expects -- confirm against the data.
TARGET_COLUMN = 'loan_status'


def _encode_with_fitted(encoder, values):
    """Encode a column with an already-fitted LabelEncoder.

    Parameters
    ----------
    encoder : LabelEncoder
        Encoder that was fitted on the corresponding training column.
    values : pd.Series
        Column to encode.

    Returns
    -------
    np.ndarray
        Integer codes, one per row. Values the encoder did not see while
        fitting are encoded as -1 instead of raising ValueError, which is
        what LabelEncoder.transform would do.
    """
    codes = np.full(len(values), -1, dtype=int)
    seen = values.isin(encoder.classes_).to_numpy()
    if seen.any():
        codes[seen] = encoder.transform(values[seen])
    return codes


def _apply_preprocessing(frame, encoders, fitted_scaler, scaled_columns):
    """Return a copy of `frame` with the train-fitted transforms applied.

    Parameters
    ----------
    frame : pd.DataFrame
        Split to transform; it is not modified.
    encoders : dict
        Maps a column name to the LabelEncoder fitted on that training column.
        Columns that are missing from `frame` are skipped.
    fitted_scaler : MinMaxScaler
        Scaler already fitted on the training data.
    scaled_columns : pd.Index
        Columns the scaler was fitted on.

    Returns
    -------
    pd.DataFrame
        Encoded and scaled copy of `frame`. Columns of `frame` that have
        neither an encoder nor a place in `scaled_columns` are left untouched.

    Raises
    ------
    KeyError
        If `frame` lacks any of `scaled_columns`.
    """
    transformed = frame.copy()
    for column, encoder in encoders.items():
        if column in transformed.columns:
            transformed[column] = _encode_with_fitted(encoder, frame[column])
    transformed[scaled_columns] = fitted_scaler.transform(frame[scaled_columns])
    return transformed


# Fit one LabelEncoder per categorical (object) column of the training data.
label_encoders = {}
for column in df_train.select_dtypes(include=['object']).columns:
    label_encoders[column] = LabelEncoder().fit(df_train[column])

# Fit the MinMaxScaler on the numerical feature columns of the training data.
numerical_columns = (
    df_train.select_dtypes(include=['int', 'float'])
    .columns
    .drop(TARGET_COLUMN, errors='ignore')
)
scaler = MinMaxScaler().fit(df_train[numerical_columns])

# Apply the same fitted transforms to every split and display the results.
df_scaled = _apply_preprocessing(df_train, label_encoders, scaler, numerical_columns)
print(df_scaled.head())

df_scaled_val = _apply_preprocessing(df_val, label_encoders, scaler, numerical_columns)
print(df_scaled_val.head())

df_scaled_test = _apply_preprocessing(df_test, label_encoders, scaler, numerical_columns)
print(df_scaled_test.head())

In [None]:
from xgboost import XGBClassifier
from sklearn.feature_selection import SelectFromModel

# Split off the predictors once; the same frame is used for fitting and for
# translating importance positions back into column names.
ranking_inputs = df_scaled.drop(columns='loan_status')

# Fit a default XGBoost classifier on every available predictor.
xgb_classifier = XGBClassifier()
xgb_classifier.fit(ranking_inputs, df_scaled['loan_status'])

# One importance score per predictor, in column order.
feature_importances = xgb_classifier.feature_importances_

# argsort is ascending, so reverse it to get most-important-first positions.
sorted_indices = np.argsort(feature_importances)[::-1]

# Keep the k highest-ranked predictors (adjust top_k as needed).
top_k = 5
selected_feature_indices = sorted_indices[:top_k]
selected_features = ranking_inputs.columns[selected_feature_indices]

print("Selected features:", selected_features)

In [None]:
# # Get the indices of selected features
# selected_indices = selector.get_support()

# # Get the names of selected features
# selected_features = df_scaled.drop('loan_status', axis=1).columns[selected_indices]

# # Print the names of selected features
# print("Selected Features:")
# print(selected_features)

In [None]:
from sklearn.feature_selection import SelectFromModel
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

# --- Step 1: rank the predictors with a model fitted on all of them ---------
candidate_features = df_scaled.drop(columns='loan_status')

xgb_classifier = XGBClassifier()
xgb_classifier.fit(candidate_features, df_scaled['loan_status'])

# Importance scores come back in column order; argsort is ascending, so the
# reversed view lists positions from most to least important.
feature_importances = xgb_classifier.feature_importances_
sorted_indices = np.argsort(feature_importances)[::-1]

# Keep the k highest-ranked predictors (adjust top_k as needed).
top_k = 4
selected_feature_indices = sorted_indices[:top_k]
selected_features = candidate_features.columns[selected_feature_indices]

print("Selected features:", selected_features)

# --- Step 2: retrain on the selected predictors and score on validation -----
y_train = df_scaled['loan_status']
y_val = df_scaled_val['loan_status']
X_selected_train = df_scaled[selected_features]
X_selected_val = df_scaled_val[selected_features]

# Fresh classifier with default hyperparameters, fitted on the reduced inputs.
xgb_classifier = XGBClassifier()
xgb_classifier.fit(X_selected_train, y_train)

y_pred_val = xgb_classifier.predict(X_selected_val)

accuracy_val = accuracy_score(y_val, y_pred_val)
print("Validation Accuracy:", accuracy_val, end='\n\n\n\n')

print("Validation Classification Report:")
print(classification_report(y_val, y_pred_val))

# --- Step 3: predict the test split and export the result --------------------
X_selected_test = df_scaled_test[selected_features]
y_pred_test = xgb_classifier.predict(X_selected_test)

# Output layout: the predicted loan_status first, then the selected features.
df_test_with_predictions = X_selected_test.copy()
df_test_with_predictions.insert(0, 'loan_status', y_pred_test)

# Write the predictions to CSV without the row index.
df_test_with_predictions.to_csv('210465P.csv', index=False)
