In [None]:
import warnings

# Silence DeprecationWarning globally. The filter is installed before the
# library imports below, so it also hides deprecation notices raised while
# importing them and by library calls made in later cells.
warnings.simplefilter('ignore', category=DeprecationWarning)

# Load in the necessary libraries.
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# Remove pandas' row cap so long outputs (such as the per-column missing-value
# counts) are shown in full instead of being truncated.
pd.options.display.max_rows = None

In [None]:
# Load the training CSV into `df`, the frame every later cell works on.
train_csv_path = 'Resources/train.csv'
df = pd.read_csv(train_csv_path)
# valid_df = pd.read_csv('Resources/valid.csv')

In [None]:
# Record the row count of the freshly loaded frame; the column-pruning cell
# further down uses it as the basis for its 50% threshold.
row_count = len(df)

# Keep the shape as the cell's last expression so Jupyter actually displays it
# (an expression in the middle of a cell produces no output).
df.shape

In [None]:
# Seed NumPy's global random generator so repeated runs draw the same numbers.
SEED = 42
np.random.seed(SEED)

In [None]:
# Preview the first five rows (five is the default for `head`).
df.head()

In [None]:
# Preview the last five rows (five is the default for `tail`).
df.tail()

In [None]:
# Count the missing entries in each column and display the per-column totals.
missing_values_count = df.isna().sum()
missing_values_count

In [None]:
# How many cells does the frame have in total, and how many are missing?
# `df.size` is rows * columns. It replaces `np.product(df.shape)`: that alias
# was deprecated in NumPy 1.25 and removed in NumPy 2.0, where the call raises
# AttributeError.
total_cells = df.size
total_missing = missing_values_count.sum()

# Percent of data that is missing.
percent_missing = (total_missing / total_cells) * 100
percent_missing

In [None]:
# Drop the columns where more than 50% of elements are missing.
# `thresh` is the minimum number of non-missing values a column needs in order
# to be kept, and pandas documents it as an int. Half the row count is rounded
# up, which keeps exactly the same columns as the fractional `row_count/2`
# cut-off because non-missing counts are whole numbers.
min_non_missing = (row_count + 1) // 2
df.dropna(axis='columns', inplace=True, thresh=min_non_missing)
df.shape

In [None]:
# Recount the missing entries per column on the current state of `df`.
missing_values_count = df.isna().sum()
missing_values_count

In [None]:
# List the columns that still hold at least one missing value.
has_missing = df.isna().any()
df.columns[has_missing.to_numpy()]

In [None]:
# Handle the remaining gaps by dropping every row that contains at least one
# missing value. The imputation alternatives below are left disabled.
# df.fillna(value={'emp_title': 'unemployed'}, inplace=True)
# df.fillna(value={'emp_length': '< 1 year'}, inplace=True)
# df.fillna(value={'title': 'Not provided'}, inplace=True)
df.dropna(axis=0, how='any', inplace=True)  # no `subset`, e.g. ['tot_coll_amt']: all columns are checked

# Disabled inspection snippets kept for reference:
# df.loc[:, ['tot_coll_amt', 'tot_cur_bal', 'total_rev_hi_lim', 'mo_sin_old_rev_tl_op', 'loan_status']].head(1000)
# df[df['revol_util'].isnull()][['revol_util', 'loan_status']]

In [None]:
# Lift the column cap so the summary is not truncated, then describe every
# column, numeric and non-numeric alike.
pd.options.display.max_columns = None
df.describe(include='all')

In [None]:
# Preprocess a copy so the cleaned frame `df` keeps its raw values.
df_scaled = df.copy()

# Integer-encode each text (object-dtype) column. One fitted encoder is kept
# per column in `label_encoders`, keyed by column name.
label_encoders = {}
for column in df.select_dtypes(include=['object']).columns:
    label_encoders[column] = LabelEncoder()
    df_scaled[column] = label_encoders[column].fit_transform(df[column])

# Min-max scale the columns that are numeric in `df`; the encoded text columns
# above are left as integer codes. 'number' selects every numeric dtype, not
# only the default int/float widths.
# NOTE(review): the scaler is fit on the whole frame, before the train/test
# split made in the modelling cell, and it also covers `loan_status` if that
# column is numeric -- confirm both are intended.
scaler = MinMaxScaler()
numerical_columns = df.select_dtypes(include='number').columns
df_scaled[numerical_columns] = scaler.fit_transform(df[numerical_columns])

# Preview the result as the cell's last expression so it renders as a table
# rather than as printed text.
df_scaled.head()

In [None]:
# Peek at the un-encoded `term` values in the cleaned frame.
df.loc[:, 'term'].head(5)

In [None]:
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import PolynomialFeatures

# Feature selection: keep the 10 columns that score highest under `f_classif`
# against the `loan_status` label.
candidate_features = df_scaled.drop(columns='loan_status')
selector = SelectKBest(score_func=f_classif, k=10)
X_selected = selector.fit_transform(candidate_features, df_scaled['loan_status'])

# Feature engineering: expand the selected columns with degree-2 polynomial terms.
poly_features = PolynomialFeatures(degree=2)
X_poly = poly_features.fit_transform(X_selected)

# Display the selected-feature matrix.
X_selected

# X_poly, paired with df_scaled['loan_status'], is now available for model training.

In [None]:
# Boolean mask over the candidate columns: True where SelectKBest kept the feature.
selected_indices = selector.get_support()

# Map the mask back onto the column names (all columns except the label).
candidate_columns = df_scaled.columns.drop('loan_status')
selected_features = candidate_columns[selected_indices]

# Report the names of the selected features.
print("Selected Features:", selected_features, sep="\n")

In [None]:
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

# The label is `loan_status`; the features are every other column of df_scaled.
# NOTE(review): this trains on all columns, not on the X_selected / X_poly
# matrices built in the feature-selection cell -- confirm that is intended.
y = df_scaled['loan_status']
X = df_scaled.drop(columns='loan_status')

# Hold out 10% of the rows for evaluation, with a fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42
)

# Fit an XGBoost classifier with default hyperparameters on the training rows.
xgb_classifier = xgb.XGBClassifier()
xgb_classifier.fit(X_train, y_train)

# Score the held-out rows.
y_pred = xgb_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Per-class precision, recall and F1 on the held-out rows.
# NOTE(review): the recorded run reports ~0.999 accuracy, which is high enough
# to suggest a feature may be leaking the label -- TODO confirm.
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Possible next steps: tune hyperparameters with grid or random search and
# evaluate with cross-validation.

Accuracy: 0.9993938914802224
Classification Report:
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     11483
         1.0       1.00      1.00      1.00     26464

    accuracy                           1.00     37947
   macro avg       1.00      1.00      1.00     37947
weighted avg       1.00      1.00      1.00     37947

