In [1]:
# First, let's import all the libraries we will need
import numpy as np
import pandas as pd
from sklearn.metrics import precision_score,recall_score, f1_score, classification_report, confusion_matrix, accuracy_score
from sklearn.impute import SimpleImputer

# importing graphing libraries
import matplotlib.pyplot as plt
import seaborn as sns

# importing other things we will need
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

#importing necessary models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

import warnings
warnings.filterwarnings('ignore')
import re


In [2]:
# Load the datasets. All three CSV files are read from the same folder, so the
# folder is named once here; edit DATA_DIR to run the notebook elsewhere.
DATA_DIR = r'C:\Users\HP\Documents\PROJECTS'

train = pd.read_csv(f'{DATA_DIR}/ACSC Train.csv')
test = pd.read_csv(f'{DATA_DIR}/ACSC Test.csv')
economic_data = pd.read_csv(f'{DATA_DIR}/economic_indicators.csv')

In [3]:
# Preview the first rows of the train frame
train.head()

Unnamed: 0,ID,customer_id,country_id,tbl_loan_id,lender_id,loan_type,Total_Amount,Total_Amount_to_Repay,disbursement_date,due_date,duration,New_versus_Repeat,Amount_Funded_By_Lender,Lender_portion_Funded,Lender_portion_to_be_repaid,target
0,ID_266671248032267278,266671,Kenya,248032,267278,Type_1,8448.0,8448.0,2022-08-30,2022-09-06,7,Repeat Loan,120.85,0.014305,121.0,0
1,ID_248919228515267278,248919,Kenya,228515,267278,Type_1,25895.0,25979.0,2022-07-30,2022-08-06,7,Repeat Loan,7768.5,0.3,7794.0,0
2,ID_308486370501251804,308486,Kenya,370501,251804,Type_7,6900.0,7142.0,2024-09-06,2024-09-13,7,Repeat Loan,1380.0,0.2,1428.0,0
3,ID_266004285009267278,266004,Kenya,285009,267278,Type_1,8958.0,9233.0,2022-10-20,2022-10-27,7,Repeat Loan,2687.4,0.3,2770.0,0
4,ID_253803305312267278,253803,Kenya,305312,267278,Type_1,4564.0,4728.0,2022-11-28,2022-12-05,7,Repeat Loan,1369.2,0.3,1418.0,0


In [4]:
# (rows, columns) of the train frame
train.shape

(68654, 16)

In [5]:
# Distinct loan types, in order of first appearance in train
train['loan_type'].unique()

array(['Type_1', 'Type_7', 'Type_5', 'Type_4', 'Type_6', 'Type_14',
       'Type_9', 'Type_10', 'Type_19', 'Type_16', 'Type_2', 'Type_11',
       'Type_23', 'Type_18', 'Type_17', 'Type_12', 'Type_20', 'Type_13',
       'Type_21', 'Type_15', 'Type_24', 'Type_22'], dtype=object)

In [6]:
# Loan types in order of first appearance. Derived from the data instead of a
# pasted copy of the cell output above, so it cannot drift out of sync with
# `train` when the dataset changes.
b = train['loan_type'].unique().tolist()

In [7]:
# Number of loans per loan type as an (n_types, 1) column.
# value_counts() sorts by frequency, whereas this column is later placed next
# to values and an index that follow train['loan_type'].unique() order, so the
# counts are reindexed to that same order to keep every row on its own type.
a = (
    train['loan_type']
    .value_counts()
    .reindex(train['loan_type'].unique())
    .to_numpy()
    .reshape(-1, 1)
)

In [8]:
# Sum of `target` within each loan type, then the rows picked out in the
# order given by `b` and converted to a plain 2-D array.
defaulters_by_loan_type = train.groupby('loan_type')[['target']].sum()
c = np.array(defaulters_by_loan_type.loc[b].values)

In [10]:
# Summary table: `a` and `c` are placed side by side as the 'Loans' and
# 'Defaulters' columns, and rows are labelled with the loan types in
# train['loan_type'].unique() order.
defaultion_by_loan_type = pd.DataFrame(
    np.hstack((a, c)),
    index=train['loan_type'].unique(),
    columns=['Loans', 'Defaulters'],
)

defaultion_by_loan_type


Unnamed: 0,Loans,Defaulters
Type_1,61723,520
Type_7,2790,235
Type_5,1521,122
Type_4,1235,152
Type_6,466,38
Type_14,357,71
Type_9,205,36
Type_10,99,12
Type_19,74,0
Type_16,42,0


In [None]:
# Give the country column the same name the loan data uses, so both frames
# share a `country_id` key.
economic_data = economic_data.rename(columns={'Country': 'country_id'})

In [None]:
def preprocess_and_merge(dataset, economic_data, country_filter):
    """Attach 2021-2023 economic-indicator values to a loan dataset.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Loan records. Must contain ``country_id``, ``disbursement_date`` and
        the loan columns selected at the end of this function. ``target`` is
        optional, so an unlabeled dataset can be processed as well.
    economic_data : pandas.DataFrame
        Long-format indicator table with the columns ``country_id``,
        ``Indicator``, ``Year`` and ``Value``.
    country_filter : list-like
        Only indicator rows whose ``country_id`` is in this collection are used.

    Returns
    -------
    pandas.DataFrame
        The loan columns (with ``disbursement_date`` parsed to datetime,
        unparseable values becoming NaT) followed by ``Indicator``,
        ``YR2021``, ``YR2022`` and ``YR2023``. The input ``dataset`` is left
        unmodified.

    Raises
    ------
    KeyError
        If a required (non-``target``) column is missing from ``dataset``.
    """
    year_columns = ['YR2021', 'YR2022', 'YR2023']

    # Work on a copy so the caller's frame is not changed as a side effect.
    dataset = dataset.copy()
    dataset['disbursement_date'] = pd.to_datetime(dataset['disbursement_date'], errors='coerce')

    # Keep only the indicator rows for the requested countries.
    economic_filtered = economic_data[economic_data['country_id'].isin(country_filter)]

    # One row per (country, indicator), one column per year.
    economic_pivot = economic_filtered.pivot_table(
        index=['country_id', 'Indicator'],
        columns='Year',
        values='Value'
    ).reset_index()
    economic_pivot = economic_pivot.rename(columns={2021: 'YR2021', 2022: 'YR2022', 2023: 'YR2023'})

    # A year absent from the indicator data would otherwise surface as a
    # KeyError further down; represent it as an all-missing column instead.
    for year_column in year_columns:
        if year_column not in economic_pivot.columns:
            economic_pivot[year_column] = float('nan')

    # NOTE(review): the merge key is country_id only, while the pivot holds one
    # row per (country_id, Indicator). Every loan row is therefore repeated
    # once per indicator available for its country -- confirm this is intended.
    enriched_data = pd.merge(
        dataset,
        economic_pivot,
        how='left',
        on='country_id'
    )

    # Loans with no matching indicator row get the column median.
    enriched_data[year_columns] = enriched_data[year_columns].fillna(
        enriched_data[year_columns].median()
    )

    wanted_columns = [
        'ID', 'customer_id', 'country_id', 'tbl_loan_id', 'lender_id', 'loan_type',
        'Total_Amount', 'Total_Amount_to_Repay', 'disbursement_date', 'due_date',
        'duration', 'New_versus_Repeat', 'Amount_Funded_By_Lender',
        'Lender_portion_Funded', 'Lender_portion_to_be_repaid', 'target', 'Indicator',
        'YR2021', 'YR2022', 'YR2023'
    ]
    # Only `target` may be absent; any other missing column still raises KeyError.
    optional_columns = {'target'}
    selected_columns = [
        column for column in wanted_columns
        if column in enriched_data.columns or column not in optional_columns
    ]

    return enriched_data[selected_columns]


In [None]:
# Go from wide to long: every column other than country_id / Indicator
# becomes a (Year, Value) pair on its own row.
economic_long = pd.melt(
    economic_data,
    id_vars=['country_id', 'Indicator'],
    var_name='Year',
    value_name='Value',
)


In [None]:
# Pull the first run of digits out of each melted column label and store it as
# an integer year. Casting to str first keeps the cell re-runnable: on a second
# run 'Year' is already an integer column, where the .str accessor would fail.
# expand=False makes extract return a Series rather than a one-column frame.
economic_long['Year'] = (
    economic_long['Year']
    .astype(str)
    .str.extract(r'(\d+)', expand=False)
    .astype(int)
)

# Keep only the years used downstream (2021-2023)
economic_long_filtered = economic_long[economic_long['Year'].isin([2021, 2022, 2023])]

In [None]:
# Enrich the train frame with the Kenya indicator values; `train` is rebound
# to the enriched result.
train = preprocess_and_merge(
    dataset=train,
    economic_data=economic_long_filtered,
    country_filter=['Kenya'],
)

In [None]:
def missing_values(df):
    """Print the number of missing values in each column of ``df``.

    Columns are listed from most to least missing values. The function
    inspects the frame it is given (not a global), prints the resulting
    Series, and returns None.
    """
    missing_per_column = df.isna().sum().sort_values(ascending=False)
    print(missing_per_column)

In [None]:
# Run the missing-value report, passing the train frame
missing_values(train)

In [None]:
# Column dtypes and non-null counts of the train frame
train.info()

In [None]:
# Pairwise correlations between the numeric columns of train, then each
# column's correlation with `target`, strongest positive first.
# `data` was never defined in this notebook, so the train frame is used.
# Non-numeric columns are excluded explicitly because DataFrame.corr() raises
# on string columns in recent pandas versions.
corr_matrix = train.select_dtypes(include='number').corr()
corr_matrix['target'].sort_values(ascending=False)

In [None]:
%matplotlib inline
data.hist(bins=50, figsize=(20, 15))
plt.show()

In [None]:
# Remove the ID-style columns from train in place; they are not used as
# model inputs below.
train.drop(columns=['ID', 'customer_id', 'tbl_loan_id', 'lender_id'], inplace=True)

In [None]:
# Separate the predictors (everything except `target`) from the label
X = train.drop(columns='target')
y = train['target']

In [None]:
# Engineered features:
#   Loan_Age        - whole days between disbursement_date and due_date
#   Repayment_Ratio - Total_Amount_to_Repay divided by Total_Amount
#   Funding_Gap     - Total_Amount minus Amount_Funded_By_Lender
X['Loan_Age'] = pd.to_datetime(X['due_date']).sub(pd.to_datetime(X['disbursement_date'])).dt.days
X['Repayment_Ratio'] = X['Total_Amount_to_Repay'].div(X['Total_Amount'])
X['Funding_Gap'] = X['Total_Amount'].sub(X['Amount_Funded_By_Lender'])

In [None]:
# The raw date columns have served their purpose (Loan_Age); remove them in place
X.drop(columns=['disbursement_date', 'due_date'], inplace=True)

In [None]:
# Columns routed to the one-hot encoder
categorical_features = [
    'country_id',
    'loan_type',
    'New_versus_Repeat',
]
# Columns routed to the scaler
numeric_features = [
    'Total_Amount',
    'Total_Amount_to_Repay',
    'Loan_Age',
    'Repayment_Ratio',
    'Funding_Gap',
]

In [None]:
# Column-wise preprocessing: standardise the numeric features and one-hot
# encode the categorical ones (categories unseen during fit are ignored).
numeric_step = ('num', StandardScaler(), numeric_features)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
preprocessor = ColumnTransformer([numeric_step, categorical_step])

In [None]:
# End-to-end model: preprocessing followed by a random forest with a fixed seed
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(random_state=42)),
    ]
)

In [None]:
# Hold out 20% of the rows for validation. The split is stratified on y so the
# train and validation parts keep the same class proportions; the loan-type
# summary earlier in the notebook suggests defaulters are a small minority,
# and an unstratified split can leave the validation set with too few of them.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Fit the preprocessing steps and the classifier on the training split
pipeline.fit(X_train, y_train)

In [None]:
# Predict on the held-out validation split and print the classification
# report comparing the predictions with the true labels
y_pred = pipeline.predict(X_val)
print(classification_report(y_val, y_pred))