# Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
##for modelling
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.svm import SVC,SVR

from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm

In [None]:
from google.colab import drive
drive.mount("/content/gdrive")

In [None]:
import os 
PATH = "/content/gdrive/MyDrive/Final Hack"
os.chdir(PATH)

In [None]:
d1 = pd.read_csv('Topuploan_train_test(Final).csv')

In [None]:
d2 = pd.read_csv('Asset_CustomerDemo.csv')

# **Reading Data**

In [None]:
print(d1.columns)
print(d2.columns)


In [None]:
d1.shape

In [None]:
d2.shape

In [None]:
# Merge the two data frames
merged_df1 = pd.merge(d1, d2, on='AssetID', how='outer')

In [None]:
merged_df1

# Data Visualization

In [None]:
 # Scatter plot of DisbursalAmount against AmountFinance
# One point per loan: x is the financed amount, y the disbursed amount,
# matching the title and the axis labels set below.
plt.scatter(merged_df1['AmountFinance'], merged_df1['DisbursalAmount'])

# Adding Title to the Plot
plt.title("Scatter Plot")

# Setting the X and Y labels
plt.xlabel('AmountFinance')
plt.ylabel('DisbursalAmount')

plt.show()

In [None]:

# Count the non-null LoanStatus entries per Area and flatten the result into
# a two-column frame ("Area", "Count of Loans").
area_loan_counts = (
    merged_df1.groupby("Area")["LoanStatus"]
    .count()
    .reset_index(name="Count of Loans")
)

# One bar per area, bar height = number of loans in that area.
plt.bar(x=area_loan_counts["Area"], height=area_loan_counts["Count of Loans"])
plt.title("Loan Distribution by Area")
plt.xlabel("Area")
plt.ylabel("Number of Loans")
plt.show()


In [None]:

# Number of loans for every (LoanStatus, PaymentMode) combination, in long form.
grouped = (
    merged_df1.groupby(["LoanStatus", "PaymentMode"])
    .size()
    .reset_index(name="Count of Loans")
)

# Wide form: one row per loan status, one column per payment mode;
# combinations that never occur are filled with 0.
pivot_table = grouped.pivot(
    index="LoanStatus",
    columns="PaymentMode",
    values="Count of Loans",
).fillna(0)

# Stacked bars: each bar is a loan status, each segment a payment mode.
pivot_table.plot(kind="bar", stacked=True)
plt.title("Loan Status and Payment Mode Distribution")
plt.xlabel("Loan Status")
plt.ylabel("Count of Loans")
plt.show()


In [None]:

# Number of loans per loan status, as a two-column frame.
grouped = (
    merged_df1.groupby("LoanStatus")
    .size()
    .reset_index(name="Count of Loans")
)

# Pie chart: one wedge per status, labelled with its percentage share.
plt.pie(
    x=grouped["Count of Loans"],
    labels=grouped["LoanStatus"],
    autopct="%1.1f%%",
)
plt.title("Loan Status Distribution")
plt.show()


In [None]:

# Scatter plot of DisbursalAmount (y) against LTV (x), one point per row.
plt.scatter(x=merged_df1["LTV"], y=merged_df1["DisbursalAmount"])
plt.title("Relationship between LTV and Disbursal Amount")
plt.xlabel("LTV")
plt.ylabel("Disbursal Amount")
plt.show()

In [None]:
# Columns whose pairwise correlations are shown in the heatmap.
numerical_cols = ['AmountFinance', 'DisbursalAmount', 'EMI', 'LTV', 'Tenure']

# Pairwise correlation matrix of the selected columns.
corr_matrix = merged_df1.loc[:, numerical_cols].corr()

# Annotated heatmap: each cell shows its correlation coefficient.
sns.heatmap(data=corr_matrix, annot=True)
plt.title("Correlation between Numerical Variables")
plt.show()

In [None]:

# to create a histogram
plt.hist(merged_df1["DisbursalAmount"], bins=10)
plt.xlabel("Disbursal Amount")
plt.ylabel("Count")
plt.title("Distribution of Disbursal Amount")
plt.show()


In [None]:
# to create a joint plot
sns.jointplot(data=merged_df1, x="AssetCost", y="DisbursalAmount", kind="scatter")

In [None]:
# Distribution of EMI: 20-bin histogram with a KDE curve overlaid.
# histplot is the supported replacement for the deprecated distplot;
# stat="density" keeps the density-normalised bars distplot drew with a KDE.
sns.histplot(data=merged_df1, x="EMI", kde=True, bins=20, stat="density")


In [None]:

# to create a box plot
sns.boxplot(data=merged_df1, x="PaymentMode", y="AmountFinance")

# Exploratory Data Analysis

In [None]:
merged_df1.drop(['DisbursalDate','MaturityDAte','AuthDate'], axis=1, inplace=True)

In [None]:
df2 = merged_df1.rename(columns={'Top-up ?': 'topup'})

In [None]:
df2

In [None]:
df2.isnull().sum()

In [None]:
df = df2.dropna()

In [None]:
df.isnull().sum()

In [None]:
df

In [None]:
df.nunique()

In [None]:
df.drop(['Customer ID','AssetID'], axis=1, inplace=True)

In [None]:
df

In [None]:
df.describe()

In [None]:
# Quartiles and inter-quartile range of EMI.
Q1, Q3 = (df["EMI"].quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1

# Fences placed 1.5 * IQR below the first and above the third quartile.
lower = Q1 - 1.5 * IQR
upper = Q3 + 1.5 * IQR

In [None]:
print(lower)
print(upper)

In [None]:
a = df[(df.EMI>lower)&(df.EMI<upper)]

In [None]:
a.shape

In [None]:
a.describe()

In [None]:
a.dtypes

# Numeric and Categorical Attributes

In [None]:
num_cols = ['BranchID','Tenure','AmountFinance','DisbursalAmount','EMI','LTV','AssetCost','ManufacturerID','SupplierID']
cat_cols = ['Frequency','InstlmentMode','LoanStatus','PaymentMode']

In [None]:
# Convert the categorical attributes to pandas' "category" dtype.
# astype returns a new frame; `a` is a boolean-filtered selection of `df`,
# and assigning columns into such a selection in place can raise pandas'
# SettingWithCopyWarning.
a = a.astype({col: "category" for col in cat_cols})

In [None]:
a.dtypes

In [None]:
a

In [None]:
a.topup.value_counts()

In [None]:
a.topup.value_counts(normalize = True)*100

# Data PreProcessing

In [None]:
X = a.drop(["topup"], axis=1)

In [None]:
y = a[["topup"]]

In [None]:
a.shape

In [None]:
print(X.shape, y.shape)

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.33, random_state=123)

In [None]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
y_train.value_counts()

In [None]:
y_train.value_counts(normalize = True)*100

In [None]:
y_test.value_counts()

In [None]:
y_test.value_counts(normalize = True)*100

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Integer-encode the training target next to the original labels.
# assign() builds a new frame instead of writing a column into the split
# result in place, which can raise pandas' SettingWithCopyWarning.
y_train = y_train.assign(topup_enc=LabelEncoder().fit_transform(y_train['topup']))
y_train[['topup', 'topup_enc']]

In [None]:
# Keep only the encoded target, as a 1-D Series rather than a one-column
# frame: scikit-learn estimators expect a 1-D y and warn
# (DataConversionWarning) when they are handed a column vector.
y_train = y_train['topup_enc']
y_train

In [None]:
# Encode the test target with the mapping learned from the TRAINING labels.
# Fitting a second encoder on the test labels alone would shift the integer
# codes whenever the test split lacks one of the classes, silently
# mislabelling the evaluation. `y` still holds the raw labels, so the
# training labels are recovered through the training index.
# transform() raises ValueError if the test split contains an unseen label.
topup_encoder = LabelEncoder().fit(y.loc[y_train.index, 'topup'])
y_test = y_test.assign(topup_enc=topup_encoder.transform(y_test['topup']))
y_test[['topup', 'topup_enc']]

In [None]:
y_test = y_test.drop(columns = ['topup'])
y_test

# One Hot Encoding

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Categorical feature columns, taken from the training frame's dtypes.
# The encoder cells below read `cat_cols`; the original misspelling
# (`cal_cols`) meant this selection was computed but never used.
cat_cols = X_train.select_dtypes(include=['category']).columns
cal_cols = cat_cols  # misspelled legacy alias, kept for any cell that still references it

In [None]:
enc = OneHotEncoder(drop = 'first')
enc.fit(X_train[cat_cols])

In [None]:
X_train_ohe=enc.transform(X_train[cat_cols]).toarray()
X_test_ohe=enc.transform(X_test[cat_cols]).toarray()

In [None]:
print(X_train_ohe.shape)
print(X_test_ohe.shape)

# Standardization of Data

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
num_cols = X_train.select_dtypes(include=['float64', 'int64']).columns

In [None]:
scaler = StandardScaler()
scaler.fit(X_train[num_cols])

In [None]:
X_train_std = scaler.transform(X_train[num_cols])
X_test_std = scaler.transform(X_test[num_cols])

In [None]:
print(X_train_std.shape)
print(X_test_std.shape)

# **Concatenate**

In [None]:
X_train_con = np.concatenate([X_train_std, X_train_ohe], axis=1)
X_test_con = np.concatenate([X_test_std, X_test_ohe], axis=1)

In [None]:
print(X_train_con.shape)
print(X_test_con.shape)

## **Model Building**

In [None]:
def evaluate_model(act, pred):
    """Print a classification summary for one set of predictions.

    Prints the confusion matrix, the per-class classification report, and
    the accuracy together with the weighted-average recall, precision and
    F1 score.

    Parameters
    ----------
    act : array-like
        Ground-truth labels.
    pred : array-like
        Predicted labels, in the same order as ``act``.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    from sklearn.metrics import (
        accuracy_score,
        classification_report,
        confusion_matrix,
        f1_score,
        precision_score,
        recall_score,
    )

    print("Confusion Matrix \n", confusion_matrix(act, pred))
    print(classification_report(act, pred))
    print("Accuracy : ", accuracy_score(act, pred))
    # average='weighted' so multi-class targets are summarised by one number.
    print("Recall   : ", recall_score(act, pred, average='weighted'))
    print("Precision: ", precision_score(act, pred, average='weighted'))
    print("F1_score : ", f1_score(act, pred, average='weighted'))

Upsampling

In [None]:
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=123)
X_train_sm, y_train_sm = smote.fit_resample(X_train_con, y_train)

In [None]:
# Class counts before and after SMOTE resampling. Both are printed
# explicitly because a notebook cell only displays the value of its last
# expression, so the first result would otherwise be discarded.
print(np.unique(y_train, return_counts=True))
print(np.unique(y_train_sm, return_counts=True))

## **Logistic Regression**

In [None]:
from sklearn.linear_model import LogisticRegression
m1 = LogisticRegression()
m1.fit(X_train_con, y_train)

train_pred_lr = m1.predict(X_train_con)
test_pred_lr = m1.predict(X_test_con)

In [None]:
print("--Train--")
evaluate_model(y_train, train_pred_lr)
print("--Test--")
evaluate_model(y_test, test_pred_lr)

In [None]:
m2 = LogisticRegression(solver='saga',penalty='l2',C=1, max_iter=1000)
m2.fit(X_train_sm, y_train_sm)

train_pred_lr_hp = m2.predict(X_train_con)
test_pred_lr_hp = m2.predict(X_test_con)

In [None]:
print("--Train--")
evaluate_model(y_train, train_pred_lr_hp)
print("--Test--")
evaluate_model(y_test, test_pred_lr_hp)

In [None]:
m3 = LogisticRegression()
m3.fit(X_train_sm, y_train_sm)

train_pred_lr_sm = m3.predict(X_train_sm)
test_pred_lr_sm = m3.predict(X_test_con)

In [None]:
print("--Train--")
evaluate_model(y_train_sm, train_pred_lr_sm)
print("--Test--")
evaluate_model(y_test, test_pred_lr_sm)

## **KNN**

In [None]:
from sklearn.neighbors import KNeighborsClassifier
m4 = KNeighborsClassifier(n_neighbors=3)
m4.fit(X_train_con, y_train)

train_pred_knn = m4.predict(X_train_con)
test_pred_knn = m4.predict(X_test_con)

In [None]:
print("--Train--")
evaluate_model(y_train, train_pred_knn)
print("--Test--")
evaluate_model(y_test, test_pred_knn)

In [None]:
m5 = KNeighborsClassifier()
m5.fit(X_train_sm, y_train_sm)

train_pred_knn_sm = m5.predict(X_train_sm)
test_pred_knn_sm = m5.predict(X_test_con)

In [None]:
print("--Train--")
evaluate_model(y_train_sm, train_pred_knn_sm)
print("--Test--")
evaluate_model(y_test, test_pred_knn_sm)

In [None]:
m6 = KNeighborsClassifier(n_neighbors=30)
m6.fit(X_train_con, y_train)

train_pred_knn_hp = m6.predict(X_train_con)
test_pred_knn_hp = m6.predict(X_test_con)

In [None]:
print("--Train--")
evaluate_model(y_train, train_pred_knn_hp)
print("--Test--")
evaluate_model(y_test, test_pred_knn_hp)

## **RandomForest**

In [None]:
from sklearn.ensemble import RandomForestClassifier
m7 = RandomForestClassifier()
m7.fit(X_train_con, y_train)

train_pred_rf = m7.predict(X_train_con)
test_pred_rf = m7.predict(X_test_con)

In [None]:
print("--Train--")
evaluate_model(y_train, train_pred_rf)
print("--Test--")
evaluate_model(y_test, test_pred_rf)

In [None]:
m8 = RandomForestClassifier()
m8.fit(X_train_sm, y_train_sm)

train_pred_rf_sm = m8.predict(X_train_sm)
test_pred_rf_sm = m8.predict(X_test_con)

In [None]:
print("--Train--")
evaluate_model(y_train_sm, train_pred_rf_sm)
print("--Test--")
evaluate_model(y_test, test_pred_rf_sm)

## **Decision Tree**

In [None]:
from sklearn.tree import DecisionTreeClassifier
m10 = DecisionTreeClassifier()
m10 = m10.fit(X_train_con,y_train)

train_pred_dt = m10.predict(X_train_con)
test_pred_dt = m10.predict(X_test_con)

In [None]:
print("--Train--")
evaluate_model(y_train, train_pred_dt)
print("--Test--")
evaluate_model(y_test, test_pred_dt)

In [None]:
m11 = DecisionTreeClassifier()
m11.fit(X_train_sm,y_train_sm)

train_pred_dt_sm = m11.predict(X_train_sm)
test_pred_dt_sm = m11.predict(X_test_con)

In [None]:
print("--Train--")
evaluate_model(y_train_sm, train_pred_dt_sm)
print("--Test--")
evaluate_model(y_test, test_pred_dt_sm)

In [None]:
param_grid = {"criterion": ["gini","entropy"],
              "max_depth" : [15,20],
              "max_features" : [5,7,9,10],
              "min_samples_leaf" : [3,5,15,20],
             "min_samples_split":[2,4,6,8],
             "max_leaf_nodes":[4,8,10,15]}

In [None]:
m12 = DecisionTreeClassifier()
from sklearn.model_selection import GridSearchCV
m12 = GridSearchCV(m12,param_grid,cv=5)
m12.fit(X_train_con,y_train)

In [None]:
m12.best_params_

In [None]:
m12 = DecisionTreeClassifier(criterion="gini",max_depth=20,max_features=9,
                            max_leaf_nodes=15,min_samples_leaf=15,min_samples_split =6)
m12.fit(X_train_sm,y_train_sm)

train_pred_dt_hp = m12.predict(X_train_sm)
test_pred_dt_hp = m12.predict(X_test_con)

In [None]:
print("--Train--")
evaluate_model(y_train_sm, train_pred_dt_hp)
print("--Test--")
evaluate_model(y_test, test_pred_dt_hp)

# SVM

In [None]:
from sklearn import svm
m13 = svm.SVC(kernel='linear')
m13 = m13.fit(X_train_con,y_train)

train_pred_svm = m13.predict(X_train_con)
test_pred_svm = m13.predict(X_test_con)

In [None]:
print("--Train--")
evaluate_model(y_train, train_pred_svm)
print("--Test--")
evaluate_model(y_test, test_pred_svm)

In [None]:
m15 = svm.SVC(kernel='rbf', C=1)
m15.fit(X_train_sm, y_train_sm)

train_pred_svm2 = m15.predict(X_train_sm)
test_pred_svm2 = m15.predict(X_test_con)

In [None]:
print("--Train--")
evaluate_model(y_train_sm, train_pred_svm2)
print("--Test--")
evaluate_model(y_test, test_pred_svm2)

Comparison

In [None]:
# Column layout of the comparison table: the model name followed by the four
# metrics for the train split and then the same four for the test split.
performance_columns = ['Model name'] + [
    f'{split} {metric}'
    for split in ('Train', 'Test')
    for metric in ('accuracy', 'precision', 'recall', 'F1_score')
]

# Empty table; one row per model is added later.
performance_comparison = pd.DataFrame(columns=performance_columns)

In [None]:
from numpy.lib.function_base import average
def add_to_perform_compare_df(df, model_name, train_actual, train_predict, test_actual, test_predict):
    """Return ``df`` with one extra row of train/test metrics for a model.

    The row holds the model name followed by accuracy, weighted precision,
    weighted recall and weighted F1 for the train split, then the same four
    metrics for the test split — the column order ``df`` is expected to have.
    ``df`` itself is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Comparison table with nine columns (name + 4 train + 4 test metrics).
    model_name : str
        Label stored in the first column.
    train_actual, train_predict : array-like
        True and predicted labels for the training split.
    test_actual, test_predict : array-like
        True and predicted labels for the test split.

    Returns
    -------
    pandas.DataFrame
        A new frame containing the rows of ``df`` plus the new row.
    """
    from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

    def _scores(actual, predicted):
        """Accuracy, weighted precision, weighted recall, weighted F1."""
        return [
            accuracy_score(actual, predicted),
            precision_score(actual, predicted, average='weighted'),
            recall_score(actual, predicted, average='weighted'),
            f1_score(actual, predicted, average='weighted'),
        ]

    row = pd.DataFrame(
        [[model_name,
          *_scores(train_actual, train_predict),
          *_scores(test_actual, test_predict)]],
        columns=df.columns,
    )

    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported
    # replacement. An empty table is answered with the row alone so the
    # result keeps the row's dtypes and no empty frame is passed to concat.
    if df.empty:
        return row
    return pd.concat([df, row], ignore_index=True)

In [None]:
performance_comparison = add_to_perform_compare_df(performance_comparison, 'Logistic Regression',
                                                   y_train, train_pred_lr, y_test, test_pred_lr)

In [None]:
performance_comparison = add_to_perform_compare_df(performance_comparison, 'Logistic Regression_SM',
                                                   y_train_sm, train_pred_lr_sm, y_test, test_pred_lr_sm)

In [None]:
performance_comparison = add_to_perform_compare_df(performance_comparison, 'Logistic Regression_HP',
                                                   y_train, train_pred_lr_hp, y_test, test_pred_lr_hp)

In [None]:
performance_comparison = add_to_perform_compare_df(performance_comparison, 'KNN',
                                                   y_train, train_pred_knn, y_test, test_pred_knn)

In [None]:
performance_comparison = add_to_perform_compare_df(performance_comparison, 'KNN_SM',
                                                   y_train_sm, train_pred_knn_sm, y_test, test_pred_knn_sm)

In [None]:
performance_comparison = add_to_perform_compare_df(performance_comparison, 'KNN_HP',
                                                   y_train, train_pred_knn_hp, y_test, test_pred_knn_hp)

In [None]:
performance_comparison = add_to_perform_compare_df(performance_comparison, 'Random Forest',
                                                   y_train, train_pred_rf, y_test, test_pred_rf)

In [None]:
performance_comparison = add_to_perform_compare_df(performance_comparison, 'Random Forest_SM',
                                                   y_train_sm, train_pred_rf_sm, y_test, test_pred_rf_sm)

In [None]:
performance_comparison = add_to_perform_compare_df(performance_comparison, 'Decision Tree',
                                                   y_train, train_pred_dt, y_test, test_pred_dt)

In [None]:
performance_comparison = add_to_perform_compare_df(performance_comparison, 'Decision Tree_SM',
                                                   y_train_sm, train_pred_dt_sm, y_test, test_pred_dt_sm)

In [None]:
performance_comparison = add_to_perform_compare_df(performance_comparison, 'Decision Tree_HP',
                                                   y_train_sm, train_pred_dt_hp, y_test, test_pred_dt_hp)

In [None]:
performance_comparison = add_to_perform_compare_df(performance_comparison, 'SVM',
                                                   y_train, train_pred_svm, y_test, test_pred_svm)

In [None]:
performance_comparison = add_to_perform_compare_df(performance_comparison, 'SVM tune2',
                                                   y_train_sm, train_pred_svm2, y_test, test_pred_svm2)

In [None]:
performance_comparison

In [None]:
from sklearn.metrics import recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm

In [None]:
# (short name, estimator) pairs, every estimator with default settings.
models = [
    ("LoR", LogisticRegression()),
    ("KNN", KNeighborsClassifier()),
    ("RF", RandomForestClassifier()),
    ("DT", DecisionTreeClassifier()),
    ("SVM", svm.SVC()),
]

In [None]:
# accuracy_score is otherwise only imported inside helper functions, so it
# is brought into scope here.
from sklearn.metrics import accuracy_score

results = []
names = []

# Test accuracy of every model in `models`, fitted on the original
# (non-resampled) training data.
for name, model in models:
    # np.ravel hands fit() a 1-D target whether y_train is a Series or a
    # one-column frame.
    model.fit(X_train_con, np.ravel(y_train))
    y_pred = model.predict(X_test_con)
    # accuracy_score has no `average` parameter; the predictions are already
    # class labels, so no rounding is needed.
    accuracy = accuracy_score(y_test, y_pred)
    names.append(name)
    results.append(accuracy)
    print('Accuracy: %.2f%%' % (accuracy * 100), name)

In [None]:
results = []
names = []

# Weighted test recall of every model in `models`, fitted on the
# SMOTE-resampled training data.
for name, model in models:
    # np.ravel hands fit() a 1-D target whether y_train_sm is a Series or a
    # one-column frame.
    model.fit(X_train_sm, np.ravel(y_train_sm))
    y_pred = model.predict(X_test_con)
    # The predictions are already class labels, so no rounding is needed.
    recall = recall_score(y_test, y_pred, average='weighted')
    names.append(name)
    results.append(recall)
    print('Recall: %.2f%%' % (recall * 100), name)