In [None]:

# # Credit Card Application

# We use logistic regression to model the propensity (probability) that a customer responds to a personal loan campaign. The model's predicted probabilities are used to classify the outcomes and to identify the factors that influence the response. The objective is a model that identifies the clients who are most likely to accept the loan offer in upcoming personal loan campaigns.

# ### 1) Importing required libraries

# In[1]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import os
import joblib
import itertools
import subprocess
from time import time
from scipy import stats
import scipy.optimize as opt  
from scipy.stats import chi2_contingency
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve

: 

In [None]:

# ### 2) Importing and Descriptive Stats
# 
# To market their loan products to people who already have deposit accounts, BankABC wants to create a direct marketing channel. To cross-sell personal loans to its current clients, the bank ran a test campaign. An enticing personal loan offer and processing charge waiver were aimed at a random group of 20000 clients. The targeted clients' information has been provided, together with information on how they responded to the marketing offer.

# In[2]:


# READ DATA
# Load the campaign/application records from the Excel workbook into a DataFrame.
data = pd.read_excel("Approval.xlsx")

# Preview the first five rows. Only the last expression of a cell is rendered,
# so a bare `data.shape` placed before this line would be evaluated and discarded.
data.head()

In [None]:
# In[3]:


# Dimensions of the loaded DataFrame as a (rows, columns) tuple
data.shape

In [None]:
# In[4]:


# Sanity-check the import by inspecting the first 10 rows of the data
data.head(10)



In [None]:
# In[5]:


# Sanity-check the import by inspecting the last five rows (tail() defaults to 5)
data.tail()

In [None]:
# In[6]:


# Structural overview: info() prints each column's dtype and non-null count
data.info()


In [None]:
# In[7]:


# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
data.describe()


In [None]:
# ## 3) Handling Missing Values

# In[8]:


# Count the missing (null/NaN) values in each column
data.isnull().sum()

In [None]:
# In[9]:


# IMPUTE MISSING VALUES, one column at a time.
#
# Each fill is assigned back to its own column on purpose:
#   * calling fillna() on the whole frame with one column's mode would write
#     that single value into the gaps of *every* column, numeric ones included;
#   * `data[col].fillna(..., inplace=True)` operates on a temporary Series and
#     is not guaranteed to update the frame (deprecated chained assignment,
#     a no-op under pandas copy-on-write).
for col in data.columns:
    column = data[col]
    if column.dtype == 'object':
        # Non-numeric data: impute with the most frequent value.
        counts = column.value_counts()
        if not counts.empty:  # an all-missing column has no mode to fill with
            data[col] = column.fillna(counts.index[0])
    elif column.dtype in [np.int64, np.float64]:
        # Numeric data (integer or floating-point): impute with the column mean.
        data[col] = column.fillna(column.mean())

In [None]:
# In[10]:


# Inspect the first 10 rows of the current state of the data
data.head(10)


In [None]:
# In[11]:


# Convert every non-numeric (object-dtype) column to integer codes.
# Note: this is label encoding (one integer per distinct value), not one-hot encoding.
from sklearn.preprocessing import LabelEncoder

# A single encoder instance is re-fitted for each column in turn
le = LabelEncoder()

# Decide up front which columns hold text, then encode them one by one
text_columns = [name for name in data.columns if data[name].dtypes == 'object']
for name in text_columns:
    encoded = le.fit_transform(data[name])
    data[name] = encoded


In [None]:
# In[12]:


# Inspect the first 10 rows of the current state of the data
data.head(10)


In [None]:
# In[13]:


# PLOTTING HISTOGRAMS FOR ALL NUMERIC VARIABLES

# Select numeric columns by kind rather than by exact dtype, so integer/float
# columns of any width are included (booleans and text are excluded).
numeric_cols = data.select_dtypes(include='number').columns

# Size the grid from the number of columns instead of a fixed 6x3 layout,
# which would raise IndexError as soon as there are more than 18 columns.
n_grid_cols = 3
n_grid_rows = max(1, -(-len(numeric_cols) // n_grid_cols))  # ceiling division

fig, axes = plt.subplots(
    nrows=n_grid_rows,
    ncols=n_grid_cols,
    figsize=(18, 3 * n_grid_rows),  # 3 inches of height per row (18x18 for 6 rows)
    squeeze=False,                  # always a 2-D array, even for a single row
)
axes = axes.flatten()

for ax, val in zip(axes, numeric_cols):
    data[val].hist(ax=ax, bins=30)
    ax.set_title(val)

# Hide the panels left over when the column count does not fill the grid
for ax in axes[len(numeric_cols):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()




In [None]:
# In[14]:


# Keep an independent copy of the numerically encoded frame so the numeric
# values are retained if the 0/1 target is later relabelled with text
# ('DID NOT RESPOND' / 'RESPONDED').
# NOTE(review): data1 is not referenced again in the visible cells — confirm it is still needed.

data1 = data.copy()

In [None]:
# In[15]:


# RENAMING THE 0 AND 1 TO
# 'DID NOT RESPOND' AND 'RESPONDED' RESPECTIVELY
#data1.loc[:, 'admitted'] = data.loc[:, 'admitted'].apply(lambda x: 'admitted' if x == 1 else 'not admitted')


In [None]:
# In[16]:


# Inspect the first 20 rows of the data
data.head(20)

In [None]:
# In[17]:


# Pairwise correlation between all columns of the frame
corr = data.corr()

# Render the correlation matrix as a heatmap, labelling both axes with the column names
fig = plt.figure(figsize=(5, 4))
column_labels = corr.columns
sns.heatmap(corr, xticklabels=column_labels, yticklabels=column_labels, linewidths=.75)


In [None]:
# In[18]:


from sklearn.model_selection import train_test_split

# Remove the two columns that are not used as model inputs ...
unused_columns = ['DriversLicense', 'ZipCode']
data = data.drop(columns=unused_columns)

# ... and continue with the underlying NumPy array instead of the DataFrame.
# From here on `data` is an ndarray, so this cell cannot be re-run on its own output.
data = data.values


In [None]:
# In[19]:


# Features are the first 13 columns of the array; the label is the column at index 13
X = data[:, :13]
y = data[:, 13]

# Hold out 30% of the rows as the test set (fixed seed for a reproducible split)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
)


In [None]:
# In[20]:


from sklearn.preprocessing import MinMaxScaler

# Rescale every feature to the [0, 1] range.
# The scaler learns each feature's min/max from the training split only and
# then applies that same mapping to the test split. Re-fitting on the test
# split would scale the two splits differently and leak test-set statistics
# into the evaluation.
scaler = MinMaxScaler(feature_range=(0, 1))
rescaledX_train = scaler.fit_transform(X_train)
rescaledX_test = scaler.transform(X_test)


In [None]:
# In[21]:


from sklearn.linear_model import LogisticRegression

# Logistic-regression classifier with all hyper-parameters left at their defaults
logreg = LogisticRegression()

# Train on the rescaled training features; the fitted estimator is the cell's output
logreg.fit(X=rescaledX_train, y=y_train)

In [None]:
# In[22]:


# Show the hyper-parameters in effect on the fitted classifier.
# Querying the estimator itself avoids constructing a second, throwaway
# LogisticRegression from hand-typed keyword values (legacy placeholders such
# as solver='warn' and the deprecated multi_class keyword are not accepted by
# current scikit-learn releases).
logreg.get_params()


In [None]:
# In[23]:


from sklearn.metrics import confusion_matrix

# Class predictions for the rescaled test features
y_pred = logreg.predict(rescaledX_test)

# Mean accuracy of the classifier on the test split
test_accuracy = logreg.score(rescaledX_test, y_test)
print("Accuracy of logistic regression classifier: ", test_accuracy)

# Confusion matrix of true vs. predicted labels (displayed as the cell output)
confusion_matrix(y_test, y_pred)


In [None]:
# In[24]:


import sklearn.metrics as metrics
import matplotlib.pyplot as plt

# Calculate the fpr and tpr for all thresholds of the classification.
# The model was fitted on rescaled features, so it must be scored on the
# rescaled test features as well; feeding it the raw X_test would produce
# probabilities for inputs on a completely different scale.
probs = logreg.predict_proba(rescaledX_test)
preds = probs[:, 1]  # probability of the positive class (second column)
fpr, tpr, threshold = metrics.roc_curve(y_test, preds)
roc_auc = metrics.auc(fpr, tpr)

# Plot the ROC curve together with the chance diagonal for reference
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()


In [None]:

import statsmodels.formula.api as sm
import statsmodels.api as sma

# Reload the raw workbook; `data` is rebound to a fresh DataFrame here.
data = pd.read_excel("Approval.xlsx")

# Fill missing Debt values with the column mean. The result is assigned back
# to the column because `data['Debt'].fillna(..., inplace=True)` acts on a
# temporary Series and is not guaranteed to update the frame.
data['Debt'] = data['Debt'].fillna(data['Debt'].mean())

# glm stands for Generalized Linear Model; the Binomial family makes this a
# logistic regression of Approved on Debt.
mylogit = sm.glm(formula = "Approved ~ Debt", data = data, family = sma.families.Binomial()).fit()

# Coefficient table and fit statistics (displayed as the cell output)
mylogit.summary()
