# AIML Project - Bank and Finance - Potential Customer prediction

### 1. Import and warehouse data:

In [1]:
import pandas as pd
import numpy as np

# Directory that holds the two input CSV files; change it here (once) when the
# notebook is run somewhere other than the original /content location.
DATA_DIR = '/content'

# Load the two source tables; both carry an 'ID' column that is used to join them later.
df_per = pd.read_csv(f'{DATA_DIR}/Data1.csv')
df_fin = pd.read_csv(f'{DATA_DIR}/Data2.csv')

In [2]:
# Preview the first five rows of the table loaded from Data1.csv
df_per.head()

Unnamed: 0,ID,Age,CustomerSince,HighestSpend,ZipCode,HiddenScore,MonthlyAverageSpend,Level
0,1,25,1,49,91107,4,1.6,1
1,2,45,19,34,90089,3,1.5,1
2,3,39,15,11,94720,1,1.0,1
3,4,35,9,100,94112,1,2.7,2
4,5,35,8,45,91330,4,1.0,2


In [3]:
# Preview the first five rows of the table loaded from Data2.csv
df_fin.head()

Unnamed: 0,ID,Mortgage,Security,FixedDepositAccount,InternetBanking,CreditCard,LoanOnCard
0,1,0,1,0,0,0,
1,2,0,1,0,0,0,
2,3,0,0,0,0,0,
3,4,0,0,0,0,0,
4,5,0,0,0,0,1,


In [4]:
# Join the two tables on their shared 'ID' column (pandas' default inner join,
# so only IDs present in both tables are kept), then preview the result.

df = df_per.merge(df_fin, on='ID')
df.head()

Unnamed: 0,ID,Age,CustomerSince,HighestSpend,ZipCode,HiddenScore,MonthlyAverageSpend,Level,Mortgage,Security,FixedDepositAccount,InternetBanking,CreditCard,LoanOnCard
0,1,25,1,49,91107,4,1.6,1,0,1,0,0,0,
1,2,45,19,34,90089,3,1.5,1,0,1,0,0,0,
2,3,39,15,11,94720,1,1.0,1,0,0,0,0,0,
3,4,35,9,100,94112,1,2.7,2,0,0,0,0,0,
4,5,35,8,45,91330,4,1.0,2,0,0,0,0,1,


In [5]:
# Report the merged frame's (rows, columns) tuple and its total element count
frame_shape = df.shape
frame_size = df.size
print("> Shape:", frame_shape)
print("\n> Size:", frame_size)

> Shape: (5000, 14)

> Size: 70000


In [6]:
# List the column labels of the merged frame
df.columns

Index(['ID', 'Age', 'CustomerSince', 'HighestSpend', 'ZipCode', 'HiddenScore',
       'MonthlyAverageSpend', 'Level', 'Mortgage', 'Security',
       'FixedDepositAccount', 'InternetBanking', 'CreditCard', 'LoanOnCard'],
      dtype='object')

In [7]:
# Show the dtype pandas inferred for each column of the merged frame
df.dtypes

ID                       int64
Age                      int64
CustomerSince            int64
HighestSpend             int64
ZipCode                  int64
HiddenScore              int64
MonthlyAverageSpend    float64
Level                    int64
Mortgage                 int64
Security                 int64
FixedDepositAccount      int64
InternetBanking          int64
CreditCard               int64
LoanOnCard             float64
dtype: object

### 2. Data cleansing 

In [8]:
# Count the missing values in every column
df.isna().sum()

ID                      0
Age                     0
CustomerSince           0
HighestSpend            0
ZipCode                 0
HiddenScore             0
MonthlyAverageSpend     0
Level                   0
Mortgage                0
Security                0
FixedDepositAccount     0
InternetBanking         0
CreditCard              0
LoanOnCard             20
dtype: int64

In [9]:
# Element count (rows x columns) before incomplete rows are removed
Original_size = df.size

# Drop, in place, every row that contains at least one missing value.
# NOTE: df itself is modified, so re-running this cell without re-creating df
# compares the already-cleaned frame with itself and reports 0 % lost.
df.dropna(inplace=True)

# Element count after the drop
New_size = df.size

print("> Original:", Original_size, "vs", "New:", New_size)

# Data drop impact analysis: share of elements removed, as a percentage
Data_lost = 100*((Original_size-New_size)/Original_size)
print("> Percentage of data lost", Data_lost, "%")

# A loss of up to 5 % is treated as acceptable; anything above recommends imputation
verdict = (
    "\n> Not much data has been lost,dropping data was a good option"
    if Data_lost <= 5
    else "\n> Considerable data has been lost,dropping data was not a good option. We need to impute data"
)
print(verdict)

> Original: 70000 vs New: 69720
> Percentage of data lost 0.4 %

> Not much data has been lost,dropping data was a good option


In [10]:
# Replace negative values in the 'CustomerSince' column with their absolute value
df['CustomerSince'] = df['CustomerSince'].abs()

### 3. Data analysis & visualisation: 

In [11]:
# TODO: exhaustive EDA - this cell is an empty placeholder

In [12]:
# TODO: exhaustive statistical analysis - this cell is an empty placeholder

### 4. Data pre-processing:

In [13]:
# Split the cleaned frame into predictors (every column except the target) and the target

target_col = 'LoanOnCard'
predictor_cols = [col for col in df.columns if col != target_col]
X_df = df[predictor_cols]
y_df = df[target_col]

In [14]:
# Count the rows in each class of the target ('LoanOnCard') to see how imbalanced it is;
# the balancing itself is done by the resampling cells that follow.
y_df.value_counts()

0.0    4500
1.0     480
Name: LoanOnCard, dtype: int64

In [None]:
# Up-sample the minority class (LoanOnCard == 1)

from sklearn.utils import resample

# Separate majority and minority classes.
# The masks are built from df's own target column instead of y_df: y_df is
# re-bound to the resampled target further down the notebook, so masking df
# with it when this cell is re-run would no longer align with df's index.
df_majority = df[df['LoanOnCard'] == 0]
df_minority = df[df['LoanOnCard'] == 1]

# Upsample minority class
df_minority_upsampled = resample(df_minority,
                                 replace=True,     # sample with replacement
                                 n_samples=4400,   # close to, but not equal to, the 4500 majority rows
                                 random_state=123) # reproducible results

# NOTE(review): rows are duplicated here BEFORE the train/test split that comes
# later, so copies of the same minority row can land in both partitions and
# inflate the test scores. Consider splitting first and resampling only the
# training part.

# Combine majority class with upsampled minority class
df_upsampled = pd.concat([df_majority, df_minority_upsampled])

# Display new class counts
df_upsampled['LoanOnCard'].value_counts()

0.0    4500
1.0    4400
Name: LoanOnCard, dtype: int64

In [None]:
# Top 5 rows of the up-sampled dataframe
df_upsampled.head()

Unnamed: 0,ID,Age,CustomerSince,HighestSpend,ZipCode,HiddenScore,MonthlyAverageSpend,Level,Mortgage,Security,FixedDepositAccount,InternetBanking,CreditCard,LoanOnCard
10,11,65,39,105,94710,4,2.4,3,0,0,0,0,0,0.0
11,12,29,5,45,90277,3,0.1,2,0,0,0,1,0,0.0
12,13,48,23,114,93106,2,3.8,3,0,1,0,0,0,0.0
13,14,59,32,40,94920,4,2.5,2,0,0,0,1,0,0.0
14,15,67,41,112,91741,1,2.0,1,0,1,0,0,0,0.0


In [None]:
# Report the up-sampled frame's (rows, columns) tuple and its total element count
upsampled_shape = df_upsampled.shape
upsampled_size = df_upsampled.size
print("> Shape:", upsampled_shape)
print("\n> Size:", upsampled_size)

> Shape: (8900, 14)

> Size: 124600


In [None]:
# Down-sample the majority class (LoanOnCard == 0)

from sklearn.utils import resample

# Separate majority and minority classes.
# The masks are built from df's own target column instead of y_df: y_df is
# re-bound to the resampled target further down the notebook, so masking df
# with it when this cell is re-run would no longer align with df's index.
df_majority = df[df['LoanOnCard'] == 0]
df_minority = df[df['LoanOnCard'] == 1]

# Downsample majority class
df_majority_downsampled = resample(df_majority,
                                 replace=False,    # sample without replacement
                                 n_samples=600,    # somewhat more than the 480 minority rows, not an exact match
                                 random_state=123) # reproducible results

# Combine downsampled majority class with the untouched minority class
df_downsampled = pd.concat([df_majority_downsampled, df_minority])

# Display new class counts
df_downsampled['LoanOnCard'].value_counts()

In [None]:
# Top 5 rows of the down-sampled dataframe
df_downsampled.head()

In [None]:
# Report the down-sampled frame's (rows, columns) tuple and its total element count
downsampled_shape = df_downsampled.shape
downsampled_size = df_downsampled.size
print("> Shape:", downsampled_shape)
print("\n> Size:", downsampled_size)

In [None]:
# Segregate predictors vs target attributes, this time from a balanced frame

# Single place to choose the balanced frame (df_upsampled or df_downsampled);
# predictors and target below are both derived from this one selection.
balanced_df = df_upsampled

# NOTE(review): the predictors still include 'ID' (and 'ZipCode'); 'ID' looks
# like a row identifier - confirm it is really meant to be a model feature.
X_df = balanced_df.drop(columns='LoanOnCard')
y_df = balanced_df['LoanOnCard']

In [None]:
# Hold out 30 % of the rows for testing; the fixed seed keeps the split reproducible
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    X_df,
    y_df,
    test_size=0.3,
    random_state=1,
)

### 5. Model training, testing and tuning: 

In [None]:
# Logistic regression: fit on the training split, then report accuracy and the
# confusion matrix for the training split and for the held-out test split.

from sklearn.linear_model import LogisticRegression
from sklearn import metrics

model = LogisticRegression()
model.fit(x_train, y_train)

# Performance on the rows the model was fitted on (previously mislabelled "Test")
model_score = model.score(x_train, y_train)
y_predict = model.predict(x_train)
print("\nTrain accuracy:",round(100*model_score,2),"%")
print("\nConfusion matrix:\n\n",metrics.confusion_matrix(y_train, y_predict))

# Performance on the held-out rows
model_score = model.score(x_test, y_test)
y_predict = model.predict(x_test)

print("\nTest accuracy:",round(100*model_score,2),"%")
print("\nConfusion matrix:\n\n",metrics.confusion_matrix(y_test, y_predict))


Test accuracy: 86.71 %

Confusion matrix:

 [[2668  475]
 [ 353 2734]]

Test accuracy: 86.59 %

Confusion matrix:

 [[1145  212]
 [ 146 1167]]


In [None]:
# Gaussian Naive Bayes: fit on the training split, then report accuracy and the
# confusion matrix for the training split and for the held-out test split.

from sklearn.naive_bayes import GaussianNB
from sklearn import metrics  # imported here too so the cell does not rely on an earlier cell

# Note: this re-binds `model`, replacing the estimator fitted in the previous cell
model = GaussianNB()
model.fit(x_train, y_train)

# Performance on the rows the model was fitted on (previously mislabelled "Test")
model_score = model.score(x_train, y_train)
y_predict = model.predict(x_train)
print("\nTrain accuracy:",round(100*model_score,2),"%")
print("\nConfusion matrix:\n\n",metrics.confusion_matrix(y_train, y_predict))

# Performance on the held-out rows
model_score = model.score(x_test, y_test)
y_predict = model.predict(x_test)

print("\nTest accuracy:",round(100*model_score,2),"%")
print("\nConfusion matrix:\n\n",metrics.confusion_matrix(y_test, y_predict))


Test accuracy: 83.29 %

Confusion matrix:

 [[2770  373]
 [ 668 2419]]

Test accuracy: 83.75 %

Confusion matrix:

 [[1196  161]
 [ 273 1040]]
