In [None]:
import pandas as pd #Data manipulation lib
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from imblearn.over_sampling import RandomOverSampler, SMOTE 

In [None]:
# Load the loan-prediction CSV, discard every column that contains at least one
# missing value, then remove the identifier and lead-creation-date columns.
DATA_URL = 'https://raw.githubusercontent.com/SuperDataWorld/Python/main/Data/loanpred.csv'
df = (
    pd.read_csv(DATA_URL)
    .dropna(axis=1)                              # axis=1 -> drop columns, not rows
    .drop(columns=['ID', 'Lead_Creation_Date'])
)

In [None]:
# Column names, dtypes and non-null counts of the frame after the column drops.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 69713 entries, 0 to 69712
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Gender           69713 non-null  object 
 1   Monthly_Income   69713 non-null  float64
 2   Contacted        69713 non-null  object 
 3   Source           69713 non-null  object 
 4   Source_Category  69713 non-null  object 
 5   Var1             69713 non-null  int64  
 6   Approved         69713 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 3.7+ MB


In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
df.describe()

Unnamed: 0,Monthly_Income,Var1,Approved
count,69713.0,69713.0,69713.0
mean,5622.283,3.948446,0.014631
std,174767.1,3.819214,0.120073
min,0.0,0.0,0.0
25%,1650.0,0.0,0.0
50%,2500.0,2.0,0.0
75%,4000.0,7.0,0.0
max,38383840.0,10.0,1.0


In [None]:
# Preview the first rows of the frame before any encoding is applied.
df.head()

Unnamed: 0,Gender,Monthly_Income,Contacted,Source,Source_Category,Var1,Approved
0,Female,2000.0,N,S122,G,0,0
1,Male,3500.0,Y,S122,G,10,0
2,Male,2250.0,Y,S143,B,0,0
3,Male,3500.0,Y,S143,B,7,0
4,Male,10000.0,Y,S134,B,10,0


In [None]:
# Encode categorical fields as integer category codes.
# Each listed column is converted to pandas' 'category' dtype and replaced by
# its integer codes. Var1 is already numeric (int64) but is re-coded as well,
# so its values become category positions rather than the original numbers.
categorical_cols = ['Gender', 'Contacted', 'Source', 'Source_Category', 'Var1']
for col in categorical_cols:
    as_category = df[col].astype('category')
    df[col] = as_category.cat.codes

In [None]:
# Separate the feature matrix from the target. The target is kept as a
# one-column DataFrame (double brackets), not as a Series.
target_col = 'Approved'
X = df.drop(columns=[target_col])
y = df[[target_col]]

In [None]:
# Class counts of the target, to see how imbalanced Approved is.
y.value_counts()

Approved
0           68693
1            1020
dtype: int64

In [None]:
# Ratio of approved (1) to not-approved (0) rows in the full label frame.
# Derived from y instead of hand-copied counts, so it cannot go stale when the
# data changes. int(...) keeps the result a plain Python float for display.
n_approved = int((y['Approved'] == 1).sum())
n_not_approved = int((y['Approved'] == 0).sum())
round(n_approved / n_not_approved, 4)

0.0148

### **Random vs. Stratified Split**

In [None]:
# Plain random 80/20 split. No `stratify` argument is passed, so the class
# proportions of the two parts are not forced to match; the printed counts
# show the resulting class balance in each part.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=21
)
for labels in (y_train, y_test):
    print(labels.value_counts())

Approved
0           54929
1             841
dtype: int64
Approved
0           13764
1             179
dtype: int64


In [None]:
# Approved : not-approved ratio inside the current training labels.
# Computed from y_train rather than re-typing the counts printed above, so the
# figure follows the data if the split seed or dataset changes.
train_approved = int((y_train['Approved'] == 1).sum())
train_not_approved = int((y_train['Approved'] == 0).sum())
round(train_approved / train_not_approved, 4)

0.0153

In [None]:
# Approved : not-approved ratio inside the current test labels.
# Computed from y_test rather than re-typing the counts printed above, so the
# figure follows the data if the split seed or dataset changes.
test_approved = int((y_test['Approved'] == 1).sum())
test_not_approved = int((y_test['Approved'] == 0).sum())
round(test_approved / test_not_approved, 4)

0.013

In [None]:
# Stratified 80/20 split. With n_splits=1 the splitter yields exactly one
# (train, test) pair of positional index arrays, so it is taken with next()
# instead of a one-iteration loop. X_train/X_test/y_train/y_test from the
# random split are overwritten here.
splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.20, random_state=12)
train, test = next(splitter.split(X, y))
X_train, X_test = X.iloc[train], X.iloc[test]
y_train, y_test = y.iloc[train], y.iloc[test]
for labels in (y_train, y_test):
    print(labels.value_counts())

Approved
0           54954
1             816
dtype: int64
Approved
0           13739
1             204
dtype: int64


In [None]:
# Approved : not-approved ratio inside the current training labels, unrounded.
# Computed from y_train rather than from hand-copied counts so it stays in sync
# with the split; int(...) keeps the result a plain Python float for display.
train_approved = int((y_train['Approved'] == 1).sum())
train_not_approved = int((y_train['Approved'] == 0).sum())
train_approved / train_not_approved

0.01484878261818976

In [None]:
# Approved : not-approved ratio inside the current test labels, unrounded.
# Computed from y_test rather than from hand-copied counts so it stays in sync
# with the split; int(...) keeps the result a plain Python float for display.
test_approved = int((y_test['Approved'] == 1).sum())
test_not_approved = int((y_test['Approved'] == 0).sum())
test_approved / test_not_approved

0.014848242230147754

### **SMOTE Sample**

In [None]:
# SMOTE resample: oversample the training split only (X_test / y_test are not
# touched) and print the class counts of the resampled labels.
# NOTE(review): Gender, Contacted, Source, Source_Category and Var1 are integer
# category codes at this point, and plain SMOTE treats every column as numeric.
# Consider SMOTENC for the categorical columns; confirm before modelling.
sm = SMOTE(random_state=42)
X_sm, y_sm = sm.fit_resample(X_train, y_train)
print(y_sm.value_counts()) 

Approved
0           54954
1           54954
dtype: int64


In [None]:
# Last ten rows of the SMOTE-resampled feature frame.
X_sm.tail(10)

Unnamed: 0,Gender,Monthly_Income,Contacted,Source,Source_Category,Var1
109898,0,4900.0,0,0,6,2
109899,0,10610.588107,0,11,1,4
109900,0,2800.0,0,0,6,2
109901,1,4000.0,1,0,6,3
109902,0,2500.0,0,6,1,3
109903,0,8000.0,0,8,1,4
109904,1,4000.0,1,24,2,3
109905,0,3028.71105,1,19,1,2
109906,0,15000.0,1,4,2,3
109907,1,5800.0,1,2,1,3


In [None]:
# Last ten rows of the SMOTE-resampled label frame.
y_sm.tail(10)

Unnamed: 0,Approved
109898,1
109899,1
109900,1
109901,1
109902,1
109903,1
109904,1
109905,1
109906,1
109907,1


### **Random Oversampling Sample**

In [None]:
# Random oversampling of the minority class of the training split.
# random_state is pinned so that the rows drawn (and every output derived from
# X_over / y_over) are reproducible on Restart & Run All; without it each run
# draws a different sample.
oversample = RandomOverSampler(sampling_strategy='minority', random_state=42)
# Fit on and resample the training data only; the test split stays untouched.
X_over, y_over = oversample.fit_resample(X_train, y_train)

In [None]:
# Class counts of the labels after random oversampling.
print(y_over.value_counts())

Approved
0           54954
1           54954
dtype: int64


In [None]:
# Last ten rows of the randomly oversampled feature frame.
X_over.tail(10)

Unnamed: 0,Gender,Monthly_Income,Contacted,Source,Source_Category,Var1
109898,1,6000.0,1,0,6,3
109899,0,5100.0,0,0,6,0
109900,1,5000.0,1,6,1,4
109901,1,4900.0,1,6,1,4
109902,1,2180.0,1,6,2,2
109903,1,4200.0,1,6,1,4
109904,0,6300.0,0,6,1,0
109905,1,3200.0,1,0,6,3
109906,1,5090.0,1,6,1,4
109907,0,3500.0,0,0,1,0


In [None]:
# Count the rows of the randomly oversampled features that match one specific
# combination of feature values (per-column non-null counts of the matches).
profile_filter = 'Monthly_Income == 6500 & Contacted == 1 & Source == 6 & Var1 == 4 & Gender == 1 & Source_Category == 1'
matches_over = X_over.query(profile_filter)
matches_over.count()

Gender             231
Monthly_Income     231
Contacted          231
Source             231
Source_Category    231
Var1               231
dtype: int64

In [None]:
# Count the rows of the SMOTE-resampled features that match one specific
# combination of feature values (per-column non-null counts of the matches).
profile_filter = 'Monthly_Income == 6500 & Contacted == 1 & Source == 6 & Var1 == 4 & Gender == 1 & Source_Category == 1'
matches_sm = X_sm.query(profile_filter)
matches_sm.count()

Gender             151
Monthly_Income     151
Contacted          151
Source             151
Source_Category    151
Var1               151
dtype: int64

In [None]:
# Count the rows of the original (un-resampled) features that match one
# specific combination of feature values (per-column non-null counts).
profile_filter = 'Monthly_Income == 6500 & Contacted == 1 & Source == 6 & Var1 == 4 & Gender == 1 & Source_Category == 1'
matches_original = X.query(profile_filter)
matches_original.count()

Gender             48
Monthly_Income     48
Contacted          48
Source             48
Source_Category    48
Var1               48
dtype: int64