## Identifying the null values and removing them using **pandas**

In [None]:
import pandas as pd

In [27]:
df = pd.read_csv("loan.csv")

print(df.head())

   Gender Married Dependents  LoanAmount  Loan_Amount_Term  Credit_History  \
0    Male     Yes          0       100.0             360.0             1.0   
1  Female     NaN          1       200.0             360.0             1.0   
2    Male     Yes        NaN       300.0             360.0             0.0   
3    Male      No        NaN       400.0             360.0             1.0   
4  Female     Yes          0         NaN               NaN             1.0   

  Loan_Status  
0           Y  
1           N  
2           Y  
3           Y  
4           Y  


In [29]:
print(df.isnull().sum())

Gender              0
Married             1
Dependents          3
LoanAmount          3
Loan_Amount_Term    3
Credit_History      3
Loan_Status         4
dtype: int64


In [31]:
new_df = df.dropna()

print(new_df)

  Gender Married Dependents  LoanAmount  Loan_Amount_Term  Credit_History  \
0   Male     Yes          0       100.0             360.0             1.0   

  Loan_Status  
0           Y  


## Data preprocessing using **scikit learn**

In [None]:
import pandas as pd

In [42]:
df = pd.read_csv("loan.csv")

print(df)

    Gender Married Dependents  LoanAmount  Loan_Amount_Term  Credit_History  \
0     Male     Yes          0       100.0             360.0             1.0   
1   Female     NaN          1       200.0             360.0             1.0   
2     Male     Yes        NaN       300.0             360.0             0.0   
3     Male      No        NaN       400.0             360.0             1.0   
4   Female     Yes          0         NaN               NaN             1.0   
5     Male     Yes         3+       600.0               NaN             0.0   
6   Female     Yes          2       700.0               NaN             1.0   
7     Male      No          1       800.0             360.0             NaN   
8   Female     Yes          2       900.0             360.0             NaN   
9     Male     Yes          0      1000.0             360.0             NaN   
10    Male      No         3+      1100.0             360.0             1.0   
11  Female      No          0      1200.0           

In [44]:
print(df["LoanAmount"])

0      100.0
1      200.0
2      300.0
3      400.0
4        NaN
5      600.0
6      700.0
7      800.0
8      900.0
9     1000.0
10    1100.0
11    1200.0
12    1300.0
13       NaN
14       NaN
Name: LoanAmount, dtype: float64


### Filling the null values using the **Simple Imputer**

In [55]:
from sklearn.impute import SimpleImputer

In [56]:
imputer = SimpleImputer(strategy='mean')

imputed_loan_amt_mean = imputer.fit_transform(df[["LoanAmount"]])

print(imputed_loan_amt_mean)

[[ 100.        ]
 [ 200.        ]
 [ 300.        ]
 [ 400.        ]
 [ 716.66666667]
 [ 600.        ]
 [ 700.        ]
 [ 800.        ]
 [ 900.        ]
 [1000.        ]
 [1100.        ]
 [1200.        ]
 [1300.        ]
 [ 716.66666667]
 [ 716.66666667]]


In [57]:
imputer = SimpleImputer(strategy='median')

imputed_loan_amt_median = imputer.fit_transform(df[["LoanAmount"]])

print(imputed_loan_amt_median)

[[ 100.]
 [ 200.]
 [ 300.]
 [ 400.]
 [ 750.]
 [ 600.]
 [ 700.]
 [ 800.]
 [ 900.]
 [1000.]
 [1100.]
 [1200.]
 [1300.]
 [ 750.]
 [ 750.]]


In [58]:
imputer = SimpleImputer(strategy='most_frequent')

imputed_loan_amt_most_frequent = imputer.fit_transform(df[["LoanAmount"]])

print(imputed_loan_amt_most_frequent)

[[ 100.]
 [ 200.]
 [ 300.]
 [ 400.]
 [ 100.]
 [ 600.]
 [ 700.]
 [ 800.]
 [ 900.]
 [1000.]
 [1100.]
 [1200.]
 [1300.]
 [ 100.]
 [ 100.]]


### Filling the null values using the **SimpleImputer** for the colums with string values

In [63]:
print(df.head())

   Gender Married Dependents  LoanAmount  Loan_Amount_Term  Credit_History  \
0    Male     Yes          0       100.0             360.0             1.0   
1  Female     NaN          1       200.0             360.0             1.0   
2    Male     Yes        NaN       300.0             360.0             0.0   
3    Male      No        NaN       400.0             360.0             1.0   
4  Female     Yes          0         NaN               NaN             1.0   

  Loan_Status  
0           Y  
1           N  
2           Y  
3           Y  
4           Y  


In [69]:
string_cols = ["Gender", "Married", "Dependents", "Loan_Status"]

num_cols = ["LoanAmount", "Loan_Amount_Term", "Credit_History"]

imputer = SimpleImputer(strategy='most_frequent')

num_imputer = SimpleImputer(strategy='median')

for col in string_cols:
  df[[col]] = imputer.fit_transform(df[[col]])

for col in num_cols:
  df[[col]] = num_imputer.fit_transform(df[[col]])

print(df)

    Gender Married Dependents  LoanAmount  Loan_Amount_Term  Credit_History  \
0     Male     Yes          0       100.0             360.0             1.0   
1   Female     Yes          1       200.0             360.0             1.0   
2     Male     Yes          0       300.0             360.0             0.0   
3     Male      No          0       400.0             360.0             1.0   
4   Female     Yes          0       750.0             360.0             1.0   
5     Male     Yes         3+       600.0             360.0             0.0   
6   Female     Yes          2       700.0             360.0             1.0   
7     Male      No          1       800.0             360.0             1.0   
8   Female     Yes          2       900.0             360.0             1.0   
9     Male     Yes          0      1000.0             360.0             1.0   
10    Male      No         3+      1100.0             360.0             1.0   
11  Female      No          0      1200.0           

In [70]:
df.to_csv("updated_loan_csv")

### Scaling the values using the **Standard Scaler**

In [59]:
from sklearn.preprocessing import StandardScaler

In [61]:
scaler = StandardScaler()

scaled_loan_amt = scaler.fit_transform(imputed_loan_amt_mean)

print(scaled_loan_amt)

[[-1.79181483e+00]
 [-1.50125026e+00]
 [-1.21068569e+00]
 [-9.20121128e-01]
 [ 3.30333667e-16]
 [-3.38991994e-01]
 [-4.84274278e-02]
 [ 2.42137139e-01]
 [ 5.32701705e-01]
 [ 8.23266272e-01]
 [ 1.11383084e+00]
 [ 1.40439541e+00]
 [ 1.69495997e+00]
 [ 3.30333667e-16]
 [ 3.30333667e-16]]


### Scaling the values using the **MinMax Scaler**

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [62]:
scaler = MinMaxScaler()

scaled_loan_amt = scaler.fit_transform(imputed_loan_amt_mean)

print(scaled_loan_amt)

[[0.        ]
 [0.08333333]
 [0.16666667]
 [0.25      ]
 [0.51388889]
 [0.41666667]
 [0.5       ]
 [0.58333333]
 [0.66666667]
 [0.75      ]
 [0.83333333]
 [0.91666667]
 [1.        ]
 [0.51388889]
 [0.51388889]]


## Label Encoders in **scikit learn**

In [71]:
from sklearn.preprocessing import LabelEncoder

In [73]:
df = pd.read_csv("loan2.csv")

In [74]:
df

Unnamed: 0,Loan_ID,Location,ApplicantIncome,LoanAmount,Loan_Status
0,LP001,Rural,5000,128,Y
1,LP002,Urban,6000,150,N
2,LP003,Rural,7000,180,Y
3,LP004,Semiurban,2000,50,Y
4,LP005,Semiurban,3000,70,N
5,LP006,Urban,4000,100,Y
6,LP007,Rural,8000,200,N
7,LP008,Urban,3500,90,Y
8,LP009,Rural,9000,300,N
9,LP010,Urban,4500,120,Y


In [79]:
encoder = LabelEncoder()
df['Location'] = encoder.fit_transform(df['Location'])
print(df)

   Loan_ID  Location  ApplicantIncome  LoanAmount Loan_Status
0    LP001         0             5000         128           Y
1    LP002         2             6000         150           N
2    LP003         0             7000         180           Y
3    LP004         1             2000          50           Y
4    LP005         1             3000          70           N
5    LP006         2             4000         100           Y
6    LP007         0             8000         200           N
7    LP008         2             3500          90           Y
8    LP009         0             9000         300           N
9    LP010         2             4500         120           Y
10   LP011         0             3200          80           N
11   LP012         2             2200          60           Y
12   LP013         1             7500         250           N
13   LP014         2             6200         170           Y
14   LP015         0             2800          75           N
15   LP0

### One Hot Encoding

In [85]:
from sklearn.preprocessing import OneHotEncoder

In [80]:
df = pd.read_csv("loan3.csv")

df

Unnamed: 0,Loan_ID,Gender,Married,Education,ApplicantIncome,LoanAmount,Loan_Status
0,LP001,Male,Yes,Graduate,5000,128,Y
1,LP002,Female,Yes,Graduate,6000,150,N
2,LP003,Male,Yes,Graduate,7000,180,Y
3,LP004,Male,No,Not Graduate,2000,50,Y
4,LP005,Female,Yes,Not Graduate,3000,70,N


In [81]:
if "Gender" in df:
  print("Column Exists")
else:
  print("Column does not exists")

Column Exists


In [82]:
enocder = LabelEncoder()

df["Gender_Label_Encoded"] = enocder.fit_transform(df["Gender"])

In [83]:
df

Unnamed: 0,Loan_ID,Gender,Married,Education,ApplicantIncome,LoanAmount,Loan_Status,Gender_Label_Encoded
0,LP001,Male,Yes,Graduate,5000,128,Y,1
1,LP002,Female,Yes,Graduate,6000,150,N,0
2,LP003,Male,Yes,Graduate,7000,180,Y,1
3,LP004,Male,No,Not Graduate,2000,50,Y,1
4,LP005,Female,Yes,Not Graduate,3000,70,N,0


In [91]:
encoder = OneHotEncoder()

df[["Gender_Male"]] = encoder.fit_transform(df[["Gender"]])

df

Unnamed: 0,Loan_ID,Gender,Married,Education,ApplicantIncome,LoanAmount,Loan_Status,Gender_Label_Encoded,Gender_Male
0,LP001,Male,Yes,Graduate,5000,128,Y,1,<Compressed Sparse Row sparse matrix of dtype ...
1,LP002,Female,Yes,Graduate,6000,150,N,0,<Compressed Sparse Row sparse matrix of dtype ...
2,LP003,Male,Yes,Graduate,7000,180,Y,1,<Compressed Sparse Row sparse matrix of dtype ...
3,LP004,Male,No,Not Graduate,2000,50,Y,1,<Compressed Sparse Row sparse matrix of dtype ...
4,LP005,Female,Yes,Not Graduate,3000,70,N,0,<Compressed Sparse Row sparse matrix of dtype ...
