In [1]:
import logging
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, MinMaxScaler


In [2]:
# Configure logging once for the whole notebook: INFO level, with timestamp,
# logger name and level in front of every message.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
# Named logger used by the cells below.
logger = logging.getLogger("Dev-Notebook")

In [3]:
raw_train_data = pd.read_csv("train.csv")
raw_train_data.head()
                         

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [4]:
raw_train_data.shape

(614, 13)

In [5]:
raw_train_data.nunique()

Loan_ID              614
Gender                 2
Married                2
Dependents             4
Education              2
Self_Employed          2
ApplicantIncome      505
CoapplicantIncome    287
LoanAmount           203
Loan_Amount_Term      10
Credit_History         2
Property_Area          3
Loan_Status            2
dtype: int64

In [6]:
raw_test_data = pd.read_csv("test.csv")
raw_test_data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001015,Male,Yes,0,Graduate,No,5720,0,110,360.0,1.0,Urban
1,LP001022,Male,Yes,1,Graduate,No,3076,1500,126,360.0,1.0,Urban
2,LP001031,Male,Yes,2,Graduate,No,5000,1800,208,360.0,1.0,Urban
3,LP001035,Male,Yes,2,Graduate,No,2340,2546,100,360.0,,Urban
4,LP001051,Male,No,0,Not Graduate,No,3276,0,78,360.0,1.0,Urban


In [7]:
train_df = raw_train_data.copy()
test_df = raw_test_data.copy()

In [8]:
train_df.info() # for training

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [9]:
test_df.info() # only for prediction

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 362 entries, 0 to 361
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            362 non-null    object 
 1   Gender             351 non-null    object 
 2   Married            362 non-null    object 
 3   Dependents         353 non-null    object 
 4   Education          362 non-null    object 
 5   Self_Employed      339 non-null    object 
 6   ApplicantIncome    362 non-null    int64  
 7   CoapplicantIncome  362 non-null    int64  
 8   LoanAmount         362 non-null    int64  
 9   Loan_Amount_Term   356 non-null    float64
 10  Credit_History     333 non-null    float64
 11  Property_Area      362 non-null    object 
dtypes: float64(2), int64(3), object(7)
memory usage: 34.1+ KB


In [10]:
train_y  =  train_df['Loan_Status'].copy()

In [11]:
# Remove the target from the feature frame (it was copied into `train_y` above).
# Re-binding the result instead of using inplace=True, together with
# errors='ignore', keeps this cell safe to re-run after the column is already gone.
train_df = train_df.drop(columns=['Loan_Status'], errors='ignore')
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
dtypes: float64(4), int64(1), object(7)
memory usage: 57.7+ KB


In [12]:
# Loan_ID is unique per row (614 distinct values in 614 rows), so it is removed
# from both frames. Re-binding with errors='ignore' lets this cell be re-run
# without raising KeyError once the column has been dropped.
train_df = train_df.drop(columns=['Loan_ID'], errors='ignore')
test_df = test_df.drop(columns=['Loan_ID'], errors='ignore')

In [13]:
train_df.columns

Index(['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
       'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area'],
      dtype='object')

In [14]:
test_df.columns

Index(['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
       'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area'],
      dtype='object')

In [15]:
train_df[train_df.duplicated()] # no duplicates

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area


In [16]:
test_df[test_df.duplicated()]

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
192,Male,No,0,Graduate,Yes,5833,0,116,360.0,1.0,Urban


In [17]:
# Drop exact duplicate rows from the prediction set (row 192 in the check above).
# NOTE(review): Loan_ID has already been removed, so rows are compared on features
# only, and the index is not reset afterwards — confirm that removing rows from the
# prediction set is intended.
test_df.drop_duplicates(inplace=True)

In [18]:
test_df[test_df.duplicated()]

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area


In [19]:
train_df.nunique()

Gender                 2
Married                2
Dependents             4
Education              2
Self_Employed          2
ApplicantIncome      505
CoapplicantIncome    287
LoanAmount           203
Loan_Amount_Term      10
Credit_History         2
Property_Area          3
dtype: int64

In [20]:
# Continuous numeric features. Credit_History is numeric in the raw data but has
# only two distinct values, so it is treated as categorical; it is excluded here
# by name rather than by its position in the column list.
num_cols = [
    col
    for col in train_df.select_dtypes(['int', 'float']).columns
    if col != 'Credit_History'
]
num_cols

['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term']

In [21]:
# Categorical features: every object-typed column, followed by Credit_History,
# which is stored as a float but has only two distinct values.
categorical_col = [*train_df.select_dtypes('object').columns, 'Credit_History']
categorical_col

['Gender',
 'Married',
 'Dependents',
 'Education',
 'Self_Employed',
 'Property_Area',
 'Credit_History']

In [22]:
# Fill gaps in the categorical columns with the most frequent value found in the
# training data; the same fitted imputer is then applied to the test frame.
cat_imputer = SimpleImputer(strategy='most_frequent').fit(train_df[categorical_col])

for frame in (train_df, test_df):
    frame[categorical_col] = cat_imputer.transform(frame[categorical_col])

In [23]:
# Fill gaps in the numeric columns with the training-set mean. The imputer gets
# its own name so that the fitted categorical imputer (`cat_imputer`) is not
# overwritten by an object that does something different.
num_imputer = SimpleImputer(strategy='mean')
num_imputer.fit(train_df[num_cols])

# The means learned from the training data are applied to both frames.
train_df[num_cols] = num_imputer.transform(train_df[num_cols])
test_df[num_cols] = num_imputer.transform(test_df[num_cols])

In [24]:
train_df.isna().sum()

Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
dtype: int64

In [25]:
# Domain-driven preprocessing: add applicant and co-applicant income together
# into one combined column, stored under the name "Applicant-Income".
for frame in (train_df, test_df):
    frame["Applicant-Income"] = frame["ApplicantIncome"] + frame["CoapplicantIncome"]

In [26]:
# The two source income columns are now represented by "Applicant-Income".
# Re-binding with errors='ignore' (rather than an inplace drop) lets the cell be
# re-run without a KeyError; `columns=` matches the earlier drop cells.
train_df = train_df.drop(columns=['ApplicantIncome', "CoapplicantIncome"], errors='ignore')
train_df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Applicant-Income
0,Male,No,0,Graduate,No,146.412162,360.0,1.0,Urban,5849.0
1,Male,Yes,1,Graduate,No,128.0,360.0,1.0,Rural,6091.0
2,Male,Yes,0,Graduate,Yes,66.0,360.0,1.0,Urban,3000.0
3,Male,Yes,0,Not Graduate,No,120.0,360.0,1.0,Urban,4941.0
4,Male,No,0,Graduate,No,141.0,360.0,1.0,Urban,6000.0


In [27]:
# Same clean-up for the prediction frame: the source income columns are replaced
# by "Applicant-Income". Re-binding with errors='ignore' keeps the cell re-runnable.
test_df = test_df.drop(columns=['ApplicantIncome', "CoapplicantIncome"], errors='ignore')
test_df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Applicant-Income
0,Male,Yes,0,Graduate,No,110.0,360.0,1.0,Urban,5720.0
1,Male,Yes,1,Graduate,No,126.0,360.0,1.0,Urban,4576.0
2,Male,Yes,2,Graduate,No,208.0,360.0,1.0,Urban,6800.0
3,Male,Yes,2,Graduate,No,100.0,360.0,1.0,Urban,4886.0
4,Male,No,0,Not Graduate,No,78.0,360.0,1.0,Urban,3276.0


In [28]:
train_df.nunique()

Gender                2
Married               2
Dependents            4
Education             2
Self_Employed         2
LoanAmount          204
Loan_Amount_Term     11
Credit_History        2
Property_Area         3
Applicant-Income    554
dtype: int64

In [29]:
train_df.Dependents.unique()

array(['0', '1', '2', '3+'], dtype=object)

In [30]:
train_df.Property_Area.unique()

array(['Urban', 'Rural', 'Semiurban'], dtype=object)

In [31]:
# Integer-encode each categorical column. One encoder per column is fitted on the
# training data and kept in `label_encoders`, so the same mapping can be reapplied
# to new data (or inverted) later instead of being lost when the loop ends.
label_encoders = {}
for col in categorical_col:
    le = LabelEncoder()
    train_df[col] = le.fit_transform(train_df[col])
    # transform() raises ValueError for a label that was not seen during fit.
    test_df[col] = le.transform(test_df[col])
    label_encoders[col] = le

In [32]:
train_df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Applicant-Income
0,1,0,0,0,0,146.412162,360.0,1,2,5849.0
1,1,1,1,0,0,128.0,360.0,1,0,6091.0
2,1,1,0,0,1,66.0,360.0,1,2,3000.0
3,1,1,0,1,0,120.0,360.0,1,2,4941.0
4,1,0,0,0,0,141.0,360.0,1,2,6000.0


In [33]:
# log transformations
# train_df[num_cols] = np.log()
num_cols

['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term']

In [34]:
# CoapplicantIncome was folded into "Applicant-Income" and dropped from the frames.
# Filtering (unlike list.remove) does not raise if this cell is run a second time.
num_cols = [col for col in num_cols if col != "CoapplicantIncome"]

In [35]:
# Point the numeric column list at the combined income feature. The rename is done
# by value rather than by list position, so it does not depend on column order.
num_cols = ["Applicant-Income" if col == "ApplicantIncome" else col for col in num_cols]
num_cols

['Applicant-Income', 'LoanAmount', 'Loan_Amount_Term']

In [36]:
# Natural-log transform of the numeric features in the training frame.
# NOTE(review): np.log maps 0 to -inf — confirm none of these columns can hold zero.
train_logged = np.log(train_df.loc[:, num_cols])
train_df[num_cols] = train_logged
train_df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Applicant-Income
0,1,0,0,0,0,4.986426,5.886104,1,2,8.674026
1,1,1,1,0,0,4.85203,5.886104,1,0,8.714568
2,1,1,0,0,1,4.189655,5.886104,1,2,8.006368
3,1,1,0,1,0,4.787492,5.886104,1,2,8.505323
4,1,0,0,0,0,4.94876,5.886104,1,2,8.699515


In [37]:
# Natural-log transform of the numeric features in the prediction frame.
# NOTE(review): np.log maps 0 to -inf — confirm none of these columns can hold zero.
test_logged = np.log(test_df.loc[:, num_cols])
test_df[num_cols] = test_logged
test_df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Applicant-Income
0,1,1,0,0,0,4.70048,5.886104,1,2,8.651724
1,1,1,1,0,0,4.836282,5.886104,1,2,8.428581
2,1,1,2,0,0,5.337538,5.886104,1,2,8.824678
3,1,1,2,0,0,4.60517,5.886104,1,2,8.494129
4,1,0,0,1,0,4.356709,5.886104,1,2,8.094378


In [38]:
# Min-max scaling of the numeric features: the scaler learns each column's range
# from the training frame only and is then applied unchanged to the test frame.
minmax = MinMaxScaler().fit(train_df[num_cols])
train_df[num_cols] = minmax.transform(train_df[num_cols])
test_df[num_cols] = minmax.transform(test_df[num_cols])

In [39]:
train_df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Applicant-Income
0,1,0,0,0,0,0.640628,0.922014,1,2,0.34759
1,1,1,1,0,0,0.60976,0.922014,1,0,0.357654
2,1,1,0,0,1,0.457624,0.922014,1,2,0.181853
3,1,1,0,1,0,0.594936,0.922014,1,2,0.305712
4,1,0,0,0,0,0.631977,0.922014,1,2,0.353918


In [40]:
# Building the model
from sklearn.model_selection import train_test_split

# Hold out 25% of the labelled rows for evaluation. Stratifying on the target
# keeps the class proportions alike in both parts; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    train_df,
    train_y,
    test_size=0.25,
    stratify=train_y,
    random_state=212,
)

In [41]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier, using the estimator's default settings,
# on the training split.
log = LogisticRegression()
log.fit(X_train, y_train)

In [42]:
y_pred = log.predict(X_test)

In [43]:
from sklearn.metrics import  accuracy_score

# Share of held-out rows whose predicted label equals the true label.
acc = accuracy_score(y_test, y_pred)
# Report through the notebook logger so the value is timestamped in the output.
logger.info(f"Accuracy score of the model: {acc}")

2024-03-24 08:10:20,948 - Dev-Notebook - INFO - Accuracy score of the model: 0.8181818181818182


In [44]:
## serialization && deserialization
import joblib

# serialization: write the fitted model to a file in the working directory
# NOTE(review): the file name spells "classification" as "classifiaction"; the load
# cell uses the same spelling, so rename both together if this is corrected.
joblib.dump(log, "loan_classifiaction_model.pkl")

['loan_classifiaction_model.pkl']

In [45]:
# deserialization: read the model back from the file written by joblib.dump
# NOTE: joblib.load unpickles the file, which can execute arbitrary code — only
# load files from a trusted source.
final_model = joblib.load("loan_classifiaction_model.pkl")

In [46]:
final_model.intercept_, final_model.coef_

(array([-1.83952422]),
 array([[ 0.0048297 ,  0.43875344,  0.01821385, -0.28853974,  0.09556142,
         -0.17668837, -0.30926559,  3.13392751,  0.17083047, -0.12862218]]))

In [47]:
log.intercept_, log.coef_

(array([-1.83952422]),
 array([[ 0.0048297 ,  0.43875344,  0.01821385, -0.28853974,  0.09556142,
         -0.17668837, -0.30926559,  3.13392751,  0.17083047, -0.12862218]]))

In [48]:
from mypackage import f1

In [49]:
# NOTE(review): the log record produced here reads "None" while "Output 1" appears
# separately, so print_smthng() presumably prints and returns nothing — consider
# calling it directly instead of logging its return value.
logger.info(f1.print_smthng())

2024-03-24 08:10:43,794 - Dev-Notebook - INFO - None


Output 1


In [50]:
import mypackage

In [51]:
mypackage.__file__

'C:\\Users\\hp\\Desktop\\MLOps\\mypackage\\__init__.py'

In [56]:
import pathlib
pathlib.Path(mypackage.__file__).resolve().parent

WindowsPath('C:/Users/hp/Desktop/MLOps/mypackage')

In [57]:
X_train.columns

Index(['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
       'LoanAmount', 'Loan_Amount_Term', 'Credit_History', 'Property_Area',
       'Applicant-Income'],
      dtype='object')

In [58]:
num_cols

['Applicant-Income', 'LoanAmount', 'Loan_Amount_Term']

In [59]:
categorical_col

['Gender',
 'Married',
 'Dependents',
 'Education',
 'Self_Employed',
 'Property_Area',
 'Credit_History']

## Create Custom Data Transformer

In [60]:
# Key things --> BaseEstimator, TransformerMixin
# implement fit and transform
# accept input with __init__ method

from sklearn.base import TransformerMixin, BaseEstimator

class DemoTransformer(BaseEstimator, TransformerMixin):
    """Minimal pass-through transformer showing the fit/transform skeleton."""

    def __init__(self):
        """Take no configuration; this demo has nothing to set up."""

    def fit(self, X, y=None):
        """Learn nothing from ``X`` and return the transformer itself."""
        return self

    def transform(self, X):
        """Hand ``X`` back unchanged."""
        return X
        

In [74]:
# Numerical- Imputation Mean

class MeanImputer(BaseEstimator, TransformerMixin):
    """Replace missing values in selected columns with the column mean seen in ``fit``.

    Parameters
    ----------
    variables : list of str, str or None, default=None
        Columns to impute. A single column name is accepted as well. ``None``
        selects every numeric column of the frame passed to ``fit``.

    Attributes
    ----------
    mean_dict : dict
        Maps each imputed column name to the mean computed in ``fit``.
    """

    def __init__(self, variables=None):
        # Stored exactly as given; the column list is resolved in fit().
        self.variables = variables

    def fit(self, X, y=None):
        """Compute and store the mean of every selected column of ``X``.

        ``y`` is accepted for pipeline compatibility and is not used.
        Returns ``self``.
        """
        if self.variables is None:
            # Previously the default crashed with "NoneType is not iterable".
            columns = list(X.select_dtypes(include="number").columns)
        elif isinstance(self.variables, str):
            # A bare string would otherwise be iterated character by character.
            columns = [self.variables]
        else:
            columns = list(self.variables)

        self.mean_dict = {col: X[col].mean() for col in columns}
        return self

    def transform(self, X):
        """Return a copy of ``X`` with missing values filled by the stored means.

        The input frame is not modified. Raises ``AttributeError`` if called
        before ``fit`` (``mean_dict`` does not exist yet).
        """
        X = X.copy()
        for col, mean_value in self.mean_dict.items():
            # Assign the filled column back instead of `X[col].fillna(..., inplace=True)`:
            # the chained in-place form acts on a temporary under pandas
            # Copy-on-Write and would leave X unchanged.
            X[col] = X[col].fillna(mean_value)
        return X

In [82]:
# Small reproducible demo frame with a few missing values for trying out MeanImputer.
np.random.seed(123)
# Cast to float before inserting NaN: writing NaN into an integer column relies on
# an implicit int -> float upcast on assignment, which newer pandas deprecates.
df = pd.DataFrame(np.random.randint(0, 100, (10, 2)), columns=["A", "B"]).astype(float)
df.iloc[[1, 2], 0] = np.nan  # column A: rows 1 and 2 missing
df.iloc[[2, 5], 1] = np.nan  # column B: rows 2 and 5 missing
df

Unnamed: 0,A,B
0,66.0,92.0
1,,17.0
2,,
3,86.0,97.0
4,96.0,47.0
5,73.0,
6,46.0,96.0
7,25.0,83.0
8,78.0,36.0
9,96.0,80.0


In [83]:
mean_imputer = MeanImputer(variables=["A","B"])

In [84]:
mean_imputer.fit(df)

In [85]:
mean_imputer.mean_dict

{'A': 70.75, 'B': 68.5}

In [86]:
df.mean()

A    70.75
B    68.50
dtype: float64

In [89]:
mean_imputer.transform(df)

Unnamed: 0,A,B
0,66.0,92.0
1,70.75,17.0
2,70.75,68.5
3,86.0,97.0
4,96.0,47.0
5,73.0,68.5
6,46.0,96.0
7,25.0,83.0
8,78.0,36.0
9,96.0,80.0


In [90]:
pd.__version__

'1.4.3'

In [96]:
import sklearn
sklearn.__version__

'1.1.1'

In [97]:
joblib.__version__

'1.3.2'

In [100]:
logging.__version__

'0.5.1.2'