## Logging in Python

In [1]:
#import libraries
import numpy as np  
import pandas as pd
from sklearn import preprocessing
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler,LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import warnings  # To ignore any warnings
warnings.filterwarnings("ignore")
# classifiers
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from xgboost import XGBClassifier
from imblearn.ensemble import (
    BalancedBaggingClassifier,
    EasyEnsembleClassifier,
)
import os
import logging

In [2]:
# Name of the log file that the logging-setup cell writes to.
logging_filename='loan_data.log'

# Read the raw loan data.
data = pd.read_csv("loans_data.csv")

# Per-column null counts. Kept as the LAST expression of the cell: a bare expression
# anywhere else in a cell is evaluated and silently discarded instead of displayed.
data.isnull().sum()

### Data Pre-processing

In [3]:
def _is_text_column(column):
    """Return True for object-dtype columns and for pandas' dedicated string dtypes."""
    dtype = column.dtype
    return pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype)


def _scale_features(features, scaler, columns_to_scale):
    """Apply an already-fitted scaler; scaled columns come first, index is reset to 0..n-1."""
    to_scale = features[columns_to_scale]
    scaled = pd.DataFrame(scaler.transform(to_scale), columns=to_scale.columns)
    untouched = features.drop(columns_to_scale, axis=1).reset_index(drop=True)
    return pd.concat([scaled, untouched], axis=1)


def _label_partition(features, target, label):
    """Append the 'dataset' flag and the target column to one partition (fresh 0..n-1 index)."""
    part = features.reset_index(drop=True)
    part['dataset'] = label
    return pd.concat([part, target.reset_index(drop=True)], axis=1)


def preprocess(data,DV,scale,columns_to_scale,drop,binary_cols=None,test_ratio=0.30,random_state=11):
    """
    Run the whole pre-processing pipeline and return one frame holding Train and Test rows.

    Steps, in order:
      1) drop the unwanted / id columns named in `drop`
      2) replace '3+' with 3 in the 'Dependents' column (only if that column exists)
      3) fill nulls: mode for text (object/string) columns, median for all other columns
      4) map Y/N or Yes/No to 1/0 in every column listed in `binary_cols`
      5) split into train and test using `test_ratio`
      6) if `scale` is True, z-score `columns_to_scale` with a scaler fitted on the
         train partition only and applied to both partitions
      7) create dummies (drop_first=True) for the remaining categorical columns

    Parameters
    ----------
    data : DataFrame with the raw data. It is NOT modified; the function works on a copy,
        so the calling cell can be re-run safely.
    DV : name of the response variable column.
    scale : True/False, whether to standardise `columns_to_scale`.
    columns_to_scale : list of column names to standardise (used only when `scale` is True).
    drop : column name or list of column names to remove; [] or None removes nothing.
    binary_cols : list of binary Y/N or Yes/No columns to recode as 1/0 (default: none).
    test_ratio : fraction of rows assigned to the test partition.
    random_state : seed passed to train_test_split (default 11).

    Returns
    -------
    DataFrame with a 0..n-1 index: Train rows first, then Test rows, with a final
    'dataset' column holding 'Train' or 'Test'. When `scale` is True the scaled
    columns are placed first.
    """
    binary_cols = [] if binary_cols is None else list(binary_cols)

    # Work on a private copy: the caller's frame must survive unchanged.
    data = data.copy()
    if drop is not None:
        data = data.drop(drop, axis=1)

    # Ad hoc requirement of this particular data set: '3+' dependents is recoded as 3.
    # Cast to object first so the mixed str/int result can be stored whatever string
    # dtype the column was read with.
    if 'Dependents' in data.columns and _is_text_column(data['Dependents']):
        data['Dependents'] = data['Dependents'].astype(object).replace('3+', 3)

    # NOTE(review): the fill values below are computed on the full frame, before the
    # split, so Test rows influence how Train rows are imputed. Move this after the
    # split if strict train/test separation is required.
    for col in data.columns:
        column = data[col]
        if not column.isnull().any():
            continue
        if _is_text_column(column):
            modes = column.mode()
            if modes.empty:
                # Column is entirely null: there is no mode to impute with.
                continue
            fill_value = modes.iloc[0]
        else:
            fill_value = column.median()
        data[col] = column.fillna(fill_value)

    # Recode binary columns. One combined mapping covers both spellings and does not
    # depend on a row labelled 0 existing; any other value becomes NaN.
    yes_no = {'Y': 1, 'N': 0, 'Yes': 1, 'No': 0}
    for col in binary_cols:
        data[col] = data[col].map(yes_no)

    # Divide data into train and test.
    trainX, testX, trainY, testY = train_test_split(
        data.drop(DV, axis=1), data[DV], test_size=test_ratio, random_state=random_state
    )

    # Scale: fit on train only, then apply the same transformation to both partitions.
    if scale:
        scaler = preprocessing.StandardScaler().fit(trainX[columns_to_scale])
        trainX = _scale_features(trainX, scaler, columns_to_scale)
        testX = _scale_features(testX, scaler, columns_to_scale)

    # Stack the partitions so dummy encoding yields identical columns for both.
    data_full = pd.concat(
        [_label_partition(trainX, trainY, 'Train'), _label_partition(testX, testY, 'Test')],
        axis=0,
        ignore_index=True,
    )

    # Create dummies; the 'dataset' flag is held out so it is not encoded, then re-attached.
    data_final = pd.get_dummies(data_full.drop('dataset', axis=1), drop_first=True)
    data_final['dataset'] = data_full['dataset']
    return data_final

In [4]:
columns_to_scale=['ApplicantIncome','LoanAmount','CoapplicantIncome','Loan_Amount_Term']
binary_cols=['Loan_Status','Self_Employed','Married']

# Pass a copy of the raw frame so this cell can be re-run: a second run against a frame
# that has already lost 'Loan_ID' would fail with a KeyError.
# NOTE(review): test_ratio=0.90 sends 90% of the rows to Test and leaves only 10% for
# training - confirm this is intended.
df=preprocess(
    data.copy(),
    DV='Loan_Status',
    scale=True,
    columns_to_scale=columns_to_scale,
    drop='Loan_ID',
    binary_cols=binary_cols,
    test_ratio=0.90,
)

In [5]:
df.isna().sum()  # every count is expected to be zero

ApplicantIncome            0
LoanAmount                 0
CoapplicantIncome          0
Loan_Amount_Term           0
Married                    0
Self_Employed              0
Credit_History             0
Loan_Status                0
Gender_Male                0
Dependents_0               0
Dependents_1               0
Dependents_2               0
Education_Not Graduate     0
Property_Area_Semiurban    0
Property_Area_Urban        0
dataset                    0
dtype: int64

In [6]:
# Summary statistics of the processed frame. The scaled columns were standardised with a
# scaler fitted on the Train rows only, so over Train+Test together their mean/std need
# not be exactly 0 and 1.
df.describe()

Unnamed: 0,ApplicantIncome,LoanAmount,CoapplicantIncome,Loan_Amount_Term,Married,Self_Employed,Credit_History,Loan_Status,Gender_Male,Dependents_0,Dependents_1,Dependents_2,Education_Not Graduate,Property_Area_Semiurban,Property_Area_Urban
count,614.0,614.0,614.0,614.0,614.0,614.0,614.0,614.0,614.0,614.0,614.0,614.0,614.0,614.0,614.0
mean,0.023573,0.229472,0.168209,0.166727,0.653094,0.13355,0.855049,0.688925,0.81759,0.586319,0.166124,0.164495,0.218241,0.379479,0.32899
std,1.185967,1.278444,1.859176,0.773593,0.476373,0.340446,0.352339,0.463311,0.386497,0.492894,0.372495,0.371027,0.413389,0.485653,0.470229
min,-0.996298,-1.849187,-0.861841,-3.800502,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,-0.466799,-0.462172,-0.861841,0.377924,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,-0.285285,-0.040368,-0.106734,0.377924,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
75%,0.099584,0.518238,0.597704,0.377924,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0
max,14.699366,8.654127,25.611055,1.818761,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


### Logging file creation

In [7]:
# Create a log file to record important information.
# Records at INFO level and above go to `logging_filename`; filemode='w' starts the file
# afresh on every configuration.
# force=True first removes any handlers already attached to the root logger. Without it
# basicConfig() silently does nothing when the root logger is already configured, for
# example when this cell is run a second time.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s: %(message)s',
    filename=logging_filename,
    filemode='w',
    force=True,
)

# To find the location of the log file: logging.getLogger().handlers[0].baseFilename
logging.info('Logging file created')

In [8]:
# Split the processed frame back into its Train/Test partitions using the 'dataset' flag.
# The feature matrices exclude both the bookkeeping flag and the target: leaving
# 'Loan_Status' inside X would hand every model the answer (target leakage) and make
# the logged accuracies meaningless.
non_feature_cols=['dataset','Loan_Status']
train_rows=df[df['dataset']=='Train']
test_rows=df[df['dataset']=='Test']

train_X=train_rows.drop(non_feature_cols,axis=1)
train_Y=train_rows['Loan_Status']
test_X=test_rows.drop(non_feature_cols,axis=1)
test_Y=test_rows['Loan_Status']

### Modelling

In [9]:
# Candidate classifiers, keyed by the short name written to the log.
# The randomised estimators get a fixed seed (11, the same value used for the train/test
# split) so that the logged scores are identical from one run to the next.
models={
    "KNN": KNeighborsClassifier(),
    "RF" : RandomForestClassifier(random_state=11),
    "XGB": XGBClassifier(random_state=11),
    "LogR":LogisticRegression(),
    "GB" : GradientBoostingClassifier(random_state=11)
}

In [10]:
# For each candidate: log its name, its mean cross-validated accuracy on the training
# partition, then refit on the full training partition and log the hold-out accuracy.
for model_name in models:
    model=models[model_name]
    logging.info("Model {}".format(model_name))

    # cross-validation works on clones, so `model` itself is still unfitted afterwards
    scores=cross_val_score(estimator=model,X=train_X,y=train_Y,scoring='accuracy')
    mean_cv_accuracy=scores.mean()
    logging.info("Mean score on TRAIN data is {}".format(mean_cv_accuracy))

    model.fit(train_X,train_Y)
    test_predictions=model.predict(test_X)
    test_accuracy_score=accuracy_score(y_true=test_Y,y_pred=test_predictions)
    logging.info("Accuracy Score on TEST data is {}".format(test_accuracy_score))

    # visual separator between models in the log file
    logging.info("---------------------------------------------------")

In [5]:
# Load the iris data set bundled with scikit-learn.
# The returned object exposes the feature matrix as `iris.data` and the labels as
# `iris.target`; no re-assignment is needed before using them.
from sklearn import datasets
import pandas as pd
iris=datasets.load_iris()

In [6]:
# Show the iris feature matrix as a table; no column names are passed, so the
# columns are labelled by position (0-3).
pd.DataFrame(iris.data)

Unnamed: 0,0,1,2,3
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
5,5.4,3.9,1.7,0.4
6,4.6,3.4,1.4,0.3
7,5.0,3.4,1.5,0.2
8,4.4,2.9,1.4,0.2
9,4.9,3.1,1.5,0.1
