In [32]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
from scipy import stats
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split,StratifiedKFold,cross_val_score
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Ridge,Lasso
from sklearn.tree import DecisionTreeRegressor,DecisionTreeClassifier
from sklearn.svm import LinearSVR,SVC
from sklearn.neighbors import KNeighborsRegressor,KNeighborsClassifier
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor,GradientBoostingRegressor,ExtraTreesRegressor
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier,ExtraTreesClassifier
from sklearn import metrics
from lightgbm import LGBMRegressor,LGBMClassifier
import xgboost as xgb
from catboost import CatBoostRegressor,CatBoostClassifier
from sklearn.preprocessing import LabelEncoder
from tabulate import tabulate
from sklearn.experimental import enable_iterative_imputer 
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import OrdinalEncoder

In [33]:
def mice_imputation_numeric(train_numeric):
    """
    Fill missing numeric values with MICE (sklearn ``IterativeImputer``).

    A LightGBM regressor is used as the per-feature estimator; any other
    regressor could be plugged in instead. The imputed array is wrapped back
    into a DataFrame that carries the input's column labels and index.
    """
    imputer = IterativeImputer(LGBMRegressor())
    filled_values = imputer.fit_transform(train_numeric)
    return pd.DataFrame(
        filled_values,
        index=train_numeric.index,
        columns=train_numeric.columns,
    )

def mice_imputation_categoric(train_categoric):
    """
    Impute categoric data using MICE imputation with a LightGBM classifier.

    Steps:
    1. Ordinal-encode the non-null values of every column (nulls stay NaN).
    2. Run MICE (IterativeImputer + LGBMClassifier) on the encoded frame.
       (any other classifier could be used to impute the data)
    3. Map the imputed integer codes back to the original category labels.

    The input frame is NOT modified: the codes are written to a separate
    float frame instead of being assigned back through chained indexing.

    Returns a DataFrame with the same columns and index as ``train_categoric``
    holding the original labels plus imputed labels where values were missing.
    """
    ordinal_dict = {}

    # Float frame pre-filled with NaN: NaN marks the cells still to be imputed.
    encoded = pd.DataFrame(
        np.nan,
        index=train_categoric.index,
        columns=train_categoric.columns,
        dtype=float,
    )

    for col in train_categoric.columns:
        # Positional boolean mask avoids any index alignment surprises.
        not_null = train_categoric[col].notnull().to_numpy()
        ordinal_dict[col] = OrdinalEncoder()
        nn_vals = train_categoric[col].to_numpy()[not_null].reshape(-1, 1)
        nn_codes = np.asarray(ordinal_dict[col].fit_transform(nn_vals)).reshape(-1)
        # Single .loc call (row mask + column label) instead of chained
        # `frame[col].loc[mask] = ...`, which may write to a temporary copy.
        encoded.loc[not_null, col] = nn_codes

    # Impute the encoded data using MICE with LightGBM.
    iter_imp_categoric = IterativeImputer(LGBMClassifier(), max_iter=5, initial_strategy='most_frequent')
    imputed_train = iter_imp_categoric.fit_transform(encoded)
    train_categoric_imp = pd.DataFrame(
        imputed_train,
        columns=train_categoric.columns,
        index=train_categoric.index,
    ).astype(int)

    # Inverse transform: integer codes -> original category labels.
    for col in train_categoric_imp.columns:
        codes = train_categoric_imp[col].to_numpy().reshape(-1, 1)
        train_categoric_imp[col] = ordinal_dict[col].inverse_transform(codes).reshape(-1)

    return train_categoric_imp

In [34]:
def dep_encoding(cdata):
    """
    Label-encode every column of ``cdata`` and return the same frame.

    One LabelEncoder instance is re-fitted on each column's unique values,
    and the column is overwritten with the resulting codes. The frame passed
    in is modified in place.
    """
    encoder = LabelEncoder()
    for column in cdata.columns:
        distinct_values = cdata[column].unique().tolist()
        cdata[column] = encoder.fit(distinct_values).transform(cdata[column])
    return cdata

In [35]:
def imputemissing(impdata):
    """
    Fill missing values column-wise and label-encode the categoric columns.

    * object columns: NaN is replaced by the column mode, then the columns
      are label-encoded with ``dep_encoding``.
    * int64 / float64 columns: NaN is replaced by the column median.

    The previous version tested ``impdata.dtypes.name``, which is ``None``
    for a DataFrame, so neither branch ever ran. Columns are now selected
    per dtype instead.

    Parameters
    ----------
    impdata : pandas.DataFrame
        Feature frame, possibly containing missing values.

    Returns
    -------
    pandas.DataFrame
        An imputed copy; ``impdata`` itself is left unchanged.
    """
    imputed = impdata.copy()

    # Resolve both column groups up front: once the categoric columns are
    # encoded they become integer columns too.
    categoric_cols = imputed.select_dtypes('object').columns
    numeric_cols = imputed.select_dtypes(['float64', 'int64']).columns

    if len(categoric_cols) > 0:
        modes = imputed[categoric_cols].mode()
        if not modes.empty:
            # Row 0 of mode() holds the most frequent value of each column.
            imputed[categoric_cols] = imputed[categoric_cols].fillna(modes.iloc[0])
        imputed[categoric_cols] = dep_encoding(imputed[categoric_cols])

    if len(numeric_cols) > 0:
        medians = imputed[numeric_cols].median()
        imputed[numeric_cols] = imputed[numeric_cols].fillna(medians)

    return imputed

In [36]:
def inde_encoding(idata):
    """
    Label-encode a single series.

    A LabelEncoder is fitted on the unique values of ``idata`` and the
    encoded values for every element of ``idata`` are returned.
    """
    classes = idata.unique().tolist()
    return LabelEncoder().fit(classes).transform(idata)

In [37]:
def scale(x1):
    """
    Standardise every column of ``x1`` with a StandardScaler.

    The scaler is fitted on the frame's own values, the scaled values are
    written back into ``x1`` (the frame is modified in place), and the same
    frame is returned.
    """
    columns = x1.columns
    raw_values = x1[columns].values
    x1[columns] = StandardScaler().fit(raw_values).transform(raw_values)
    return x1

In [38]:
## For normal and balanced dataset
def normalsplit(x,y):
    """
    Plain 80/20 train/test split with a fixed seed (random_state=42).

    Returns the tuple (x_train, x_test, y_train, y_test).
    """
    parts = train_test_split(x, y, test_size=0.20, random_state=42)
    return tuple(parts)

In [39]:
## For imbalance dataset
def imbsplit(x,y):
    """
    Shuffled 80/20 train/test split, stratified on ``y``, with a fixed seed
    (random_state=42).

    Returns the tuple (x_train, x_test, y_train, y_test).
    """
    parts = train_test_split(
        x,
        y,
        test_size=0.20,
        shuffle=True,
        stratify=y,
        random_state=42,
    )
    return tuple(parts)

In [40]:
def cal_reg_metrics(y_test, y_pred):
    """
    Score one set of regression predictions and record the results.

    Computes R2, MSE, MAE and RMSE (square root of the MSE) and appends each
    value to the matching module-level list: R2_score, MSE, MAE, RMSE.
    Nothing is returned.
    """
    r2_value = metrics.r2_score(y_true=y_test, y_pred=y_pred)
    mse_value = metrics.mean_squared_error(y_true=y_test, y_pred=y_pred)
    mae_value = metrics.mean_absolute_error(y_true=y_test, y_pred=y_pred)

    recorded = (
        (R2_score, r2_value),
        (MSE, mse_value),
        (MAE, mae_value),
        (RMSE, np.sqrt(mse_value)),
    )
    for store, value in recorded:
        store.append(value)

In [41]:
def cal_class_metrics(y_test,y_pred):
    """
    Score one set of classification predictions and record the results.

    Computes accuracy, precision, recall, F1, ROC-AUC and log-loss from the
    true and predicted labels and appends each value to the matching
    module-level list: accuracy, precision, recall, f1_scor, roc_auc_scor,
    logLoss. Nothing is returned.

    NOTE(review): precision/recall/F1 are called with sklearn's default
    averaging, and ROC-AUC / log-loss receive ``y_pred`` as passed in by the
    caller (the visible caller passes ``model.predict`` output) — confirm
    this is intended for multiclass targets and probability-based metrics.
    """
    labels = dict(y_true=y_test, y_pred=y_pred)

    acc_value = metrics.accuracy_score(**labels)
    pre_value = metrics.precision_score(**labels)
    rec_value = metrics.recall_score(**labels)
    f1_value = metrics.f1_score(**labels)
    auc_value = metrics.roc_auc_score(y_test, y_pred)
    loss_value = metrics.log_loss(**labels)

    recorded = (
        (accuracy, acc_value),
        (precision, pre_value),
        (recall, rec_value),
        (f1_scor, f1_value),
        (roc_auc_scor, auc_value),
        (logLoss, loss_value),
    )
    for store, value in recorded:
        store.append(value)

In [42]:
def calculate_kfold(model,x,y):
    """
    Cross-validate ``model`` on (x, y) with cv=20 and append the mean of the
    fold scores to the module-level ``cvs_mean`` list. Returns nothing.
    """
    cvs_mean.append(cross_val_score(model, x, y, cv=20).mean())

In [43]:
def feature_importance(model, X_train):
    """
    Print and bar-plot the feature importances of a fitted model.

    Parameters
    ----------
    model : fitted estimator exposing ``feature_importances_``
    X_train : pandas.DataFrame
        Training features; its column names label the bars.

    The previous version read the column names from an undefined global
    ``x_train`` instead of the ``X_train`` argument.
    """
    fI = model.feature_importances_
    print(fI)

    # Take the labels from the argument, not from a global.
    names = X_train.columns.values
    ticks = list(range(len(names)))

    plt.bar(ticks, fI)
    plt.xticks(ticks, names, rotation=90)
    plt.show()

In [45]:
def reg_model(x,y):
    """
    Fit a panel of regressors on an 80/20 split and print a comparison table.

    For every model: fit on the training split, predict the test split,
    record R2 / MSE / MAE / RMSE via ``cal_reg_metrics`` and the mean
    cross-validation score via ``calculate_kfold`` (run on the full x, y).

    The metric helpers append to module-level lists that are never reset,
    so only the entries produced by THIS call (the last ``len(Model)`` items
    of each list) go into the table. Using the full lists broke the
    DataFrame construction on a second call, or after ``cla_model`` had
    already appended to the shared ``cvs_mean`` list.

    KNeighborsRegressor was constructed but never evaluated; it is now part
    of the panel, mirroring KNeighborsClassifier in ``cla_model``.
    """
    x_train,x_test,y_train,y_test=normalsplit(x,y)

    models=[
        ("LinearRegression",LinearRegression()),
        ("Lasso",Lasso()),
        ("Ridge",Ridge()),
        ("DecisionTreeRegressor",DecisionTreeRegressor()),
        ("KNeighborsRegressor",KNeighborsRegressor()),
        ("LinearSVR",LinearSVR()),
        ("ExtraTreeRegressor",ExtraTreesRegressor()),
        ("AdaBoostRegressor",AdaBoostRegressor()),
        ("RandomForestRegressor",RandomForestRegressor()),
        ("GradientBoostingRegressor",GradientBoostingRegressor()),
        ('LGBMRegressor',LGBMRegressor()),
        ('XGBRegressor',xgb.XGBRegressor()),
        ('CatBoostRegressor',CatBoostRegressor(silent=True)),
    ]

    Model = []
    for name,model in models:
        Model.append(name)
        model.fit(x_train,y_train)
        pre=model.predict(x_test)
        cal_reg_metrics(y_test, pre)
        calculate_kfold(model,x,y)

    # Only the values appended during this call belong in the table.
    n = len(Model)
    results = pd.DataFrame(data = {'R2':R2_score[-n:], 'MSE': MSE[-n:] ,'MAE': MAE[-n:],
                                   'kfold_mean': cvs_mean[-n:], "RMSE":RMSE[-n:]},
                           index = Model )
    print(tabulate(results, headers =results.columns, tablefmt = 'fancy_grid'))

In [46]:
def cla_model(x,y):
    """
    Fit a panel of classifiers on a stratified 80/20 split and print a
    comparison table.

    For every model: fit on the training split, predict the test split,
    record accuracy / precision / recall / F1 / ROC-AUC / log-loss via
    ``cal_class_metrics`` and the mean cross-validation score via
    ``calculate_kfold`` (run on the full x, y).

    Fixes:
    * ``calculate_kfold`` returns nothing, so ``cvs = calculate_kfold(...)``
      left the 'kfold_mean' column empty; the column now comes from the
      ``cvs_mean`` list that the helper actually fills.
    * The metric helpers append to module-level lists that are never reset
      (``cvs_mean`` is also shared with ``reg_model``), so only the last
      ``len(Model)`` entries of each list — those from this call — are used.
    """
    x_train,x_test,y_train,y_test=imbsplit(x,y)

    models = [
        ('KNeighborsClassifier', KNeighborsClassifier()),
        ('SVC', SVC()),
        ('LogisticRegression', LogisticRegression()),
        ('GaussianNB', GaussianNB()),
        ('RandomForestClassifier', RandomForestClassifier()),
        ('DecisionTreeClassifier', DecisionTreeClassifier()),
        ('ExtraTreesClassifier', ExtraTreesClassifier()),
        ('GradientBoostingClassifier', GradientBoostingClassifier()),
        ('AdaBoostClassifier', AdaBoostClassifier()),
        ('LGBMClassifier', LGBMClassifier()),
        ('XGBClassifier', xgb.XGBClassifier()),
        ('CatboostClassifier', CatBoostClassifier(silent=True)),
    ]

    Model = []
    for name,model in models:
        Model.append(name)
        model.fit(x_train,y_train)
        pre=model.predict(x_test)
        cal_class_metrics(y_test, pre)
        calculate_kfold(model,x,y)

    # Only the values appended during this call belong in the table.
    n = len(Model)
    results = pd.DataFrame(data = {'accuracy':accuracy[-n:], 'precision': precision[-n:],
                                   'recall': recall[-n:],'f1_score':f1_scor[-n:],
                                   'kfold_mean': cvs_mean[-n:], "Roc_auc_scor":roc_auc_scor[-n:],
                                   "Log_Loss":logLoss[-n:]},
                           index = Model )
    print(tabulate(results, headers = results.columns, tablefmt = 'fancy_grid'))

In [47]:
def percentMissingFeature(data):
    """
    Print a table with the percentage of missing values per column.

    Columns whose missing percentage equals 0 are dropped; the remaining
    ones are listed in descending order of missing percentage.
    """
    missing_pct = data.isnull().sum() / len(data) * 100
    fully_present = missing_pct[missing_pct == 0].index
    ranked = missing_pct.drop(fully_present).sort_values(ascending=False)
    table = pd.DataFrame({'Missing Ratio': ranked})
    print(tabulate(table, headers=['features', 'Missing%'], tablefmt='fancy_grid'))

In [48]:
def dataflow1(df,dependent):
    """
    End-to-end pipeline: report missing data, impute, encode, scale, then
    benchmark regressors or classifiers depending on the target column.

    Parameters
    ----------
    df : pandas.DataFrame
        Full dataset including the target column.
    dependent : str
        Name of the target (dependent) column in ``df``.

    Model family selection:
    * int64 target containing any value >= 2 -> regression (``reg_model``)
    * any other int64 target                 -> classification (``cla_model``)
    * object target                          -> label-encode, then classification
    Targets of any other dtype are not modelled (unchanged behaviour).

    Fixes:
    * the categoric-column lookup used an undefined name ``data1``;
    * the old ``>= 2`` / ``<= 2`` checks were not mutually exclusive, so an
      int target such as a count column ran BOTH model families.
    """
    percentMissingFeature(df)
    data=df.drop([dependent],axis=1)
    y=df[dependent]
    data=imputemissing(data)
    if(np.any(data.dtypes=='object')):
        categoric_cols = data.select_dtypes('object').columns
        data[categoric_cols] =dep_encoding(data[categoric_cols])
    x=scale(data)
    if(y.dtype.name == 'int64'):
        # Exactly one model family per target.
        if(np.any(y.values>=2)):
            reg_model(x,y)
        else:
            cla_model(x,y)
    elif(y.dtype.name == 'object'):
        y=inde_encoding(y)
        cla_model(x,y)

In [50]:
# Module-level accumulators. cal_class_metrics appends to the classification
# lists, cal_reg_metrics to the regression lists, calculate_kfold to cvs_mean.
# `cvs` and `REMSE` are not appended to by any function in the visible cells.
accuracy, precision, recall, f1_scor = [], [], [], []
roc_auc_scor, logLoss = [], []
cvs_mean = []
R2_score = []
cvs = []
MSE, MAE, RMSE, REMSE = [], [], [], []

# Load the dataset, ask for the target column, and run the full pipeline.
df = pd.read_csv('train_meta_df.csv')
dep = input("Enter the dependent variable")
dataflow1(df, dep)

Enter the dependent variableviews
╒════════════╤════════════╕
│ features   │ Missing%   │
╞════════════╪════════════╡
╘════════════╧════════════╛
╒═══════════════════════════╤════════════╤══════════════════╤═════════╤══════════════╤══════════╕
│                           │         R2 │              MSE │     MAE │   kfold_mean │     RMSE │
╞═══════════════════════════╪════════════╪══════════════════╪═════════╪══════════════╪══════════╡
│ LinearRegression          │  0.0739078 │      1.01541e+06 │ 573.942 │   -0.0248501 │ 1007.68  │
├───────────────────────────┼────────────┼──────────────────┼─────────┼──────────────┼──────────┤
│ Lasso                     │  0.0742934 │      1.01499e+06 │ 573.899 │   -0.0238987 │ 1007.47  │
├───────────────────────────┼────────────┼──────────────────┼─────────┼──────────────┼──────────┤
│ Ridge                     │  0.0739168 │      1.0154e+06  │ 573.95  │   -0.0247852 │ 1007.67  │
├───────────────────────────┼────────────┼──────────────────┼─────────

In [None]:
# NOTE(review): in the visible cells `data` is only assigned as a local
# variable inside dataflow1, so this unexecuted scratch cell presumably
# raises NameError on a fresh kernel — confirm, then remove or rewrite.
data.dtypes

In [None]:
# NOTE(review): in the visible cells `y` is only assigned as a local
# variable inside dataflow1, so this unexecuted scratch cell presumably
# raises NameError on a fresh kernel — confirm, then remove or rewrite.
xy=y.values

In [None]:
# Displays `xy`; depends on the scratch cell that assigns `xy` having run.
xy