In [4]:
!pip install optuna
!pip install pandas_summary
!pip install scikit-learn==0.22
!pip install catboost
!pip install xgboost

Collecting pandas_summary
  Downloading https://files.pythonhosted.org/packages/f5/20/865d4785f86e86f7ad7ebd52fc14810ef59ef13cd959f1363a2615d1c665/pandas_summary-0.0.7-py2.py3-none-any.whl
Installing collected packages: pandas-summary
Successfully installed pandas-summary-0.0.7
Collecting scikit-learn==0.22
[?25l  Downloading https://files.pythonhosted.org/packages/2e/d0/860c4f6a7027e00acff373d9f5327f4ae3ed5872234b3cbdd7bcb52e5eff/scikit_learn-0.22-cp36-cp36m-manylinux1_x86_64.whl (7.0MB)
[K     |████████████████████████████████| 7.0MB 8.3MB/s 
Installing collected packages: scikit-learn
  Found existing installation: scikit-learn 0.22.1
    Uninstalling scikit-learn-0.22.1:
      Successfully uninstalled scikit-learn-0.22.1
Successfully installed scikit-learn-0.22
Collecting catboost
[?25l  Downloading https://files.pythonhosted.org/packages/97/c4/586923de4634f88a31fd1b4966e15707a912b98b6f4566651b5ef58f36b5/catboost-0.20.2-cp36-none-manylinux1_x86_64.whl (63.9MB)
[K     |█████████

In [0]:
from pandas_summary import DataFrameSummary
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

from sklearn.model_selection import GridSearchCV
from IPython.display import display

from xgboost import XGBClassifier
from catboost import CatBoostClassifier

import pandas as pd
from pandas import *

import sklearn
from sklearn import metrics
from sklearn_pandas.dataframe_mapper import DataFrameMapper

from pandas.core.dtypes.common import is_numeric_dtype , is_string_dtype
from sklearn.preprocessing import StandardScaler

import warnings
from sklearn.model_selection import cross_val_score

In [0]:
def reduce_mem_usage(df):
    #start_mem = df.memory_usage().sum() / 1024**2
    #print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    for col in df.columns:
        col_type = df[col].dtype
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    #end_mem = df.memory_usage().sum() / 1024**2
    #print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    #print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df

In [0]:
def display_all(df):
    with pd.option_context("display.max_rows", 1000, "display.max_columns", 1000): 
        display(df)

def train_cats(df):
    for n,c in df.items():
        if is_string_dtype(c): df[n] = c.astype('category').cat.as_ordered()

# year , month , day , weekday , hour , minute , second 

def fill_timestamp(df , name):
    df['year_'] = df.apply(lambda row : row[name].year , axis = 1)
    print(" Processed Year ")
    df['month_'] = df.apply(lambda row : row[name].month , axis = 1)
    print(" Processed Month ")
    df['day_'] = df.apply(lambda row : row[name].day , axis = 1)
    print(" Processed Day ")
    df['weekday_'] = df.apply(lambda row : row[name].weekday() , axis = 1)
    print(" Processed Weekday ")
    df['hour_'] = df.apply(lambda row : row[name].hour , axis = 1)
    print(" Processed Hour ")
    df['minute_'] = df.apply(lambda row : row[name].minute , axis = 1)
    print(" Processed Minute ")
    df['second_'] = df.apply(lambda row : row[name].second , axis = 1)
    print(" Processed Second ")
    df = df.drop([name] , axis = 1)
    return df

def is_timestamp(col):
    if type(col.iloc[0]) == pandas._libs.tslibs.timestamps.Timestamp:
        return True
    else :
        return False

def numericalize(df, col, name, max_n_cat):
    if is_timestamp(col):
        df = fill_timestamp(df , name)
    else :
        if not is_numeric_dtype(col) and (max_n_cat is None or col.nunique()>max_n_cat):
            df[name] = col.cat.codes+1

def fix_missing(df, col, name, na_dict):
    if is_numeric_dtype(col):
        if pd.isnull(col).sum() or (name in na_dict):
            df[name+'_na'] = pd.isnull(col)
            filler = na_dict[name] if name in na_dict else col.median()
            df[name] = col.fillna(filler)
            na_dict[name] = filler
    return na_dict

def get_sample(df,n):
    idxs = sorted(np.random.permutation(len(df))[:n])
    return df.iloc[idxs].copy()

def scale_vars(df, mapper):
    warnings.filterwarnings('ignore', category=sklearn.exceptions.DataConversionWarning)
    if mapper is None:
        map_f = [([n],StandardScaler()) for n in df.columns if is_numeric_dtype(df[n])]
        mapper = DataFrameMapper(map_f).fit(df)
    df[mapper.transformed_names_] = mapper.transform(df)
    return mapper

def proc_df(df ,y_fld=None,skip_flds=None,ignore_flds=None,do_scale=False,na_dict=None,preproc_fn=None,max_n_cat=None,subset=None,mapper=None):
    if not ignore_flds: ignore_flds=[]
    if not skip_flds: skip_flds=[]
    if subset: df = get_sample(df,subset)
    ignored_flds = df.loc[:, ignore_flds]
    df.drop(ignore_flds, axis=1, inplace=True)
    df = df.copy()
    if preproc_fn: preproc_fn(df)
    if y_fld is None: y = None
    else:
        if not is_numeric_dtype(df[y_fld]): df[y_fld] = df[y_fld].cat.codes
        y = df[y_fld].values
        skip_flds += [y_fld]
    df.drop(skip_flds, axis=1, inplace=True)
    if na_dict is None: na_dict = {}
    for n,c in df.items(): na_dict = fix_missing(df, c, n, na_dict)
    if do_scale: mapper = scale_vars(df, mapper)
    for n,c in df.items(): numericalize(df, c, n, max_n_cat)
    #df = pd.get_dummies(df, dummy_na=True)
    #df = pd.concat([ignored_flds, df], axis=1)
    res = [df, y, na_dict]
    if do_scale: res = res + [mapper]
    return res

In [19]:
df_raw = pd.read_csv("train.csv")

df_raw.head()

train_cats(df_raw)

%time df, y, nas = proc_df(df_raw , "Loan_Status")

CPU times: user 15.1 ms, sys: 1e+03 ns, total: 15.1 ms
Wall time: 15.1 ms


In [34]:
import optuna

def objective(trial):
    df_raw = pd.read_csv("train.csv")
    df_raw = reduce_mem_usage(df_raw)
    train_cats(df_raw)
    df, y, nas = proc_df(df_raw , "Loan_Status")
    
    n_estimators = trial.suggest_int('n_estimators', 10, 50)
    max_depth = int(trial.suggest_loguniform('max_depth', 1, 32))
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 3, 7)
    #max_features = float(trial.suggest_loguniform('max_features', 0, 1))
    
    clf = sklearn.ensemble.RandomForestClassifier(
        n_estimators=n_estimators, max_depth=max_depth,min_samples_leaf=min_samples_leaf,max_features=0.5,oob_score=True)
    
    return sklearn.model_selection.cross_val_score(clf, df, y, n_jobs=-1, cv=5).mean()

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=1000)

trial = study.best_trial

print('Accuracy: {}'.format(trial.value))
print("Best hyperparameters: {}".format(trial.params))

[32m[I 2020-01-20 07:51:56,373][0m Finished trial#0 resulted in value: 0.809462881514061. Current best value is 0.809462881514061 with parameters: {'n_estimators': 46, 'max_depth': 1.669916059454638, 'min_samples_leaf': 7}.[0m
[32m[I 2020-01-20 07:51:56,890][0m Finished trial#1 resulted in value: 0.8061975209916034. Current best value is 0.809462881514061 with parameters: {'n_estimators': 46, 'max_depth': 1.669916059454638, 'min_samples_leaf': 7}.[0m
[32m[I 2020-01-20 07:51:57,398][0m Finished trial#2 resulted in value: 0.809462881514061. Current best value is 0.809462881514061 with parameters: {'n_estimators': 46, 'max_depth': 1.669916059454638, 'min_samples_leaf': 7}.[0m
[32m[I 2020-01-20 07:51:57,973][0m Finished trial#3 resulted in value: 0.809462881514061. Current best value is 0.809462881514061 with parameters: {'n_estimators': 46, 'max_depth': 1.669916059454638, 'min_samples_leaf': 7}.[0m
[32m[I 2020-01-20 07:51:58,439][0m Finished trial#4 resulted in value: 0.8094

Accuracy: 0.8143675862988138
Best hyperparameters: {'n_estimators': 27, 'max_depth': 5.436191627676717, 'min_samples_leaf': 5}


In [22]:
type(trial)

optuna.structs.FrozenTrial

In [0]:
??study.optimize

In [41]:
df_raw = pd.read_csv("train.csv")
df_raw = reduce_mem_usage(df_raw)
df_raw = df_raw.drop(['Loan_ID'] , axis = 1)
train_cats(df_raw)
df, y, nas = proc_df(df_raw , "Loan_Status")
model = sklearn.ensemble.RandomForestClassifier(n_estimators= 27, max_depth=5.436191627676717,min_samples_leaf= 5,max_features=0.5,oob_score=True)
model.fit(df , y)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=5.436191627676717,
                       max_features=0.5, max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=5, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=27,
                       n_jobs=None, oob_score=True, random_state=None,
                       verbose=0, warm_start=False)

In [42]:
model.score(df , y)

0.8127035830618893

**Creating Submit file**

In [0]:
def apply_cats(df, trn):
    for n,c in df.items():
        if (n in trn.columns) and (trn[n].dtype.name=='category'):
            df[n] = c.astype('category').cat.as_ordered()
            df[n].cat.set_categories(trn[n].cat.categories, ordered=True, inplace=True)

In [0]:
import pandas as pd

Test = pd.read_csv("/content/test_lAUu6dG.csv")

Loan_ID = list(Test.Loan_ID)

Test = Test.drop(['Loan_ID'] , axis = 1)

apply_cats(df=Test , trn=df_raw)

test , _ , _  = proc_df(Test , na_dict = nas)

In [0]:
predictions = model.predict(test)

In [0]:
def Change(val : int):
    if val == 1 :
        return "Y"
    else :
        return "N"

predictions = list(map(lambda x : Change(x) , predictions))

submission = {'Loan_ID' : Loan_ID , 'Loan_Status' : predictions}

Df = pd.DataFrame.from_dict(submission)

Df.to_csv("submission.csv" , header = True , index = False)