In [298]:
import pandas as pd
import numpy as np
import collections
import datetime as dt
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

### Functions on bureau dataset

In [299]:
def max_comp(list_values):
    ctr = collections.Counter(list_values)
    return max(ctr, key=ctr.get)

def if_one(list_values):
    if 1 in list_values:
        return 1
    else:
        return 0
    
def is_loan(list_values):
    loan = []
    for i in list_values:
        if 'Loan' in i:
            loan.append(i)
    return loan

def one_hot(df,variable,top_x_labels):
    for label in top_x_labels:
        df[variable+"_"+label] = np.where(data[variable]== label,1,0)
    df.drop([variable], axis=1, inplace=True)


In [300]:
def prerequisites_for_one_hot_bureau(data):
    # returns the list of some columns
    all_loans = is_loan(data["ACCT-TYPE"].value_counts().index)
    all_contributors = data["CONTRIBUTOR-TYPE"].value_counts().index
    top_3_account_status = data["ACCOUNT-STATUS"].value_counts().sort_values(ascending = False).head(3).index
    top_3_asset_classes = data["ASSET_CLASS"].value_counts().sort_values(ascending = False).head(3).index
    top_3_ownerships = data["OWNERSHIP-IND"].value_counts().sort_values(ascending = False).head(3).index
    return all_loans, all_contributors, top_3_account_status, top_3_asset_classes, top_3_ownerships

In [301]:
def preprocessing_bureau(data, all_loans, all_contributors, top_3_account_status, top_3_asset_classes, top_3_ownerships):
    # final preprocessing on bureau data
    data["SELF-INDICATOR"] = pd.get_dummies(data["SELF-INDICATOR"],drop_first= True)
    data["MATCH-TYPE"] = pd.get_dummies(data["MATCH-TYPE"], drop_first=True)
    one_hot(data,"ACCT-TYPE", all_loans)
    one_hot(data,"CONTRIBUTOR-TYPE",all_contributors)
    one_hot(data,"ACCOUNT-STATUS",top_3_account_status)
    one_hot(data,"OWNERSHIP-IND", top_3_ownerships)
    data['CREDIT-LIMIT/SANC AMT'] = data['CREDIT-LIMIT/SANC AMT'].str.replace(",","").fillna(0).astype('int')
    data['DISBURSED-AMT/HIGH CREDIT'] = data['DISBURSED-AMT/HIGH CREDIT'].str.replace(",","").fillna(0).astype('int')
    data["INSTALLMENT-AMT"] = data['INSTALLMENT-AMT'].str.split("/").str[0]
    data['INSTALLMENT-AMT'] = data['INSTALLMENT-AMT'].str.replace(",","").fillna(0).astype('int')
    data["CURRENT-BAL"] = data["CURRENT-BAL"].str.replace(',',"").fillna(0).astype('int')
    data["OVERDUE-AMT"]=data['OVERDUE-AMT'].str.replace(",","").fillna(0).astype('int')
    data["WRITE-OFF-AMT"]=data["WRITE-OFF-AMT"].fillna(0).astype('int')
    one_hot(data,"ASSET_CLASS",top_3_asset_classes)
    output = data.groupby(["ID"]).agg(
        TENURE = ("TENURE", sum),
        DISBURSED_AMT_HIGH_CREDIT = ("DISBURSED-AMT/HIGH CREDIT", sum),
        SELF_INICATOR = ("SELF-INDICATOR", np.mean),
        MATCH_TYPE = ("MATCH-TYPE", np.mean),
        DATE_REPORTED = ("DATE-REPORTED", max),
        DISBURSED_DT = ("DISBURSED-DT", max),
        LAST_PAYMENT_DATE = ("LAST-PAYMENT-DATE", max),
        CREDIT_LIMIT_SANC_AMT = ("CREDIT-LIMIT/SANC AMT", sum),
        INSTALLMENT_AMT = ("INSTALLMENT-AMT", sum),
        CURRENT_BAL = ("CURRENT-BAL", max),
        OVERDUE_AMT = ("OVERDUE-AMT", max),
        WRITE_OFF_AMT = ("WRITE-OFF-AMT", sum),
        OWNERSHIP_IND_Individual = ("OWNERSHIP-IND_Individual", lambda x: 1 if sum(x) > 0 else 0),
        OWNERSHIP_IND_Joint = ("OWNERSHIP-IND_Joint", lambda x: 1 if sum(x) > 0 else 0),
        OWNERSHIP_IND_Guarantor = ("OWNERSHIP-IND_Guarantor", lambda x: 1 if sum(x) > 0 else 0),
        ACCT_TYPE_Tractor_Loan = ("ACCT-TYPE_Tractor Loan", lambda x: 1 if sum(x) > 0 else 0),
        ACCT_TYPE_Gold_Loan = ("ACCT-TYPE_Gold Loan", lambda x: 1 if sum(x) > 0 else 0),
        ACCT_TYPE_Business_Loan_Priority_Sector_Agriculture = ("ACCT-TYPE_Business Loan Priority Sector  Agriculture", lambda x: 1 if sum(x) > 0 else 0),
        ACCT_TYPE_Auto_Loan_Personal = ("ACCT-TYPE_Auto Loan (Personal)", lambda x: 1 if sum(x) > 0 else 0),
        ACCT_TYPE_Commercial_Vehicle_Loan = ("ACCT-TYPE_Commercial Vehicle Loan", lambda x: 1 if sum(x) > 0 else 0),
        ACCT_TYPE_Two_Wheeler_Loan = ("ACCT-TYPE_Two-Wheeler Loan", lambda x: 1 if sum(x) > 0 else 0),
        ACCT_TYPE_Consumer_Loan = ("ACCT-TYPE_Consumer Loan", lambda x: 1 if sum(x) > 0 else 0),
        ACCT_TYPE_Personal_Loan = ("ACCT-TYPE_Personal Loan", lambda x: 1 if sum(x) > 0 else 0),
        ACCT_TYPE_Housing_Loan = ("ACCT-TYPE_Housing Loan", lambda x: 1 if sum(x) > 0 else 0),
        ACCT_TYPE_Construction_Equipment_Loan = ("ACCT-TYPE_Construction Equipment Loan", lambda x: 1 if sum(x) > 0 else 0),
        ACCT_TYPE_Business_Loan_General = ("ACCT-TYPE_Business Loan General", lambda x: 1 if sum(x) > 0 else 0),
        ACCT_TYPE_Loan_Against_Bank_Deposits = ("ACCT-TYPE_Loan Against Bank Deposits", lambda x: 1 if sum(x) > 0 else 0),
        ACCT_TYPE_Used_Car_Loan = ("ACCT-TYPE_Used Car Loan", lambda x: 1 if sum(x) > 0 else 0),
        ACCT_TYPE_Business_Loan_Priority_Sector_Small_Business = ("ACCT-TYPE_Business Loan Priority Sector  Small Business", lambda x: 1 if sum(x) > 0 else 0),
        ACCT_TYPE_Loan_Against_Shares_Securities = ("ACCT-TYPE_Loan Against Shares / Securities", lambda x: 1 if sum(x) > 0 else 0),
        ACCT_TYPE_Property_Loan = ("ACCT-TYPE_Property Loan", lambda x: 1 if sum(x) > 0 else 0),
        ACCT_TYPE_Business_Loan_Priority_Sector_Others = ("ACCT-TYPE_Business Loan Priority Sector  Others", lambda x: 1 if sum(x) > 0 else 0),
        ACCT_TYPE_Mudra_Loans = ("ACCT-TYPE_Mudra Loans   Shishu / Kishor / Tarun", lambda x: 1 if sum(x) > 0 else 0),
        ACCT_TYPE_Education_Loan = ("ACCT-TYPE_Education Loan", lambda x: 1 if sum(x) > 0 else 0),
        ACCT_TYPE_Business_Loan_Secured  = ("ACCT-TYPE_Business Loan - Secured", lambda x: 1 if sum(x) > 0 else 0),
        ACCT_TYPE_Business_Loan_Against_Bank_Deposits = ("ACCT-TYPE_Business Loan Against Bank Deposits", lambda x: 1 if sum(x) > 0 else 0),
        ACCT_TYPE_Business_Loan_Unsecured = ("ACCT-TYPE_Business Loan Unsecured", lambda x: 1 if sum(x) > 0 else 0),
        ACCT_TYPE_Microfinance_Business_Loan = ("ACCT-TYPE_Microfinance Business Loan", lambda x: 1 if sum(x) > 0 else 0),
        ACCT_TYPE_Loan_to_Professional = ("ACCT-TYPE_Loan to Professional", lambda x: 1 if sum(x) > 0 else 0),
        ACCT_TYPE_Microfinance_Personal_Loan = ("ACCT-TYPE_Microfinance Personal Loan", lambda x: 1 if sum(x) > 0 else 0),
        ACCT_TYPE_Microfinance_Housing_Loan = ("ACCT-TYPE_Microfinance Housing Loan", lambda x: 1 if sum(x) > 0 else 0),
        ACCT_TYPE_Commercial_Equipment_Loan = ("ACCT-TYPE_Commercial Equipment Loan", lambda x: 1 if sum(x) > 0 else 0),
        #ACCT_TYPE_Loan_on_Credit_Card  = ("ACCT-TYPE_Loan on Credit Card", lambda x: 1 if sum(x) > 0 else 0),
        CONTRIBUTOR_TYPE_NBF = ("CONTRIBUTOR-TYPE_NBF", lambda x: 1 if sum(x) > 0 else 0),
        CONTRIBUTOR_TYPE_NAB = ("CONTRIBUTOR-TYPE_NAB", lambda x: 1 if sum(x) > 0 else 0),
        CONTRIBUTOR_TYPE_PRB = ("CONTRIBUTOR-TYPE_PRB", lambda x: 1 if sum(x) > 0 else 0),
        CONTRIBUTOR_TYPE_RRB = ("CONTRIBUTOR-TYPE_RRB", lambda x: 1 if sum(x) > 0 else 0),
        CONTRIBUTOR_TYPE_COP = ("CONTRIBUTOR-TYPE_COP", lambda x: 1 if sum(x) > 0 else 0),
        CONTRIBUTOR_TYPE_MFI = ("CONTRIBUTOR-TYPE_MFI", lambda x: 1 if sum(x) > 0 else 0),
        CONTRIBUTOR_TYPE_HFC = ("CONTRIBUTOR-TYPE_HFC", lambda x: 1 if sum(x) > 0 else 0),
        CONTRIBUTOR_TYPE_CCC = ("CONTRIBUTOR-TYPE_CCC", lambda x: 1 if sum(x) > 0 else 0),
        CONTRIBUTOR_TYPE_FRB = ("CONTRIBUTOR-TYPE_FRB", lambda x: 1 if sum(x) > 0 else 0),
        CONTRIBUTOR_TYPE_SFB = ("CONTRIBUTOR-TYPE_SFB", lambda x: 1 if sum(x) > 0 else 0),
        #CONTRIBUTOR_TYPE_ARC = ("CONTRIBUTOR-TYPE_ARC", lambda x: 1 if sum(x) > 0 else 0),
        CONTRIBUTOR_TYPE_OFI = ("CONTRIBUTOR-TYPE_OFI", lambda x: 1 if sum(x) > 0 else 0),
        ACCOUNT_STATUS_Closed = ("ACCOUNT-STATUS_Closed", lambda x: 1 if sum(x) > 0 else 0),
        ACCOUNT_STATUS_Active = ("ACCOUNT-STATUS_Active", lambda x: 1 if sum(x) > 0 else 0),
        ACCOUNT_STATUS_Delinquent = ("ACCOUNT-STATUS_Delinquent", lambda x: 1 if sum(x) > 0 else 0),
        ASSET_CLASS_Doubtful = ("ASSET_CLASS_Doubtful", lambda x: 1 if sum(x) > 0 else 0),
        ASSET_CLASS_Standard = ("ASSET_CLASS_Standard", lambda x: 1 if sum(x) > 0 else 0),
        ASSET_CLASS_SubStandard = ("ASSET_CLASS_SubStandard", lambda x: 1 if sum(x) > 0 else 0),
    )
    return output

### Functions on the first dataset

In [302]:
def prerequisites_for_one_hot_dataset(data):
    # returns the list of some columns
    top_3_frequencies = data["Frequency"].value_counts().sort_values(ascending = False).head(3).index
    data["SEX"] = data["SEX"].fillna("Others")
    top_2_genders = data["SEX"].value_counts().sort_values(ascending = False).head(2).index
    top_14_states = data["State"].value_counts().sort_values(ascending = False).head(14).index
    top_4_payments = data["PaymentMode"].value_counts().sort_values(ascending = False).head(4).index
    return top_3_frequencies, top_2_genders, top_14_states, top_4_payments

In [303]:
def one_hot_for_dataset(data, top_3_frequencies, top_2_genders, top_14_states, top_4_payments):
    one_hot(data,"Frequency",top_3_frequencies)
    one_hot(data,"SEX",top_2_genders)
    one_hot(data,"State",top_14_states)
    one_hot(data,"PaymentMode",top_4_payments)
    return data

### Filling null values

In [304]:
def create_encoders(data):
    installment_encoder = LabelEncoder()
    loan_status_encoder = LabelEncoder()
    age_imputer = SimpleImputer(strategy= "median")
    tenure_imputer = SimpleImputer(strategy= "median")
    income_imputer = SimpleImputer()
    age_imputer.fit(np.array(data['AGE']).reshape(-1,1))
    tenure_imputer.fit(np.array(data['TENURE']).reshape(-1,1))
    income_imputer.fit(np.array(data['MonthlyIncome']).reshape(-1,1))
    installment_encoder.fit(data["InstlmentMode"])
    loan_status_encoder.fit(data["LoanStatus"])
    return age_imputer, tenure_imputer, income_imputer, installment_encoder, loan_status_encoder

In [305]:
def fill_na(data, age_imputer, tenure_imputer, income_imputer, installment_encoder, loan_status_encoder):
    
    # filling null dates
    current_date = dt.datetime.now()
    current_time = current_date.strftime('%Y-%m-%d')
    data["DATE_REPORTED_YEAR"] = pd.to_datetime(data['DATE_REPORTED'], errors='coerce').dt.year
    data["DATE_REPORTED_MONTH"] = pd.to_datetime(data['DATE_REPORTED'], errors='coerce').dt.month
    data["DisbursalDateMonth"] = pd.to_datetime(data['DisbursalDate'], errors='coerce').dt.month
    data["DisbursalDateYear"] = pd.to_datetime(data['DisbursalDate'], errors='coerce').dt.year
    data["DISBURSED_DT_YEAR"] = pd.to_datetime(data['DISBURSED_DT'], errors='coerce').dt.year
    data["DISBURSED_DT_MONTH"] = pd.to_datetime(data['DISBURSED_DT'], errors='coerce').dt.month
    data['MaturityMonth'] = pd.to_datetime(data['MaturityDAte'], errors='coerce').dt.month
    data['MaturityYear'] = pd.to_datetime(data['MaturityDAte'], errors='coerce').dt.year
    data['AuthDateMonth'] = pd.to_datetime(data['AuthDate'], errors='coerce').dt.month
    data['AuthDateYear'] = pd.to_datetime(data['AuthDate'], errors='coerce').dt.year
    data['LAST_PAYMENT_MONTH'] = pd.to_datetime(data['LAST_PAYMENT_DATE'], errors='coerce').dt.month
    data['LAST_PAYMENT_YEAR'] = pd.to_datetime(data['LAST_PAYMENT_DATE'], errors='coerce').dt.year
    
    # filling null zipcodes
    for i,j in data.groupby(["BranchID"]):
        col = max_comp(list(j["ZiPCODE"]))
        a = j.iloc[0]["BranchID"]
        data.loc[(data['ZiPCODE'].isnull() == True) & (data['BranchID'] == a), "ZiPCODE"] = col
    
    # filling using imputers
    data["AGE"] = age_imputer.transform(np.array(data['AGE']).reshape(-1,1))
    data["TENURE"] = tenure_imputer.transform(np.array(data['TENURE']).reshape(-1,1))
    data["MonthlyIncome"] = income_imputer.transform(np.array(data['MonthlyIncome']).reshape(-1,1))
    
    # filling using encoder
    data["InstlmentMode"] = installment_encoder.transform(data["InstlmentMode"])
    data["LoanStatus"] = loan_status_encoder.transform(data["LoanStatus"])
    
    # filling for Manufacturer ID
    col = max_comp(list(data["ManufacturerID"]))
    data["ManufacturerID"] = data["ManufacturerID"].fillna(col)
    return data

### Importing bureau data

In [9]:
input_data_1 = pd.read_excel("Train/train_bureau.xlsx", engine="openpyxl")

In [346]:
data = input_data_1.copy(deep=True)
all_loans, all_contributors, top_3_account_status, top_3_asset_classes, top_3_ownerships = prerequisites_for_one_hot_bureau(data)
bureau_data = preprocessing_bureau(data, all_loans, all_contributors, top_3_account_status, top_3_asset_classes, top_3_ownerships)

'2021-02-07 21:43:45.932849'

### Importing train data

In [11]:
input_data_2 = pd.read_excel("Train/train_Data.xlsx", engine='openpyxl')

In [348]:
data = input_data_2.copy(deep=True)

In [349]:
top_3_frequencies, top_2_genders, top_14_states, top_4_payments = prerequisites_for_one_hot_dataset(data)
data = one_hot_for_dataset(data, top_3_frequencies, top_2_genders, top_14_states, top_4_payments)

### Merging both datasets

In [350]:
train_data = pd.merge(bureau_data, data, on="ID")
age_imputer, tenure_imputer, income_imputer, installment_encoder, loan_status_encoder = create_encoders(train_data)
train_data = fill_na(train_data, age_imputer, tenure_imputer, income_imputer, installment_encoder, loan_status_encoder)

In [352]:
train_ID = train_data["ID"]

In [353]:
y_train = train_data["Top-up Month"]
y_encode = LabelEncoder()
y_train = y_encode.fit_transform(y_train)

In [354]:
train_data.drop(["City", "Area", "ID", "DisbursalDate","DISBURSED_DT",
                "MaturityDAte", "AuthDate","Top-up Month","DATE_REPORTED","LAST_PAYMENT_DATE"
                ], axis=1, inplace=True)

### Working on test data

In [18]:
input_data_3 = pd.read_excel("Test/test_bureau.xlsx", engine="openpyxl")

In [355]:
data = input_data_3.copy(deep=True)
bureau_data = preprocessing_bureau(data, all_loans, all_contributors, top_3_account_status, top_3_asset_classes, top_3_ownerships)

In [335]:
input_data_4 = pd.read_excel("Test/test_Data.xlsx", engine='openpyxl')

In [356]:
data = input_data_4.copy(deep=True)
data = one_hot_for_dataset(data, top_3_frequencies, top_2_genders, top_14_states, top_4_payments)

In [357]:
test_data = pd.merge(bureau_data, data, on="ID")
test_data = fill_na(test_data, age_imputer, tenure_imputer, income_imputer, installment_encoder, loan_status_encoder)

In [358]:
ID = test_data["ID"]
test_data.drop(["City", "Area", "ID", "DisbursalDate","DATE_REPORTED",
                "MaturityDAte", "AuthDate","DISBURSED_DT","LAST_PAYMENT_DATE"
                ], axis=1, inplace=True)

### Working on ML models

#### CatBoost classifier

In [283]:
import catboost as ctb
from sklearn.model_selection import RandomizedSearchCV

In [290]:
random_grid = {'learning_rate': [0.5, 0.60, 0.7],
               'depth': [4,5,6],
               'iterations': [650, 700, 750]}


CBC = RandomizedSearchCV(estimator = ctb.CatBoostClassifier(), param_distributions = random_grid, cv = 3, n_jobs=5, verbose = 2)

CBC.fit(train_data, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done  30 out of  30 | elapsed: 16.2min finished


0:	learn: 0.7112971	total: 58.7ms	remaining: 38.1s
1:	learn: 0.6209845	total: 109ms	remaining: 35.3s
2:	learn: 0.5752582	total: 159ms	remaining: 34.4s
3:	learn: 0.5567135	total: 208ms	remaining: 33.6s
4:	learn: 0.5340287	total: 281ms	remaining: 36.2s
5:	learn: 0.5216233	total: 327ms	remaining: 35.1s
6:	learn: 0.5136661	total: 382ms	remaining: 35.1s
7:	learn: 0.5027080	total: 436ms	remaining: 35s
8:	learn: 0.4970222	total: 484ms	remaining: 34.5s
9:	learn: 0.4880148	total: 545ms	remaining: 34.9s
10:	learn: 0.4815030	total: 600ms	remaining: 34.9s
11:	learn: 0.4784503	total: 660ms	remaining: 35.1s
12:	learn: 0.4755548	total: 718ms	remaining: 35.2s
13:	learn: 0.4714715	total: 770ms	remaining: 35s
14:	learn: 0.4678819	total: 826ms	remaining: 35s
15:	learn: 0.4661326	total: 877ms	remaining: 34.8s
16:	learn: 0.4637065	total: 922ms	remaining: 34.3s
17:	learn: 0.4619796	total: 974ms	remaining: 34.2s
18:	learn: 0.4606344	total: 1.03s	remaining: 34.2s
19:	learn: 0.4570758	total: 1.09s	remaining: 3

164:	learn: 0.3977112	total: 8.26s	remaining: 24.3s
165:	learn: 0.3975877	total: 8.31s	remaining: 24.2s
166:	learn: 0.3973795	total: 8.35s	remaining: 24.2s
167:	learn: 0.3972562	total: 8.41s	remaining: 24.1s
168:	learn: 0.3972225	total: 8.45s	remaining: 24.1s
169:	learn: 0.3971430	total: 8.5s	remaining: 24s
170:	learn: 0.3969194	total: 8.55s	remaining: 23.9s
171:	learn: 0.3967613	total: 8.59s	remaining: 23.9s
172:	learn: 0.3966711	total: 8.64s	remaining: 23.8s
173:	learn: 0.3966211	total: 8.68s	remaining: 23.7s
174:	learn: 0.3962329	total: 8.73s	remaining: 23.7s
175:	learn: 0.3960799	total: 8.77s	remaining: 23.6s
176:	learn: 0.3958323	total: 8.83s	remaining: 23.6s
177:	learn: 0.3955956	total: 8.88s	remaining: 23.5s
178:	learn: 0.3954783	total: 8.92s	remaining: 23.5s
179:	learn: 0.3953048	total: 8.97s	remaining: 23.4s
180:	learn: 0.3950458	total: 9.02s	remaining: 23.4s
181:	learn: 0.3947250	total: 9.07s	remaining: 23.3s
182:	learn: 0.3947029	total: 9.11s	remaining: 23.3s
183:	learn: 0.3

325:	learn: 0.3705901	total: 16.3s	remaining: 16.2s
326:	learn: 0.3704630	total: 16.4s	remaining: 16.2s
327:	learn: 0.3702783	total: 16.5s	remaining: 16.2s
328:	learn: 0.3701214	total: 16.5s	remaining: 16.1s
329:	learn: 0.3700644	total: 16.5s	remaining: 16s
330:	learn: 0.3698462	total: 16.6s	remaining: 16s
331:	learn: 0.3697872	total: 16.6s	remaining: 15.9s
332:	learn: 0.3696019	total: 16.7s	remaining: 15.9s
333:	learn: 0.3694463	total: 16.7s	remaining: 15.8s
334:	learn: 0.3694317	total: 16.8s	remaining: 15.8s
335:	learn: 0.3693499	total: 16.8s	remaining: 15.7s
336:	learn: 0.3692268	total: 16.9s	remaining: 15.7s
337:	learn: 0.3690884	total: 16.9s	remaining: 15.6s
338:	learn: 0.3689927	total: 17s	remaining: 15.6s
339:	learn: 0.3689001	total: 17s	remaining: 15.5s
340:	learn: 0.3687944	total: 17.1s	remaining: 15.5s
341:	learn: 0.3686417	total: 17.1s	remaining: 15.4s
342:	learn: 0.3683671	total: 17.2s	remaining: 15.4s
343:	learn: 0.3682358	total: 17.2s	remaining: 15.3s
344:	learn: 0.368103

485:	learn: 0.3493695	total: 24.5s	remaining: 8.25s
486:	learn: 0.3492889	total: 24.5s	remaining: 8.2s
487:	learn: 0.3491993	total: 24.6s	remaining: 8.15s
488:	learn: 0.3491287	total: 24.6s	remaining: 8.1s
489:	learn: 0.3489885	total: 24.6s	remaining: 8.04s
490:	learn: 0.3488902	total: 24.7s	remaining: 7.99s
491:	learn: 0.3487627	total: 24.7s	remaining: 7.94s
492:	learn: 0.3482639	total: 24.8s	remaining: 7.89s
493:	learn: 0.3481137	total: 24.8s	remaining: 7.84s
494:	learn: 0.3479944	total: 24.9s	remaining: 7.79s
495:	learn: 0.3478657	total: 24.9s	remaining: 7.75s
496:	learn: 0.3475890	total: 25s	remaining: 7.7s
497:	learn: 0.3474424	total: 25.1s	remaining: 7.66s
498:	learn: 0.3473326	total: 25.2s	remaining: 7.63s
499:	learn: 0.3473069	total: 25.3s	remaining: 7.58s
500:	learn: 0.3472259	total: 25.4s	remaining: 7.55s
501:	learn: 0.3471287	total: 25.5s	remaining: 7.51s
502:	learn: 0.3469599	total: 25.5s	remaining: 7.46s
503:	learn: 0.3467887	total: 25.6s	remaining: 7.41s
504:	learn: 0.346

644:	learn: 0.3311451	total: 33.8s	remaining: 262ms
645:	learn: 0.3310946	total: 33.8s	remaining: 210ms
646:	learn: 0.3309668	total: 33.9s	remaining: 157ms
647:	learn: 0.3308980	total: 34s	remaining: 105ms
648:	learn: 0.3308556	total: 34.1s	remaining: 52.5ms
649:	learn: 0.3307559	total: 34.1s	remaining: 0us


RandomizedSearchCV(cv=3,
                   estimator=<catboost.core.CatBoostClassifier object at 0x7f747d4ccb20>,
                   n_jobs=5,
                   param_distributions={'depth': [4, 5, 6],
                                        'iterations': [650, 700, 750],
                                        'learning_rate': [0.5, 0.6, 0.7]},
                   verbose=2)

In [293]:
CBC_predict = CBC.predict(test_data)
CBC_predict = y_encode.inverse_transform(CBC_predict)
CBC_predict = pd.DataFrame(CBC_predict, columns=["Top-up Month"], index=ID)
CBC_predict.to_csv("basic_ctbc_predictions.csv")

In [294]:
predict = CBC.predict(train_data)
print(f1_score(y_train, predict, average="macro"))

0.6121477883375656


#### Light GBM

In [360]:
#import lightgbm as lgb

#d_train = lgb.Dataset(train_data, label=y_train)
#params = {}
#params['learning_rate'] = 0.1
#params['boosting_type'] = 'dart'
#params['objective'] = 'multiclass'
#params['metric'] = 'multi_logloss'
#params['max_depth']=7
#params['num_class'] = 7
#params['lambda'] = 0.3
#params['min_data_in_leaf'] = 25
#clf = lgb.train(params, d_train, 800)


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4975
[LightGBM] [Info] Number of data points in the train set: 128655, number of used features: 102
[LightGBM] [Info] Start training from score -2.732959
[LightGBM] [Info] Start training from score -4.823700
[LightGBM] [Info] Start training from score -3.995089
[LightGBM] [Info] Start training from score -3.606660
[LightGBM] [Info] Start training from score -3.738066
[LightGBM] [Info] Start training from score -3.560765
[LightGBM] [Info] Start training from score -0.187329


In [361]:
#lgbm_predict = clf.predict(test_data)
#lgbm_predict = np.argmax(lgbm_predict,axis=1)
#lgbm_predict = y_encode.inverse_transform(lgbm_predict)
#lgbm_predict = pd.DataFrame(lgbm_predict, columns=["Top-up Month"], index=ID)
#lgbm_predict.to_csv("basic_lgbm_predictions.csv")

In [362]:
#predict = clf.predict(train_data)
#lgbm_predict = np.argmax(predict,axis=1)

#print(f1_score(y_train, lgbm_predict, average="macro"))

0.7346033018539019
