In [1]:
import pandas as pd 
import os
import numpy as np
from matplotlib import pyplot as plt 

In [2]:
# Notebook-wide pandas display limits: show at most 10 rows but up to 999
# columns when a frame is rendered.
pd.set_option("display.max_rows", 10)
pd.set_option("display.max_columns", 999)

In [3]:
def process_cols(df, cols_dict, sample=None):
    """Anonymise feature names and normalise categorical columns.

    Every column except ``"target"`` is renamed to ``cat_<i>`` when its
    original name appears in ``cols_dict["cat_cols"]`` and to ``num_<i>``
    otherwise, where ``<i>`` is the column's position in ``df``.
    Categorical columns then get missing values replaced by the string
    ``"new_category_nan"`` and are cast to ``str``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; a column named ``"target"`` (if present) keeps its name.
    cols_dict : dict
        Only the ``"cat_cols"`` entry is read here: the original names of
        the columns to treat as categorical.
    sample : float, optional
        When given, ``int(len(df) * sample)`` rows are drawn with
        ``random_state=42`` and the index is reset before any renaming.

    Returns
    -------
    pandas.DataFrame
        The renamed (and optionally sub-sampled) frame. The number of
        categorical columns is also printed.
    """
    if sample is not None:
        n_keep = int(df.shape[0] * sample)
        df = df.sample(n_keep, random_state=42).reset_index(drop=True)

    categorical_names = cols_dict["cat_cols"]
    renames = {}
    cat_cols = []
    for position, original in enumerate(df.columns):
        if original == "target":
            # The label column is never renamed.
            continue
        if original in categorical_names:
            alias = f"cat_{position}"
            cat_cols.append(alias)
        else:
            alias = f"num_{position}"
        renames[original] = alias

    df = df.rename(renames, axis=1)

    # Missing categories become their own explicit level, then everything
    # is stringified so numeric-coded categories are treated as labels.
    for alias in cat_cols:
        df[alias] = df[alias].fillna("new_category_nan").astype("str")

    print("Tot cat columns : ", len(cat_cols))
    return df

# telecom

In [4]:
# https://www.kaggle.com/blastchar/telco-customer-churn

In [5]:
# Raw Telco churn CSV lives under ../data/telecom/; data_pth keeps its
# trailing slash because later cells build file names by concatenation.
dataset_name = "telecom"
data_pth = f"../data/{dataset_name}/"
data = pd.read_csv(
    filepath_or_buffer=f"{data_pth}WA_Fn-UseC_-Telco-Customer-Churn.csv"
)
print(data.shape)
data.head()

(7043, 21)


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [6]:
# Column roles for the telecom data: identifiers to drop, features to treat
# as categorical, and (none here) time columns to drop.
cols_dict = dict(
    id_cols=["customerID"],
    cat_cols=[
        "gender",
        "SeniorCitizen",
        "Partner",
        "Dependents",
        "PhoneService",
        "MultipleLines",
        "InternetService",
        "OnlineSecurity",
        "OnlineBackup",
        "DeviceProtection",
        "TechSupport",
        "StreamingTV",
        "StreamingMovies",
        "Contract",
        "PaperlessBilling",
        "PaymentMethod",
    ],
    time_cols=[],
)

In [7]:
# Boolean label: True where Churn is "Yes".
data["target"] = data["Churn"].eq("Yes")
# Drop the raw label together with the time and id columns (in place, so
# this cell cannot be re-run without reloading `data`).
data.drop(
    columns=["Churn", *cols_dict["time_cols"], *cols_dict["id_cols"]],
    inplace=True,
)

In [8]:
# Rename features to cat_<i>/num_<i> and stringify the categorical ones.
data = process_cols(df=data, cols_dict=cols_dict)
data.head()

Tot cat columns :  16


Unnamed: 0,cat_0,cat_1,cat_2,cat_3,num_4,cat_5,cat_6,cat_7,cat_8,cat_9,cat_10,cat_11,cat_12,cat_13,cat_14,cat_15,cat_16,num_17,num_18,target
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,False
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,False
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,True
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,False
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,True


In [9]:
# Print every column's dtype on its own line (a rendered `data.dtypes`
# would presumably be truncated by the display.max_rows = 10 setting).
for col, col_dtype in data.dtypes.items():
    print(col, col_dtype)

cat_0 object
cat_1 object
cat_2 object
cat_3 object
num_4 int64
cat_5 object
cat_6 object
cat_7 object
cat_8 object
cat_9 object
cat_10 object
cat_11 object
cat_12 object
cat_13 object
cat_14 object
cat_15 object
cat_16 object
num_17 float64
num_18 object
target bool


In [10]:
# num_18 was read as object (see the dtype listing above): entries that are
# exactly a single space are blanked to missing, then the column is cast
# to float.
data['num_18'] = data['num_18'].mask(data['num_18'].eq(' ')).astype("float")

In [11]:
# Persist the processed frame as a gzip-compressed CSV (no index column)
# in the same directory as the raw file.
data.to_csv(
    path_or_buf=f"{data_pth}{dataset_name}.gz",
    index=False,
    compression='gzip',
)

# mortgages

In [12]:
# https://www.crowdanalytix.com/contests/propensity-to-fund-mortgages

In [13]:
# Raw mortgage training CSV lives under ../data/mortgages/; data_pth keeps
# its trailing slash because later cells build file names by concatenation.
dataset_name = "mortgages"
data_pth = f"../data/{dataset_name}/"
data = pd.read_csv(
    filepath_or_buffer=f"{data_pth}CAX_MortgageModeling_Train.csv"
)
print(data.shape)
data.head()

(45642, 22)


Unnamed: 0,Unique_ID,MORTGAGE NUMBER,PROPERTY VALUE,MORTGAGE PAYMENT,GDS,LTV,TDS,AMORTIZATION,MORTGAGE AMOUNT,RATE,MORTGAGE PURPOSE,PAYMENT FREQUENCY,PROPERTY TYPE,TERM,FSA,AGE RANGE,GENDER,INCOME,INCOME TYPE,NAICS CODE,CREDIT SCORE,RESULT
0,CAX_Train_1,1,900000,5429,61.98,65.0,71.63,360,1040000.0,4.0,Refinance,Monthly,Single Detached,12,L4C,Under 25,Male,108000,8,44-45,681,FUNDED
1,CAX_Train_2,2,386000,2179,35.22,74.29,40.65,360,390000.0,4.5,Refinance,Monthly,Single Detached,12,L9T,70 and over,Male,78000,2,56,710,FUNDED
2,CAX_Train_3,3,531000,2152,30.97,80.0,35.41,360,424800.0,3.5,Purchase,Monthly,Single Detached,6,M1N,35-39,Female,87000,2,72,709,FUNDED
3,CAX_Train_4,4,1200000,5410,19.04,75.0,34.14,360,960000.0,5.5,Purchase,Monthly,Single Detached,12,M2M,45-49,Male,300000,8,54,761,FUNDED
4,CAX_Train_5,5,350000,3342,29.59,80.0,34.85,360,592000.0,5.0,Refinance,Monthly,Single Detached,12,L7G,50-54,Male,147000,8,62,762,FUNDED


In [14]:
# Column roles for the mortgages data: identifiers to drop, features to
# treat as categorical, and (none here) time columns to drop.
cols_dict = dict(
    id_cols=["Unique_ID", "MORTGAGE NUMBER"],
    cat_cols=[
        "MORTGAGE PURPOSE",
        "PAYMENT FREQUENCY",
        "PROPERTY TYPE",
        "TERM",
        "FSA",
        "AGE RANGE",
        "GENDER",
        "INCOME TYPE",
        "NAICS CODE",
    ],
    time_cols=[],
)

In [15]:
# Boolean label: True where RESULT is "FUNDED".
data["target"] = data["RESULT"].eq("FUNDED")
# Drop the raw label together with the time and id columns (in place, so
# this cell cannot be re-run without reloading `data`).
data.drop(
    columns=["RESULT", *cols_dict["time_cols"], *cols_dict["id_cols"]],
    inplace=True,
)

In [16]:
# Rename features to cat_<i>/num_<i> and stringify the categorical ones.
data = process_cols(df=data, cols_dict=cols_dict)
data.head()

Tot cat columns :  9


Unnamed: 0,num_0,num_1,num_2,num_3,num_4,num_5,num_6,num_7,cat_8,cat_9,cat_10,cat_11,cat_12,cat_13,cat_14,num_15,cat_16,cat_17,num_18,target
0,900000,5429,61.98,65.0,71.63,360,1040000.0,4.0,Refinance,Monthly,Single Detached,12,L4C,Under 25,Male,108000,8,44-45,681,True
1,386000,2179,35.22,74.29,40.65,360,390000.0,4.5,Refinance,Monthly,Single Detached,12,L9T,70 and over,Male,78000,2,56,710,True
2,531000,2152,30.97,80.0,35.41,360,424800.0,3.5,Purchase,Monthly,Single Detached,6,M1N,35-39,Female,87000,2,72,709,True
3,1200000,5410,19.04,75.0,34.14,360,960000.0,5.5,Purchase,Monthly,Single Detached,12,M2M,45-49,Male,300000,8,54,761,True
4,350000,3342,29.59,80.0,34.85,360,592000.0,5.0,Refinance,Monthly,Single Detached,12,L7G,50-54,Male,147000,8,62,762,True


In [17]:
# Print every column's dtype on its own line (a rendered `data.dtypes`
# would presumably be truncated by the display.max_rows = 10 setting).
for col, col_dtype in data.dtypes.items():
    print(col, col_dtype)

num_0 int64
num_1 int64
num_2 float64
num_3 float64
num_4 float64
num_5 int64
num_6 float64
num_7 float64
cat_8 object
cat_9 object
cat_10 object
cat_11 object
cat_12 object
cat_13 object
cat_14 object
num_15 int64
cat_16 object
cat_17 object
num_18 int64
target bool


In [18]:
# Persist the processed frame as a gzip-compressed CSV (no index column)
# in the same directory as the raw file.
data.to_csv(
    path_or_buf=f"{data_pth}{dataset_name}.gz",
    index=False,
    compression='gzip',
)

# Taxi

In [19]:
# https://www.crowdanalytix.com/contests/mckinsey-big-data-hackathon

In [20]:
# Raw taxi-offer training CSV lives under ../data/taxi/; data_pth keeps its
# trailing slash because later cells build file names by concatenation.
dataset_name = "taxi"
data_pth = f"../data/{dataset_name}/"
data = pd.read_csv(filepath_or_buffer=f"{data_pth}CAX_TrainingData_McK.csv")
print(data.shape)
data.head()

(892557, 14)


Unnamed: 0,offer_gk,weekday_key,hour_key,driver_gk,order_gk,driver_latitude,driver_longitude,origin_order_latitude,origin_order_longitude,distance_km,duration_min,offer_class_group,ride_type_desc,driver_response
0,1105373,5,20,6080,174182,55.818842,37.334562,55.814567,37.35501,-1.0,-1.0,Economy,private,0
1,759733,5,14,6080,358774,55.805342,37.515023,55.819329,37.466398,18.802,25.217,Standard,private,1
2,416977,6,14,6080,866260,55.813978,37.347688,55.814827,37.354074,6.747,9.8,Economy,private,0
3,889660,2,6,6080,163522,55.745922,37.421748,55.743469,37.43113,-1.0,-1.0,Economy,private,1
4,1120055,4,16,6080,506710,55.803578,37.521602,55.812559,37.527407,12.383,19.25,Economy,private,1


In [21]:
# Column roles for the taxi data.
# NOTE(review): "time_cols" holds the four latitude/longitude columns, not
# timestamps. The next cell drops everything listed under "time_cols", so
# the practical effect is that the coordinates are removed from the
# features -- confirm that this is intended.
cols_dict = dict(
    id_cols=["offer_gk", "order_gk"],
    cat_cols=[
        "weekday_key",
        "hour_key",
        "driver_gk",
        "offer_class_group",
        "ride_type_desc",
    ],
    time_cols=[
        "driver_latitude",
        "driver_longitude",
        "origin_order_latitude",
        "origin_order_longitude",
    ],
)

In [22]:
# Boolean label: True where driver_response equals 1.
data["target"] = data["driver_response"].eq(1)
# Drop the raw label together with the "time_cols" and id columns (in
# place, so this cell cannot be re-run without reloading `data`).
data.drop(
    columns=["driver_response", *cols_dict["time_cols"], *cols_dict["id_cols"]],
    inplace=True,
)

In [23]:
# Rename features to cat_<i>/num_<i> and stringify the categorical ones.
data = process_cols(df=data, cols_dict=cols_dict)
data.head()

Tot cat columns :  5


Unnamed: 0,cat_0,cat_1,cat_2,num_3,num_4,cat_5,cat_6,target
0,5,20,6080,-1.0,-1.0,Economy,private,False
1,5,14,6080,18.802,25.217,Standard,private,True
2,6,14,6080,6.747,9.8,Economy,private,False
3,2,6,6080,-1.0,-1.0,Economy,private,True
4,4,16,6080,12.383,19.25,Economy,private,True


In [24]:
# Print every column's dtype on its own line.
for col, col_dtype in data.dtypes.items():
    print(col, col_dtype)

cat_0 object
cat_1 object
cat_2 object
num_3 float64
num_4 float64
cat_5 object
cat_6 object
target bool


In [25]:
# Persist the processed frame as a gzip-compressed CSV (no index column)
# in the same directory as the raw file.
data.to_csv(
    path_or_buf=f"{data_pth}{dataset_name}.gz",
    index=False,
    compression='gzip',
)

# poverty_A

In [26]:
# https://www.drivendata.org/competitions/50/worldbank-poverty-prediction/data/

In [27]:
# Individual-level training CSV for country A lives under
# ../data/poverty_A/; data_pth keeps its trailing slash because later cells
# build file names by concatenation.
dataset_name = "poverty_A"
data_pth = f"../data/{dataset_name}/"
data = pd.read_csv(filepath_or_buffer=f"{data_pth}A_indiv_train.csv")
print(data.shape)
data.head()

(37560, 44)


Unnamed: 0,id,iid,HeUgMnzF,CaukPfUC,MzEtIdUF,gtnNTNam,SWoXNmPc,eXbOkwhI,OdXpbPGJ,XONDGWjH,KsFoQcUV,qYRZCuJD,FPQrjGnS,hOamrctW,XacGrSou,UsmeXdIS,igHwZsYz,cxWuAOZv,AQpdiRUz,AoLwmlEH,nLUXHpZr,CRLlSiFu,jYpOAjPW,NAxEQZVi,QvgxCmCV,kvMGuSDN,AjYsrafY,GNaMafZC,zTLPJYTw,QKzxTGus,SGeOiUlZ,MUrHEJeh,XBldkztv,tbgZsPXD,ukWqmeSS,qqVibbSA,MgCoFhXK,rFpoTXAq,RXcLsVAQ,rQWIpTiG,XizJGmbu,xqUooaNJ,poor,country
0,80389,1,XJsPz,mOlYV,UFoKR,SSvEP,onRNG,YXCNt,4.0,oArAw,kpkiH,fohru,scxJu,rrHdI,YEngm,pdgUV,YwljV,QkRds,SowpV,XNPgB,CneHb,dpMMl,KOjYm,GIApU,hCKQi,vtkRP,kVYrO,sitaC,VneGw,WNISg,OlSuJ,qmOVd,XQevi,yOwsR,181,QQdHS,uEstx,Hikoa,zQvdC,xUYIC,juMSt,dSJoN,True,A
1,80389,2,XJsPz,mOlYV,axSTs,CXizI,onRNG,YXCNt,4.0,ccbZA,HgfUG,fohru,scxJu,rrHdI,EaHvf,pdgUV,YwljV,QkRds,SowpV,XNPgB,CneHb,SjaWF,KOjYm,GIApU,hCKQi,Qydia,OLBHI,sitaC,VneGw,EAWFH,OlSuJ,qmOVd,tbsMf,yOwsR,141,QQdHS,uEstx,Hikoa,zQvdC,xUYIC,juMSt,JTCKs,True,A
2,80389,3,TRFeI,mOlYV,axSTs,CXizI,NDnCs,YXCNt,4.0,fOUHD,HgfUG,oEref,bJTYb,kNZZn,zfTDU,olfwp,flBEG,QkRds,QjTos,vSaJn,CneHb,ndArQ,KOjYm,GIApU,AyuSE,Qydia,OLBHI,sitaC,zncPX,EAWFH,OlSuJ,qmOVd,tbsMf,yOwsR,41,QQdHS,gCSRj,Hikoa,zQvdC,rkLqZ,juMSt,JTCKs,True,A
3,80389,4,XJsPz,yAyAe,FRcdT,CXizI,onRNG,YXCNt,,fOUHD,HgfUG,fohru,bJTYb,rrHdI,VzUws,olfwp,flBEG,QkRds,nUKzL,vSaJn,vvXmD,ndArQ,KOjYm,GIApU,AyuSE,Qydia,OLBHI,sitaC,rXEFU,aKoLM,OlSuJ,mEGPl,tbsMf,yOwsR,16,QQdHS,uEstx,Hikoa,zQvdC,jVHyH,GtHel,JTCKs,True,A
4,39883,1,XJsPz,mOlYV,UFoKR,HIvIU,onRNG,YXCNt,4.0,oArAw,kpkiH,fohru,scxJu,rrHdI,HHynv,IZbuU,YwljV,QkRds,SowpV,OeQKE,CneHb,SjaWF,KOjYm,GIApU,hCKQi,Qydia,sqnlK,sitaC,VneGw,EAWFH,OlSuJ,qmOVd,tbsMf,yOwsR,381,QQdHS,uEstx,Hikoa,zQvdC,xUYIC,juMSt,UaIsy,False,A


In [28]:
# Column roles for poverty_A. Every column is categorical except the two
# ids, the label and the two numeric features named below.
# "country" is not excluded, so it also appears in cat_cols; it is listed
# under id_cols and dropped by the next cell before process_cols runs.
cols_dict = dict(
    id_cols=["id", "iid", "country"],
    cat_cols=[
        name
        for name in data.columns
        if name not in ("id", "iid", "poor", "ukWqmeSS", "OdXpbPGJ")
    ],
    time_cols=[],
)

In [29]:
# Boolean label: True where poor equals 1 (True compares equal to 1).
data["target"] = data["poor"].eq(1)
# Drop the raw label together with the time and id columns (in place, so
# this cell cannot be re-run without reloading `data`).
data.drop(
    columns=["poor", *cols_dict["time_cols"], *cols_dict["id_cols"]],
    inplace=True,
)

In [30]:
# Rename features to cat_<i>/num_<i> and stringify the categorical ones.
data = process_cols(df=data, cols_dict=cols_dict)
data.head()

Tot cat columns :  38


Unnamed: 0,cat_0,cat_1,cat_2,cat_3,cat_4,cat_5,num_6,cat_7,cat_8,cat_9,cat_10,cat_11,cat_12,cat_13,cat_14,cat_15,cat_16,cat_17,cat_18,cat_19,cat_20,cat_21,cat_22,cat_23,cat_24,cat_25,cat_26,cat_27,cat_28,cat_29,cat_30,cat_31,num_32,cat_33,cat_34,cat_35,cat_36,cat_37,cat_38,cat_39,target
0,XJsPz,mOlYV,UFoKR,SSvEP,onRNG,YXCNt,4.0,oArAw,kpkiH,fohru,scxJu,rrHdI,YEngm,pdgUV,YwljV,QkRds,SowpV,XNPgB,CneHb,dpMMl,KOjYm,GIApU,hCKQi,vtkRP,kVYrO,sitaC,VneGw,WNISg,OlSuJ,qmOVd,XQevi,yOwsR,181,QQdHS,uEstx,Hikoa,zQvdC,xUYIC,juMSt,dSJoN,True
1,XJsPz,mOlYV,axSTs,CXizI,onRNG,YXCNt,4.0,ccbZA,HgfUG,fohru,scxJu,rrHdI,EaHvf,pdgUV,YwljV,QkRds,SowpV,XNPgB,CneHb,SjaWF,KOjYm,GIApU,hCKQi,Qydia,OLBHI,sitaC,VneGw,EAWFH,OlSuJ,qmOVd,tbsMf,yOwsR,141,QQdHS,uEstx,Hikoa,zQvdC,xUYIC,juMSt,JTCKs,True
2,TRFeI,mOlYV,axSTs,CXizI,NDnCs,YXCNt,4.0,fOUHD,HgfUG,oEref,bJTYb,kNZZn,zfTDU,olfwp,flBEG,QkRds,QjTos,vSaJn,CneHb,ndArQ,KOjYm,GIApU,AyuSE,Qydia,OLBHI,sitaC,zncPX,EAWFH,OlSuJ,qmOVd,tbsMf,yOwsR,41,QQdHS,gCSRj,Hikoa,zQvdC,rkLqZ,juMSt,JTCKs,True
3,XJsPz,yAyAe,FRcdT,CXizI,onRNG,YXCNt,,fOUHD,HgfUG,fohru,bJTYb,rrHdI,VzUws,olfwp,flBEG,QkRds,nUKzL,vSaJn,vvXmD,ndArQ,KOjYm,GIApU,AyuSE,Qydia,OLBHI,sitaC,rXEFU,aKoLM,OlSuJ,mEGPl,tbsMf,yOwsR,16,QQdHS,uEstx,Hikoa,zQvdC,jVHyH,GtHel,JTCKs,True
4,XJsPz,mOlYV,UFoKR,HIvIU,onRNG,YXCNt,4.0,oArAw,kpkiH,fohru,scxJu,rrHdI,HHynv,IZbuU,YwljV,QkRds,SowpV,OeQKE,CneHb,SjaWF,KOjYm,GIApU,hCKQi,Qydia,sqnlK,sitaC,VneGw,EAWFH,OlSuJ,qmOVd,tbsMf,yOwsR,381,QQdHS,uEstx,Hikoa,zQvdC,xUYIC,juMSt,UaIsy,False


In [31]:
# Print every column's dtype on its own line.
for col, col_dtype in data.dtypes.items():
    print(col, col_dtype)

cat_0 object
cat_1 object
cat_2 object
cat_3 object
cat_4 object
cat_5 object
num_6 float64
cat_7 object
cat_8 object
cat_9 object
cat_10 object
cat_11 object
cat_12 object
cat_13 object
cat_14 object
cat_15 object
cat_16 object
cat_17 object
cat_18 object
cat_19 object
cat_20 object
cat_21 object
cat_22 object
cat_23 object
cat_24 object
cat_25 object
cat_26 object
cat_27 object
cat_28 object
cat_29 object
cat_30 object
cat_31 object
num_32 int64
cat_33 object
cat_34 object
cat_35 object
cat_36 object
cat_37 object
cat_38 object
cat_39 object
target bool


In [32]:
# Persist the processed frame as a gzip-compressed CSV (no index column)
# in the same directory as the raw file.
data.to_csv(
    path_or_buf=f"{data_pth}{dataset_name}.gz",
    index=False,
    compression='gzip',
)

# poverty_B

In [33]:
# Individual-level training CSV for country B lives under
# ../data/poverty_B/; data_pth keeps its trailing slash because later cells
# build file names by concatenation.
dataset_name = "poverty_B"
data_pth = f"../data/{dataset_name}/"
data = pd.read_csv(filepath_or_buffer=f"{data_pth}B_indiv_train.csv")
print(data.shape)
data.head()

(20252, 227)


Unnamed: 0,id,iid,MmKGUOrD,splgBZfe,JTCCBpjX,akGaJaYF,eYoMkEyN,bmhTgzYu,BoxViLPz,GjxOmHgp,qlLzyqpP,mYMBWfIH,SZucUQmM,EfqFGRSF,plDSQDzp,jyZvBZlo,gyxBUEPe,unRAgFtX,lTdxcpBA,yoijDqAm,LkhgHzsx,TJGiunYp,WmKLEUcd,poor,UUYXAsfo,IPBrAaYt,hCgDHGEc,GFXSuQvB,UPtsFglH,HZEsXrsF,kgVoFHRj,pqbNHKuK,gEqzZWXR,UkXhNUcd,hggXZmdr,uyhBLPFL,DYgxQeEi,jfsTwowc,MGfpfHam,hwJpBxAl,BXZfamTt,jkutpUMt,lFkBekuJ,aTEyAdSs,TdfmGIAn,kaVnoUbJ,bngqQTHD,cRQWeYKZ,esHWAAyG,ErqgHOEp,Ftzawcsw,SlqMdbUr,jyLFRgZs,TsredPtf,pFOEMbYu,nOoTxqCQ,xohFWnds,JYKPCImt,wEtnhPQs,LIoTrAzw,IoJgECCi,YwcPaCNf,DtcKwIEv,PgXpVWEU,ENQEzjVj,RFLvRzHv,xhqvDVIs,YhbMdUgd,cPHgLSMk,jBZoYONA,ETgxnJOM,HHbZiXXp,DsHJnzeZ,gKsBCLMY,PwxicsdL,UZLkbGnz,xNhKVNgq,nIgKFReH,FgnMveDS,NdBZKehA,mlyFwTHy,KjMhhJdX,zXonuRgn,MWVlKQoo,mQezyEOr,nPlbGBjS,nmrQVORl,KGtdytLE,DELHTNkd,rZfVzfDA,ZHOSEFIu,ogUWmSXZ,VWMwzteg,HaaXLOfY,NCyUzkfp,nAjgFRel,XZUEzGLM,OFkIQOzS,BqXRUQgi,TZDgOhYY,ywnHMkvq,eKkwQtwX,fwRLKHlN,jUwlkffQ,DAeoUuuN,gsofZtJJ,HQPtUibh,aqYWloSo,sWElQwuC,jzBRbsEG,RUTkmJrl,tQzcGkNE,VszNwfYx,hnwVYJyV,RsMBXisy,MiGIkxpV,xbOnnZEV,cGaHwCJA,dfchpzYE,fwBNLzWK,lLoRmWKX,GhzvKcjl,EpWsJmrF,wpoJgbis,KyTRKtbW,CLTXEwmz,xlJGUBXM,zuvYUDSL,ozvFfhhM,ljaNZowA,sbjoLEKh,pyvZUOyi,FldbnriM,FyXQULwx,WqEZQuJP,UUYYcars,xMysunyA,FOIfEcny,BhPvRtpj,JniMXoSr,AFSIVjPG,lnMlvpes,HrXGKbwr,NUefhhHz,UaUnOdpg,QIQcRKCg,dnmwvCng,yNHqwYaJ,rWlPNhUy,QcuPNLAv,nkxrhykC,zTEEwTzG,NMsLrKWd,iHCKmQSy,DSttkpSI,UPswDJlm,OhKTEjVy,jRXAvmTH,FHoVOXib,sIiSADFG,OATvAQNG,uDmhgsaQ,teSzyTms,hdDTwJhQ,FsUFnmgF,uDVGRBVU,WCMouTCp,VKUGzaxx,tkFVflRJ,qhffLQpJ,dFaxStnz,EzGVhUlW,pDeMFniB,AxEjAWSE,MhvfpnqH,jdYXWZgD,AJgudnHB,iZhWxnWa,LLZtRxnK,CtIAiLwz,NPNqQdIn,qzOZZsuS,fyfDnyQk,vquBDhol,MMWrfMBi,GshWgURK,LSpCPhcU,pHYVhLJD,NocVJZYG,zhbhQewN,wJthinfa,qxVAoxyX,sWADnjxA,zgJdRdgg,rxwugNti,XIHNcHXI,tugScLvF,xTZOLcSX,nxAFXxLQ,eWNUvdjK,mAeaImix,yxQmKfsS,PyTDmsQl,CnQHDRVZ,VxXEkbdS,vVyGhGoB,qRplnnEM,SCLDdPrm,HcutWGCs,GlkLFREL,NAZxDkiV,yjAfWuLb,HZqPmvkr,GlktDEEv,IIvMcTrH,JbDBFPPd,clCRDApV,aYGuxgyP,NZYkmhkD,fxWioPPP,ulQCDoYe,tzYvQeOb,DWmTWc
Um,PxgyaWYq,NfpXxGQk,cavdrXpj,country
0,62801,1,ZZKZW,CLRvF,QEcpz,wmLgk,VsLed,LRmij,,rLPoG,,wBmmA,SYvDi,ExcCa,fzxDF,nTjeS,tEehU,,AwAZH,CJciR,gouHj,,,False,WuYiW,DMMRj,XYMAP,BAepu,ZKHtO,jdddH,MwltS,fWeeW,mUfCv,QhCVe,vteNx,cBaJI,,,-20057.0,jpGgs,OMAba,ZwKYC,RljiF,JqFXv,cPQsP,qXssi,zzQiQ,vGLhB,-1.0,lGbPx,wnWvh,jnMFm,SCNcV,uujhU,IUOva,pyjch,sbRVc,xSJVZ,VprmC,zDRYd,yAfaw,IoMyQ,,lOoVM,AJXyE,gcgvz,aIbya,VMwUL,DHzXF,Aontx,,pygde,xdvtE,-109,scpMR,gLhRL,nRABU,aHxXb,YCDxr,GsGPK,zMlZf,oJJFE,uGmbE,NVWEr,fHGmP,CiPSf,bZaYr,PaSty,ExaxN,sItvx,wrELJ,IUoqV,gxFBa,ENXfH,aMDvF,cCsfg,jmbmU,EFgSK,fvRSg,1052.0,dVQbr,vijmq,ICjTy,ujKUp,kOlhN,zSWWI,UUiGC,MRHGy,,,Urxue,NAvSQ,LwaMz,Baacf,NgOVA,xxPXE,IfRPA,nMWJh,NSfTc,yUuwa,IOBmx,UcqME,KToyu,FxHQQ,ewPOf,,NWLcI,MWWYS,GIMJt,dCjbC,lLRPM,CgQye,kBfAd,OBaph,-2006.0,iKuWQ,ssoMZ,ETNhF,ugfFd,mPuLc,zxuaL,wzOxM,Wsdvj,iNYVL,TbWBG,vxEOa,1,uadNh,GGuOF,ZmwnX,xzhZC,VloRD,qASvW,GriJs,,eKCJh,puFAh,aLYmL,eztBC,,dcLfg,49.0,rrRFz,,LpWKt,utTVH,UVpbm,jUoJv,BplVZ,YGwec,MGxdE,cbuDg,JMirk,rOmBS,tjJhO,xinaM,,,EHjSq,WGkAx,YopdI,yfAST,,elRKB,nHeNd,ILNCl,haUyq,bTxAJ,ZujmJ,qVMHa,-4,QvEVs,GPQFq,dCpjP,IqJgG,HyDNL,kJcMb,BJIIK,,DTzrG,-8.0,bfgeg,wIdgm,lUoRY,ahACm,sOBnN,bywyW,KhlzK,TdcoU,Bovxn,HzgoY,hEagh,26.0,JehJJ,JCGsD,VBmxq,yWVRJ,rlTrR,uCOQO,UYIFp,9,,MAFfK,VnOFM,-7827.0,uJXdA,B
1,62801,2,ZZKZW,CLRvF,QEcpz,wmLgk,VsLed,lfpaq,,rLPoG,,wBmmA,SYvDi,EYUhv,fzxDF,nTjeS,tEehU,,AwAZH,pVzHd,gouHj,,,False,WuYiW,DMMRj,uBqJD,BAepu,ZKHtO,jdddH,MwltS,mJIJq,mUfCv,QhCVe,vteNx,lczKW,,,,jpGgs,OMAba,ZwKYC,RljiF,bXLdG,cPQsP,qXssi,zzQiQ,vGLhB,-1.0,lGbPx,wnWvh,jnMFm,SCNcV,JQNZD,IUOva,pyjch,sbRVc,xSJVZ,VprmC,zDRYd,yAfaw,SflVy,,lOoVM,AJXyE,gcgvz,aIbya,VMwUL,QCFuV,Aontx,,LvUAW,xdvtE,-9,YEKGi,DDjIC,nRABU,aHxXb,YCDxr,GsGPK,zMlZf,oJJFE,uGmbE,NVWEr,fHGmP,CiPSf,bZaYr,XQuSp,ExaxN,sItvx,wrELJ,IUoqV,gxFBa,ENXfH,aMDvF,WomgD,jmbmU,EFgSK,fvRSg,908.0,dVQbr,vijmq,ICjTy,ujKUp,kOlhN,zSWWI,UUiGC,MRHGy,,,Urxue,NAvSQ,LwaMz,Baacf,NgOVA,GwnPj,IfRPA,nMWJh,NSfTc,FzIHG,ijEHl,sqGjf,KToyu,FxHQQ,ewPOf,,ColQA,MWWYS,GIMJt,dCjbC,lLRPM,CgQye,kBfAd,OBaph,-1983.0,iKuWQ,ssoMZ,WRMpA,ugfFd,mPuLc,zxuaL,MkimP,Wsdvj,iNYVL,TbWBG,kVFfF,1,uadNh,kbAmh,ZmwnX,xzhZC,VloRD,qASvW,GriJs,,eKCJh,puFAh,bcpJn,eztBC,,dcLfg,,yhGmw,,LpWKt,utTVH,UVpbm,JqRWC,BplVZ,YGwec,MGxdE,cbuDg,JMirk,rOmBS,JAnza,xinaM,,,EHjSq,WGkAx,YopdI,yfAST,,elRKB,nHeNd,ILNCl,ojvZG,bTxAJ,ZujmJ,qVMHa,-4,QvEVs,GPQFq,dCpjP,IqJgG,HyDNL,kJcMb,BJIIK,,DTzrG,-8.0,bfgeg,wIdgm,lUoRY,ahACm,sOBnN,bywyW,KhlzK,TdcoU,Bovxn,HzgoY,hEagh,,JehJJ,JCGsD,VBmxq,yWVRJ,rlTrR,uCOQO,UYIFp,29,,MAFfK,VnOFM,,uJXdA,B
2,62801,3,ZZKZW,CLRvF,QEcpz,wmLgk,cRkfb,LRmij,-68.0,rLPoG,,mhxNR,SYvDi,EYUhv,fzxDF,nTjeS,ZIcaB,,AwAZH,pVzHd,gouHj,0.0,,False,kxGOb,DMMRj,sTSWL,BAepu,ZKHtO,jdddH,MwltS,mJIJq,mUfCv,QhCVe,vteNx,lczKW,,,,jpGgs,OMAba,ZwKYC,KhKtT,bXLdG,cPQsP,qXssi,zzQiQ,vGLhB,-1.0,lGbPx,wnWvh,jnMFm,SCNcV,JQNZD,IUOva,fKLvO,sbRVc,xSJVZ,LaZkH,smyLf,zSdpY,SflVy,,onbCV,AJXyE,gcgvz,aIbya,KkNYn,QCFuV,hzjkK,,LvUAW,xdvtE,-9,YEKGi,DDjIC,nRABU,aHxXb,YCDxr,GsGPK,zMlZf,oJJFE,PysZH,NVWEr,fHGmP,SjPYj,bZaYr,EBoZt,ExaxN,sItvx,wrELJ,IUoqV,gxFBa,ENXfH,gjpGX,WomgD,jmbmU,EFgSK,fvRSg,1040.0,dVQbr,vijmq,ICjTy,ujKUp,kOlhN,zSWWI,UUiGC,MRHGy,,-4.0,Urxue,NAvSQ,wSDUU,Baacf,NgOVA,hwbar,IfRPA,nMWJh,NSfTc,FzIHG,ijEHl,sqGjf,KToyu,FxHQQ,ewPOf,2.0,CcJnP,JrCwM,GIMJt,dCjbC,lLRPM,CgQye,kBfAd,OBaph,,ropJW,ssoMZ,WRMpA,ugfFd,mPuLc,zxuaL,MkimP,Wsdvj,iNYVL,TbWBG,kVFfF,1,uadNh,kbAmh,ZmwnX,xzhZC,VloRD,qASvW,GriJs,-3.0,eKCJh,puFAh,noMvY,eztBC,,dcLfg,,yhGmw,,tLaqd,utTVH,UVpbm,JqRWC,vAVVy,YGwec,MGxdE,cbuDg,JMirk,rOmBS,JAnza,yVVfU,,,EHjSq,WGkAx,YopdI,yfAST,,elRKB,nHeNd,ILNCl,ojvZG,bTxAJ,qjuXN,qVMHa,-4,QvEVs,LRMxq,dCpjP,IqJgG,HyDNL,kJcMb,BJIIK,,DTzrG,-8.0,bfgeg,NMtVW,lUoRY,EyqjN,sOBnN,IwXNs,qmOxG,aDlJD,kIJMX,HzgoY,hEagh,,JehJJ,hdaYV,VBmxq,yWVRJ,rlTrR,uCOQO,UYIFp,-82,,MAFfK,VnOFM,,uJXdA,B
3,20689,1,ZZKZW,CLRvF,QEcpz,wmLgk,VsLed,LRmij,,rLPoG,,wBmmA,SYvDi,qNDlo,fzxDF,nTjeS,tEehU,,AwAZH,pVzHd,gouHj,,,True,WuYiW,DMMRj,XYMAP,BAepu,ZKHtO,wQNFL,MwltS,mJIJq,mUfCv,QhCVe,vteNx,lczKW,,,,jpGgs,OMAba,ZwKYC,RljiF,bXLdG,cPQsP,qXssi,zzQiQ,vGLhB,-8.0,lGbPx,wnWvh,jnMFm,SCNcV,JQNZD,IUOva,pyjch,sbRVc,xSJVZ,VprmC,zDRYd,yAfaw,SflVy,,lOoVM,AJXyE,gcgvz,aIbya,VMwUL,QCFuV,Aontx,,LvUAW,xdvtE,-119,YEKGi,DDjIC,nRABU,aHxXb,YCDxr,GsGPK,zMlZf,oJJFE,uGmbE,NVWEr,fHGmP,CiPSf,bZaYr,PaSty,ExaxN,sItvx,wrELJ,IUoqV,gxFBa,ENXfH,aMDvF,WomgD,jmbmU,EFgSK,UaNRT,998.0,dVQbr,vijmq,ICjTy,ujKUp,kOlhN,zSWWI,UUiGC,MRHGy,,,Urxue,NAvSQ,LwaMz,Baacf,NgOVA,xxPXE,IfRPA,nMWJh,NSfTc,FzIHG,ijEHl,sqGjf,KToyu,FxHQQ,ewPOf,,rcvDK,MWWYS,GIMJt,dCjbC,lLRPM,CgQye,kIHlc,OBaph,-1979.0,iKuWQ,ssoMZ,WRMpA,ugfFd,mPuLc,zxuaL,MkimP,Wsdvj,iNYVL,TbWBG,muyFb,5,uadNh,kbAmh,ZmwnX,xzhZC,VloRD,qASvW,GriJs,,eKCJh,puFAh,bcpJn,Bjenx,,dcLfg,,yhGmw,,LpWKt,utTVH,Ujfiw,JqRWC,BplVZ,YGwec,MGxdE,cbuDg,JMirk,rOmBS,JAnza,xinaM,,-15833.0,EHjSq,WGkAx,YopdI,yfAST,,elRKB,Sypvt,ILNCl,ojvZG,bTxAJ,ZujmJ,qVMHa,3,QvEVs,GPQFq,dCpjP,IqJgG,HyDNL,kJcMb,BJIIK,,DTzrG,-8.0,bfgeg,wIdgm,lUoRY,ahACm,sOBnN,bywyW,KhlzK,TdcoU,Bovxn,HzgoY,hEagh,,JehJJ,JCGsD,VBmxq,yWVRJ,rlTrR,uCOQO,UYIFp,-6,,MAFfK,ppEcI,-7867.0,uJXdA,B
4,20689,2,ZZKZW,CLRvF,QEcpz,wmLgk,VsLed,lfpaq,,rLPoG,-9.0,wBmmA,SYvDi,EYUhv,QfwOP,nTjeS,ZIcaB,-32.0,AwAZH,pVzHd,gouHj,,,True,kxGOb,DMMRj,sTSWL,BAepu,ZKHtO,GVHhH,MwltS,mJIJq,mUfCv,QhCVe,vteNx,lczKW,,,,jpGgs,OMAba,jItWj,KhKtT,bXLdG,cPQsP,qXssi,YNWtv,vGLhB,-8.0,lGbPx,zgniP,jnMFm,SCNcV,JQNZD,IUOva,pyjch,sbRVc,xSJVZ,VprmC,YcIvR,oejpw,SflVy,8.0,JHnUf,AJXyE,gcgvz,aIbya,VMwUL,QCFuV,IOvtf,,LvUAW,xdvtE,-9,YEKGi,DDjIC,nRABU,einSo,YCDxr,GsGPK,zMlZf,oJJFE,uGmbE,NVWEr,fHGmP,SjPYj,TDgoQ,PaSty,ExaxN,sItvx,wrELJ,mwvLo,gxFBa,ENXfH,aMDvF,WomgD,jmbmU,EFgSK,fkdNn,968.0,dVQbr,vijmq,ICjTy,ujKUp,kOlhN,lVZOx,UUiGC,MRHGy,-24.0,,pIPMN,NAvSQ,wSDUU,Baacf,NgOVA,GwnPj,IfRPA,nMWJh,NSfTc,FzIHG,ijEHl,sqGjf,KToyu,FxHQQ,ewPOf,,rcvDK,MWWYS,GIMJt,dCjbC,lLRPM,CgQye,kIHlc,OBaph,-1974.0,iKuWQ,ssoMZ,WRMpA,ugfFd,mPuLc,zxuaL,MkimP,Wsdvj,iNYVL,TbWBG,kVFfF,5,uadNh,kbAmh,ZmwnX,xzhZC,VloRD,qASvW,GriJs,,sZics,puFAh,bcpJn,Bjenx,,dcLfg,,yhGmw,,oRzdr,bDMtf,UVpbm,JqRWC,BplVZ,wcjbH,MGxdE,cbuDg,JMirk,rOmBS,JAnza,yVVfU,,-15833.0,EHjSq,WGkAx,YopdI,yfAST,4.0,elRKB,Sypvt,Ucdwk,ojvZG,bTxAJ,ZujmJ,qVMHa,3,QvEVs,GPQFq,dCpjP,IqJgG,HyDNL,kJcMb,BJIIK,,DTzrG,-8.0,bfgeg,wIdgm,lUoRY,EyqjN,sOBnN,IwXNs,qmOxG,TdcoU,kIJMX,ASpbn,hEagh,,JehJJ,hdaYV,VBmxq,yWVRJ,rlTrR,uCOQO,UYIFp,19,,MAFfK,ppEcI,-7987.0,uJXdA,B


In [34]:
# Features of poverty_B that are kept numeric; everything else is treated
# as categorical.
num_cols = [
    'BoxViLPz', 'qlLzyqpP', 'unRAgFtX', 'TJGiunYp', 'WmKLEUcd',
    'DYgxQeEi', 'jfsTwowc', 'MGfpfHam', 'esHWAAyG', 'DtcKwIEv',
    'ETgxnJOM', 'gKsBCLMY', 'TZDgOhYY', 'sWElQwuC', 'jzBRbsEG',
    'CLTXEwmz', 'WqEZQuJP', 'dnmwvCng', 'DSttkpSI', 'sIiSADFG',
    'uDmhgsaQ', 'hdDTwJhQ', 'AJgudnHB', 'iZhWxnWa', 'fyfDnyQk',
    'wJthinfa', 'nxAFXxLQ', 'mAeaImix', 'HZqPmvkr', 'ulQCDoYe',
    'tzYvQeOb', 'NfpXxGQk'
]

# Only num_cols are excluded here, so "id", "iid", "poor" and "country"
# also end up in cat_cols; the next cell drops all four before
# process_cols runs.
cols_dict = dict(
    id_cols=["id", "iid", "country"],
    cat_cols=[name for name in data.columns if name not in num_cols],
    time_cols=[],
)

In [35]:
# Boolean label: True where poor equals 1 (True compares equal to 1).
data["target"] = data["poor"].eq(1)
# Drop the raw label together with the time and id columns (in place, so
# this cell cannot be re-run without reloading `data`).
data.drop(
    columns=["poor", *cols_dict["time_cols"], *cols_dict["id_cols"]],
    inplace=True,
)

In [36]:
# Rename features to cat_<i>/num_<i> and stringify the categorical ones.
data = process_cols(df=data, cols_dict=cols_dict)
data.head()

Tot cat columns :  191


Unnamed: 0,cat_0,cat_1,cat_2,cat_3,cat_4,cat_5,num_6,cat_7,num_8,cat_9,cat_10,cat_11,cat_12,cat_13,cat_14,num_15,cat_16,cat_17,cat_18,num_19,num_20,cat_21,cat_22,cat_23,cat_24,cat_25,cat_26,cat_27,cat_28,cat_29,cat_30,cat_31,cat_32,num_33,num_34,num_35,cat_36,cat_37,cat_38,cat_39,cat_40,cat_41,cat_42,cat_43,cat_44,num_45,cat_46,cat_47,cat_48,cat_49,cat_50,cat_51,cat_52,cat_53,cat_54,cat_55,cat_56,cat_57,cat_58,num_59,cat_60,cat_61,cat_62,cat_63,cat_64,cat_65,cat_66,num_67,cat_68,cat_69,num_70,cat_71,cat_72,cat_73,cat_74,cat_75,cat_76,cat_77,cat_78,cat_79,cat_80,cat_81,cat_82,cat_83,cat_84,cat_85,cat_86,cat_87,cat_88,cat_89,cat_90,cat_91,cat_92,cat_93,cat_94,cat_95,num_96,cat_97,cat_98,cat_99,cat_100,cat_101,cat_102,cat_103,cat_104,num_105,num_106,cat_107,cat_108,cat_109,cat_110,cat_111,cat_112,cat_113,cat_114,cat_115,cat_116,cat_117,cat_118,cat_119,cat_120,cat_121,num_122,cat_123,cat_124,cat_125,cat_126,cat_127,cat_128,cat_129,cat_130,num_131,cat_132,cat_133,cat_134,cat_135,cat_136,cat_137,cat_138,cat_139,cat_140,cat_141,cat_142,num_143,cat_144,cat_145,cat_146,cat_147,cat_148,cat_149,cat_150,num_151,cat_152,cat_153,cat_154,cat_155,num_156,cat_157,num_158,cat_159,num_160,cat_161,cat_162,cat_163,cat_164,cat_165,cat_166,cat_167,cat_168,cat_169,cat_170,cat_171,cat_172,num_173,num_174,cat_175,cat_176,cat_177,cat_178,num_179,cat_180,cat_181,cat_182,cat_183,cat_184,cat_185,cat_186,num_187,cat_188,cat_189,cat_190,cat_191,cat_192,cat_193,cat_194,num_195,cat_196,num_197,cat_198,cat_199,cat_200,cat_201,cat_202,cat_203,cat_204,cat_205,cat_206,cat_207,cat_208,num_209,cat_210,cat_211,cat_212,cat_213,cat_214,cat_215,cat_216,num_217,num_218,cat_219,cat_220,num_221,cat_222,target
0,ZZKZW,CLRvF,QEcpz,wmLgk,VsLed,LRmij,,rLPoG,,wBmmA,SYvDi,ExcCa,fzxDF,nTjeS,tEehU,,AwAZH,CJciR,gouHj,,,WuYiW,DMMRj,XYMAP,BAepu,ZKHtO,jdddH,MwltS,fWeeW,mUfCv,QhCVe,vteNx,cBaJI,,,-20057.0,jpGgs,OMAba,ZwKYC,RljiF,JqFXv,cPQsP,qXssi,zzQiQ,vGLhB,-1.0,lGbPx,wnWvh,jnMFm,SCNcV,uujhU,IUOva,pyjch,sbRVc,xSJVZ,VprmC,zDRYd,yAfaw,IoMyQ,,lOoVM,AJXyE,gcgvz,aIbya,VMwUL,DHzXF,Aontx,,pygde,xdvtE,-109,scpMR,gLhRL,nRABU,aHxXb,YCDxr,GsGPK,zMlZf,oJJFE,uGmbE,NVWEr,fHGmP,CiPSf,bZaYr,PaSty,ExaxN,sItvx,wrELJ,IUoqV,gxFBa,ENXfH,aMDvF,cCsfg,jmbmU,EFgSK,fvRSg,1052.0,dVQbr,vijmq,ICjTy,ujKUp,kOlhN,zSWWI,UUiGC,MRHGy,,,Urxue,NAvSQ,LwaMz,Baacf,NgOVA,xxPXE,IfRPA,nMWJh,NSfTc,yUuwa,IOBmx,UcqME,KToyu,FxHQQ,ewPOf,,NWLcI,MWWYS,GIMJt,dCjbC,lLRPM,CgQye,kBfAd,OBaph,-2006.0,iKuWQ,ssoMZ,ETNhF,ugfFd,mPuLc,zxuaL,wzOxM,Wsdvj,iNYVL,TbWBG,vxEOa,1,uadNh,GGuOF,ZmwnX,xzhZC,VloRD,qASvW,GriJs,,eKCJh,puFAh,aLYmL,eztBC,,dcLfg,49.0,rrRFz,,LpWKt,utTVH,UVpbm,jUoJv,BplVZ,YGwec,MGxdE,cbuDg,JMirk,rOmBS,tjJhO,xinaM,,,EHjSq,WGkAx,YopdI,yfAST,,elRKB,nHeNd,ILNCl,haUyq,bTxAJ,ZujmJ,qVMHa,-4,QvEVs,GPQFq,dCpjP,IqJgG,HyDNL,kJcMb,BJIIK,,DTzrG,-8.0,bfgeg,wIdgm,lUoRY,ahACm,sOBnN,bywyW,KhlzK,TdcoU,Bovxn,HzgoY,hEagh,26.0,JehJJ,JCGsD,VBmxq,yWVRJ,rlTrR,uCOQO,UYIFp,9,,MAFfK,VnOFM,-7827.0,uJXdA,False
1,ZZKZW,CLRvF,QEcpz,wmLgk,VsLed,lfpaq,,rLPoG,,wBmmA,SYvDi,EYUhv,fzxDF,nTjeS,tEehU,,AwAZH,pVzHd,gouHj,,,WuYiW,DMMRj,uBqJD,BAepu,ZKHtO,jdddH,MwltS,mJIJq,mUfCv,QhCVe,vteNx,lczKW,,,,jpGgs,OMAba,ZwKYC,RljiF,bXLdG,cPQsP,qXssi,zzQiQ,vGLhB,-1.0,lGbPx,wnWvh,jnMFm,SCNcV,JQNZD,IUOva,pyjch,sbRVc,xSJVZ,VprmC,zDRYd,yAfaw,SflVy,,lOoVM,AJXyE,gcgvz,aIbya,VMwUL,QCFuV,Aontx,,LvUAW,xdvtE,-9,YEKGi,DDjIC,nRABU,aHxXb,YCDxr,GsGPK,zMlZf,oJJFE,uGmbE,NVWEr,fHGmP,CiPSf,bZaYr,XQuSp,ExaxN,sItvx,wrELJ,IUoqV,gxFBa,ENXfH,aMDvF,WomgD,jmbmU,EFgSK,fvRSg,908.0,dVQbr,vijmq,ICjTy,ujKUp,kOlhN,zSWWI,UUiGC,MRHGy,,,Urxue,NAvSQ,LwaMz,Baacf,NgOVA,GwnPj,IfRPA,nMWJh,NSfTc,FzIHG,ijEHl,sqGjf,KToyu,FxHQQ,ewPOf,,ColQA,MWWYS,GIMJt,dCjbC,lLRPM,CgQye,kBfAd,OBaph,-1983.0,iKuWQ,ssoMZ,WRMpA,ugfFd,mPuLc,zxuaL,MkimP,Wsdvj,iNYVL,TbWBG,kVFfF,1,uadNh,kbAmh,ZmwnX,xzhZC,VloRD,qASvW,GriJs,,eKCJh,puFAh,bcpJn,eztBC,,dcLfg,,yhGmw,,LpWKt,utTVH,UVpbm,JqRWC,BplVZ,YGwec,MGxdE,cbuDg,JMirk,rOmBS,JAnza,xinaM,,,EHjSq,WGkAx,YopdI,yfAST,,elRKB,nHeNd,ILNCl,ojvZG,bTxAJ,ZujmJ,qVMHa,-4,QvEVs,GPQFq,dCpjP,IqJgG,HyDNL,kJcMb,BJIIK,,DTzrG,-8.0,bfgeg,wIdgm,lUoRY,ahACm,sOBnN,bywyW,KhlzK,TdcoU,Bovxn,HzgoY,hEagh,,JehJJ,JCGsD,VBmxq,yWVRJ,rlTrR,uCOQO,UYIFp,29,,MAFfK,VnOFM,,uJXdA,False
2,ZZKZW,CLRvF,QEcpz,wmLgk,cRkfb,LRmij,-68.0,rLPoG,,mhxNR,SYvDi,EYUhv,fzxDF,nTjeS,ZIcaB,,AwAZH,pVzHd,gouHj,0.0,,kxGOb,DMMRj,sTSWL,BAepu,ZKHtO,jdddH,MwltS,mJIJq,mUfCv,QhCVe,vteNx,lczKW,,,,jpGgs,OMAba,ZwKYC,KhKtT,bXLdG,cPQsP,qXssi,zzQiQ,vGLhB,-1.0,lGbPx,wnWvh,jnMFm,SCNcV,JQNZD,IUOva,fKLvO,sbRVc,xSJVZ,LaZkH,smyLf,zSdpY,SflVy,,onbCV,AJXyE,gcgvz,aIbya,KkNYn,QCFuV,hzjkK,,LvUAW,xdvtE,-9,YEKGi,DDjIC,nRABU,aHxXb,YCDxr,GsGPK,zMlZf,oJJFE,PysZH,NVWEr,fHGmP,SjPYj,bZaYr,EBoZt,ExaxN,sItvx,wrELJ,IUoqV,gxFBa,ENXfH,gjpGX,WomgD,jmbmU,EFgSK,fvRSg,1040.0,dVQbr,vijmq,ICjTy,ujKUp,kOlhN,zSWWI,UUiGC,MRHGy,,-4.0,Urxue,NAvSQ,wSDUU,Baacf,NgOVA,hwbar,IfRPA,nMWJh,NSfTc,FzIHG,ijEHl,sqGjf,KToyu,FxHQQ,ewPOf,2.0,CcJnP,JrCwM,GIMJt,dCjbC,lLRPM,CgQye,kBfAd,OBaph,,ropJW,ssoMZ,WRMpA,ugfFd,mPuLc,zxuaL,MkimP,Wsdvj,iNYVL,TbWBG,kVFfF,1,uadNh,kbAmh,ZmwnX,xzhZC,VloRD,qASvW,GriJs,-3.0,eKCJh,puFAh,noMvY,eztBC,,dcLfg,,yhGmw,,tLaqd,utTVH,UVpbm,JqRWC,vAVVy,YGwec,MGxdE,cbuDg,JMirk,rOmBS,JAnza,yVVfU,,,EHjSq,WGkAx,YopdI,yfAST,,elRKB,nHeNd,ILNCl,ojvZG,bTxAJ,qjuXN,qVMHa,-4,QvEVs,LRMxq,dCpjP,IqJgG,HyDNL,kJcMb,BJIIK,,DTzrG,-8.0,bfgeg,NMtVW,lUoRY,EyqjN,sOBnN,IwXNs,qmOxG,aDlJD,kIJMX,HzgoY,hEagh,,JehJJ,hdaYV,VBmxq,yWVRJ,rlTrR,uCOQO,UYIFp,-82,,MAFfK,VnOFM,,uJXdA,False
3,ZZKZW,CLRvF,QEcpz,wmLgk,VsLed,LRmij,,rLPoG,,wBmmA,SYvDi,qNDlo,fzxDF,nTjeS,tEehU,,AwAZH,pVzHd,gouHj,,,WuYiW,DMMRj,XYMAP,BAepu,ZKHtO,wQNFL,MwltS,mJIJq,mUfCv,QhCVe,vteNx,lczKW,,,,jpGgs,OMAba,ZwKYC,RljiF,bXLdG,cPQsP,qXssi,zzQiQ,vGLhB,-8.0,lGbPx,wnWvh,jnMFm,SCNcV,JQNZD,IUOva,pyjch,sbRVc,xSJVZ,VprmC,zDRYd,yAfaw,SflVy,,lOoVM,AJXyE,gcgvz,aIbya,VMwUL,QCFuV,Aontx,,LvUAW,xdvtE,-119,YEKGi,DDjIC,nRABU,aHxXb,YCDxr,GsGPK,zMlZf,oJJFE,uGmbE,NVWEr,fHGmP,CiPSf,bZaYr,PaSty,ExaxN,sItvx,wrELJ,IUoqV,gxFBa,ENXfH,aMDvF,WomgD,jmbmU,EFgSK,UaNRT,998.0,dVQbr,vijmq,ICjTy,ujKUp,kOlhN,zSWWI,UUiGC,MRHGy,,,Urxue,NAvSQ,LwaMz,Baacf,NgOVA,xxPXE,IfRPA,nMWJh,NSfTc,FzIHG,ijEHl,sqGjf,KToyu,FxHQQ,ewPOf,,rcvDK,MWWYS,GIMJt,dCjbC,lLRPM,CgQye,kIHlc,OBaph,-1979.0,iKuWQ,ssoMZ,WRMpA,ugfFd,mPuLc,zxuaL,MkimP,Wsdvj,iNYVL,TbWBG,muyFb,5,uadNh,kbAmh,ZmwnX,xzhZC,VloRD,qASvW,GriJs,,eKCJh,puFAh,bcpJn,Bjenx,,dcLfg,,yhGmw,,LpWKt,utTVH,Ujfiw,JqRWC,BplVZ,YGwec,MGxdE,cbuDg,JMirk,rOmBS,JAnza,xinaM,,-15833.0,EHjSq,WGkAx,YopdI,yfAST,,elRKB,Sypvt,ILNCl,ojvZG,bTxAJ,ZujmJ,qVMHa,3,QvEVs,GPQFq,dCpjP,IqJgG,HyDNL,kJcMb,BJIIK,,DTzrG,-8.0,bfgeg,wIdgm,lUoRY,ahACm,sOBnN,bywyW,KhlzK,TdcoU,Bovxn,HzgoY,hEagh,,JehJJ,JCGsD,VBmxq,yWVRJ,rlTrR,uCOQO,UYIFp,-6,,MAFfK,ppEcI,-7867.0,uJXdA,True
4,ZZKZW,CLRvF,QEcpz,wmLgk,VsLed,lfpaq,,rLPoG,-9.0,wBmmA,SYvDi,EYUhv,QfwOP,nTjeS,ZIcaB,-32.0,AwAZH,pVzHd,gouHj,,,kxGOb,DMMRj,sTSWL,BAepu,ZKHtO,GVHhH,MwltS,mJIJq,mUfCv,QhCVe,vteNx,lczKW,,,,jpGgs,OMAba,jItWj,KhKtT,bXLdG,cPQsP,qXssi,YNWtv,vGLhB,-8.0,lGbPx,zgniP,jnMFm,SCNcV,JQNZD,IUOva,pyjch,sbRVc,xSJVZ,VprmC,YcIvR,oejpw,SflVy,8.0,JHnUf,AJXyE,gcgvz,aIbya,VMwUL,QCFuV,IOvtf,,LvUAW,xdvtE,-9,YEKGi,DDjIC,nRABU,einSo,YCDxr,GsGPK,zMlZf,oJJFE,uGmbE,NVWEr,fHGmP,SjPYj,TDgoQ,PaSty,ExaxN,sItvx,wrELJ,mwvLo,gxFBa,ENXfH,aMDvF,WomgD,jmbmU,EFgSK,fkdNn,968.0,dVQbr,vijmq,ICjTy,ujKUp,kOlhN,lVZOx,UUiGC,MRHGy,-24.0,,pIPMN,NAvSQ,wSDUU,Baacf,NgOVA,GwnPj,IfRPA,nMWJh,NSfTc,FzIHG,ijEHl,sqGjf,KToyu,FxHQQ,ewPOf,,rcvDK,MWWYS,GIMJt,dCjbC,lLRPM,CgQye,kIHlc,OBaph,-1974.0,iKuWQ,ssoMZ,WRMpA,ugfFd,mPuLc,zxuaL,MkimP,Wsdvj,iNYVL,TbWBG,kVFfF,5,uadNh,kbAmh,ZmwnX,xzhZC,VloRD,qASvW,GriJs,,sZics,puFAh,bcpJn,Bjenx,,dcLfg,,yhGmw,,oRzdr,bDMtf,UVpbm,JqRWC,BplVZ,wcjbH,MGxdE,cbuDg,JMirk,rOmBS,JAnza,yVVfU,,-15833.0,EHjSq,WGkAx,YopdI,yfAST,4.0,elRKB,Sypvt,Ucdwk,ojvZG,bTxAJ,ZujmJ,qVMHa,3,QvEVs,GPQFq,dCpjP,IqJgG,HyDNL,kJcMb,BJIIK,,DTzrG,-8.0,bfgeg,wIdgm,lUoRY,EyqjN,sOBnN,IwXNs,qmOxG,TdcoU,kIJMX,ASpbn,hEagh,,JehJJ,hdaYV,VBmxq,yWVRJ,rlTrR,uCOQO,UYIFp,19,,MAFfK,ppEcI,-7987.0,uJXdA,True


In [37]:
# Print every column's dtype on its own line.
for col, col_dtype in data.dtypes.items():
    print(col, col_dtype)

cat_0 object
cat_1 object
cat_2 object
cat_3 object
cat_4 object
cat_5 object
num_6 float64
cat_7 object
num_8 float64
cat_9 object
cat_10 object
cat_11 object
cat_12 object
cat_13 object
cat_14 object
num_15 float64
cat_16 object
cat_17 object
cat_18 object
num_19 float64
num_20 float64
cat_21 object
cat_22 object
cat_23 object
cat_24 object
cat_25 object
cat_26 object
cat_27 object
cat_28 object
cat_29 object
cat_30 object
cat_31 object
cat_32 object
num_33 float64
num_34 float64
num_35 float64
cat_36 object
cat_37 object
cat_38 object
cat_39 object
cat_40 object
cat_41 object
cat_42 object
cat_43 object
cat_44 object
num_45 float64
cat_46 object
cat_47 object
cat_48 object
cat_49 object
cat_50 object
cat_51 object
cat_52 object
cat_53 object
cat_54 object
cat_55 object
cat_56 object
cat_57 object
cat_58 object
num_59 float64
cat_60 object
cat_61 object
cat_62 object
cat_63 object
cat_64 object
cat_65 object
cat_66 object
num_67 float64
cat_68 object
cat_69 object
num_70 int64
cat_71

In [38]:
# Persist the processed frame as a gzip-compressed CSV (no index column)
# in the same directory as the raw file.
data.to_csv(
    path_or_buf=f"{data_pth}{dataset_name}.gz",
    index=False,
    compression='gzip',
)

# poverty_C

In [39]:
# Individual-level training CSV for country C lives under
# ../data/poverty_C/; data_pth keeps its trailing slash because later cells
# build file names by concatenation.
dataset_name = "poverty_C"
data_pth = f"../data/{dataset_name}/"
data = pd.read_csv(filepath_or_buffer=f"{data_pth}C_indiv_train.csv")
print(data.shape)
data.head()

(29913, 44)


Unnamed: 0,id,iid,OoqEwyJF,cJPCnaAs,vIUDQhEE,cRHfBOQd,dAmhsMgC,skzSFBmO,IsqnlKAj,AOSWkWKB,IThGOCux,gBTGbFhG,XKQWlRjk,CggtnNTN,TfEUOijE,tOfmAJyI,vWNISgEA,FRcdTUFo,wXJsPzSl,bsMfXBld,EaHvfzfT,XDnwpOpR,uOlSuJDG,CjuMStGt,ymHYOZZv,eXdISXQe,WqmeSStq,XKyOwsRR,XDzQvdCw,CgAkQtOd,UTyHNliH,poor,gxCmCVUk,qGqYslGF,dArQTUaf,NAxEQZVi,ShCKQiAy,rkLqZrQW,VGJlUgVG,kMVbipfP,sCTSWhXf,rVneGwzn,uVFOfrpa,country
0,30639,1,cEcbt,QhmHR,lJvCX,sgjYl,IpTiG,WFHaK,GpnOQ,nLEWH,PVIia,amOeQ,122,RxYsa,ucqiX,mlNXN,9,efxMi,cPXrX,9.0,xHsSQ,DUVzU,HelFU,lYVyA,GHDuu,dxzZA,XvXON,-3,PBUwt,-7.0,hHSXB,False,uSErG,SDNoA,iRUzd,Rihyc,INYbJ,SoOdX,VlcEt,zzxBZ,yQhuJ,xgpHA,DnIbO,C
1,30639,2,cEcbt,QhmHR,lJvCX,sgjYl,IpTiG,WFHaK,UaIsy,nLEWH,MyWVa,amOeQ,68,RxYsa,AzTqj,mlNXN,9,toWkd,cPXrX,9.0,xHsSQ,DUVzU,HelFU,lYVyA,GHDuu,dxzZA,DGWjH,-3,PBUwt,-7.0,hHSXB,False,uSErG,SDNoA,VYpjn,Rihyc,TYbsc,SoOdX,VlcEt,zzxBZ,yQhuJ,xgpHA,DnIbO,C
2,30639,3,cEcbt,cMeuH,lJvCX,sgjYl,IpTiG,WFHaK,JTCKs,nLEWH,ehUOC,KEvSa,1,hAGot,ISjaW,SSvEP,9,NwYCR,EFUMy,9.0,xHsSQ,DUVzU,HelFU,lYVyA,GHDuu,dxzZA,XvXON,-3,PBUwt,-7.0,FbTRU,False,wNFGx,SDNoA,iQpoG,GkrMH,xJurw,pbPGJ,YYwlj,rPkFE,yQhuJ,ldKFc,kXobL,C
3,30639,9,cEcbt,uSuzR,lJvCX,GpRit,uEstx,WFHaK,JTCKs,nLEWH,UrHEJ,JnveI,1,hAGot,ISjaW,SSvEP,104,NwYCR,cPXrX,-19.2,yFSGe,nvyhU,HelFU,lYVyA,Engma,dxzZA,XvXON,249,bgZsP,-53.8699,XFrNG,False,wNFGx,SDNoA,iQpoG,Rihyc,iuiyo,SoOdX,YYwlj,zzxBZ,yQhuJ,QGHnL,xRxWC,C
4,30639,10,cEcbt,uSuzR,lJvCX,GpRit,uEstx,WFHaK,JTCKs,nLEWH,UrHEJ,JnveI,1,hAGot,ISjaW,mlNXN,113,NwYCR,cPXrX,-27.0,yFSGe,nvyhU,HelFU,lYVyA,Engma,dxzZA,XvXON,237,bgZsP,-56.926,XFrNG,False,wNFGx,SDNoA,iQpoG,Rihyc,iuiyo,SoOdX,YYwlj,zzxBZ,yQhuJ,QGHnL,xRxWC,C


In [40]:
# Column roles for poverty_C.
# cat_cols is every column of `data` except the names excluded below
# (the two ids, the `poor` label and five further columns).
cols_dict = dict(
    id_cols=["id", "iid", "country"],
    cat_cols=[
        col
        for col in data.columns
        if col not in ("id", "iid", "poor", "XKQWlRjk", "vWNISgEA", "bsMfXBld", "XKyOwsRR", "CgAkQtOd")
    ],
    time_cols=[],
)

In [41]:
# Build the boolean label from `poor`, then drop the raw label plus any time/id columns in place.
data["target"] = data["poor"].eq(1)
data.drop(columns=["poor", *cols_dict["time_cols"], *cols_dict["id_cols"]], inplace=True)

In [42]:
# Rename features to cat_<i>/num_<i> and cast categoricals to str via process_cols, then preview.
data = process_cols(df=data, cols_dict=cols_dict)
data.head(n=5)

Tot cat columns :  35


Unnamed: 0,cat_0,cat_1,cat_2,cat_3,cat_4,cat_5,cat_6,cat_7,cat_8,cat_9,num_10,cat_11,cat_12,cat_13,num_14,cat_15,cat_16,num_17,cat_18,cat_19,cat_20,cat_21,cat_22,cat_23,cat_24,num_25,cat_26,num_27,cat_28,cat_29,cat_30,cat_31,cat_32,cat_33,cat_34,cat_35,cat_36,cat_37,cat_38,cat_39,target
0,cEcbt,QhmHR,lJvCX,sgjYl,IpTiG,WFHaK,GpnOQ,nLEWH,PVIia,amOeQ,122,RxYsa,ucqiX,mlNXN,9,efxMi,cPXrX,9.0,xHsSQ,DUVzU,HelFU,lYVyA,GHDuu,dxzZA,XvXON,-3,PBUwt,-7.0,hHSXB,uSErG,SDNoA,iRUzd,Rihyc,INYbJ,SoOdX,VlcEt,zzxBZ,yQhuJ,xgpHA,DnIbO,False
1,cEcbt,QhmHR,lJvCX,sgjYl,IpTiG,WFHaK,UaIsy,nLEWH,MyWVa,amOeQ,68,RxYsa,AzTqj,mlNXN,9,toWkd,cPXrX,9.0,xHsSQ,DUVzU,HelFU,lYVyA,GHDuu,dxzZA,DGWjH,-3,PBUwt,-7.0,hHSXB,uSErG,SDNoA,VYpjn,Rihyc,TYbsc,SoOdX,VlcEt,zzxBZ,yQhuJ,xgpHA,DnIbO,False
2,cEcbt,cMeuH,lJvCX,sgjYl,IpTiG,WFHaK,JTCKs,nLEWH,ehUOC,KEvSa,1,hAGot,ISjaW,SSvEP,9,NwYCR,EFUMy,9.0,xHsSQ,DUVzU,HelFU,lYVyA,GHDuu,dxzZA,XvXON,-3,PBUwt,-7.0,FbTRU,wNFGx,SDNoA,iQpoG,GkrMH,xJurw,pbPGJ,YYwlj,rPkFE,yQhuJ,ldKFc,kXobL,False
3,cEcbt,uSuzR,lJvCX,GpRit,uEstx,WFHaK,JTCKs,nLEWH,UrHEJ,JnveI,1,hAGot,ISjaW,SSvEP,104,NwYCR,cPXrX,-19.2,yFSGe,nvyhU,HelFU,lYVyA,Engma,dxzZA,XvXON,249,bgZsP,-53.8699,XFrNG,wNFGx,SDNoA,iQpoG,Rihyc,iuiyo,SoOdX,YYwlj,zzxBZ,yQhuJ,QGHnL,xRxWC,False
4,cEcbt,uSuzR,lJvCX,GpRit,uEstx,WFHaK,JTCKs,nLEWH,UrHEJ,JnveI,1,hAGot,ISjaW,mlNXN,113,NwYCR,cPXrX,-27.0,yFSGe,nvyhU,HelFU,lYVyA,Engma,dxzZA,XvXON,237,bgZsP,-56.926,XFrNG,wNFGx,SDNoA,iQpoG,Rihyc,iuiyo,SoOdX,YYwlj,zzxBZ,yQhuJ,QGHnL,xRxWC,False


In [43]:
# Print each column name next to its pandas dtype, one column per line.
for col_name, col_dtype in data.dtypes.items():
    print(col_name, col_dtype)

cat_0 object
cat_1 object
cat_2 object
cat_3 object
cat_4 object
cat_5 object
cat_6 object
cat_7 object
cat_8 object
cat_9 object
num_10 int64
cat_11 object
cat_12 object
cat_13 object
num_14 int64
cat_15 object
cat_16 object
num_17 float64
cat_18 object
cat_19 object
cat_20 object
cat_21 object
cat_22 object
cat_23 object
cat_24 object
num_25 int64
cat_26 object
num_27 float64
cat_28 object
cat_29 object
cat_30 object
cat_31 object
cat_32 object
cat_33 object
cat_34 object
cat_35 object
cat_36 object
cat_37 object
cat_38 object
cat_39 object
target bool


In [44]:
# Write the processed frame to "<data_pth><dataset_name>.gz" as gzip-compressed CSV, without the index.
data.to_csv(path_or_buf=f"{data_pth}{dataset_name}.gz", index=False, compression="gzip")

# credit

In [45]:
# https://www.kaggle.com/c/home-credit-default-risk/data

In [46]:
# Select the credit dataset folder and read the zipped application_train CSV from it.
dataset_name = "credit"
data_pth = f"../data/{dataset_name}/"
data = pd.read_csv(filepath_or_buffer=f"{data_pth}application_train.csv.zip")
# Show (rows, columns) of the raw frame, then preview the first rows.
print(data.shape)
data.head(n=5)

(307511, 122)


Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_TYPE_SUITE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,ORGANIZATION_TYPE,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,APARTMENTS_AVG,BASEMENTAREA_AVG,YEARS_BEGINEXPLUATATION_AVG,YEARS_BUILD_AVG,COMMONAREA_AVG,ELEVATORS_AVG,ENTRANCES_AVG,FLOORSMAX_AVG,FLOORSMIN_AVG,LANDAREA_AVG,LIVINGAPARTMENTS_AVG,LIVINGAREA_AVG,NONLIVINGAPARTMENTS_AVG,NONLIVINGAREA_AVG,APARTMENTS_MODE,BASEMENTAREA_MODE,YEARS_BEGINEXPLUATATION_MODE,YEARS_BUILD_MODE,COMMONAREA_MODE,ELEVATORS_MODE,ENTRANCES_MODE,FLOORSMAX_MODE,FLOORSMIN_MODE,LANDAREA_MODE,LIVINGAPARTMENTS_MODE,LIVINGAREA_MODE,NONLIVINGAPARTMENTS_MODE,NONLIVINGAREA_MODE,APARTMENTS_MEDI,BASEMENTAREA_MEDI,YEARS_BEGINEXPLUATATION_MEDI,YEARS_BUILD_MEDI,COMMONAREA_MEDI,ELEVATORS_MEDI,ENTRANCES_MEDI,FLOORSMAX_MEDI,FLOORSMIN_MEDI,LANDAREA_MEDI,LIVINGAPARTMENTS_MEDI,LIVINGAREA_MEDI,NONLIVINGAPARTMENTS_MEDI,NONLIVINGAREA_MEDI,FONDKAPREMONT_MODE,HOUSETYPE_MODE,TOTALAREA_MODE,WALLSMATERIAL_MODE,EMERGENCYSTATE_MODE,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_2,FLAG_DOCUMENT_3,FLAG_DOCUMENT_4,FLAG_DOCUMENT_5,FLAG_DOCUMENT_6,FLAG_DOCUMENT_7,FLAG_DOCUMENT_8,FLAG_DOCUMENT_9,FLAG_DOCUMENT_10,FLAG_DOCUMENT_11,FLAG_DOCUMENT_12,FLAG_DOCUMENT_13,FLAG_DOCUMENT_14,FLAG_DOCUMENT_15,FLAG_DOCUMENT_16,FLAG_DOCUMENT_17,FLAG_DOCUMENT_1
8,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,351000.0,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,0.018801,-9461,-637,-3648.0,-2120,,1,1,0,1,1,0,Laborers,1.0,2,2,WEDNESDAY,10,0,0,0,0,0,0,Business Entity Type 3,0.083037,0.262949,0.139376,0.0247,0.0369,0.9722,0.6192,0.0143,0.0,0.069,0.0833,0.125,0.0369,0.0202,0.019,0.0,0.0,0.0252,0.0383,0.9722,0.6341,0.0144,0.0,0.069,0.0833,0.125,0.0377,0.022,0.0198,0.0,0.0,0.025,0.0369,0.9722,0.6243,0.0144,0.0,0.069,0.0833,0.125,0.0375,0.0205,0.0193,0.0,0.0,reg oper account,block of flats,0.0149,"Stone, brick",No,2.0,2.0,2.0,2.0,-1134.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,1129500.0,Family,State servant,Higher education,Married,House / apartment,0.003541,-16765,-1188,-1186.0,-291,,1,1,0,1,1,0,Core staff,2.0,1,1,MONDAY,11,0,0,0,0,0,0,School,0.311267,0.622246,,0.0959,0.0529,0.9851,0.796,0.0605,0.08,0.0345,0.2917,0.3333,0.013,0.0773,0.0549,0.0039,0.0098,0.0924,0.0538,0.9851,0.804,0.0497,0.0806,0.0345,0.2917,0.3333,0.0128,0.079,0.0554,0.0,0.0,0.0968,0.0529,0.9851,0.7987,0.0608,0.08,0.0345,0.2917,0.3333,0.0132,0.0787,0.0558,0.0039,0.01,reg oper account,block of flats,0.0714,Block,No,1.0,0.0,1.0,0.0,-828.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,135000.0,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,0.010032,-19046,-225,-4260.0,-2531,26.0,1,1,1,1,1,0,Laborers,1.0,2,2,MONDAY,9,0,0,0,0,0,0,Government,,0.555912,0.729567,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,-815.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,297000.0,Unaccompanied,Working,Secondary / secondary special,Civil marriage,House / apartment,0.008019,-19005,-3039,-9833.0,-2437,,1,1,0,1,0,0,Laborers,2.0,2,2,WEDNESDAY,17,0,0,0,0,0,0,Business Entity Type 3,,0.650442,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,0.0,2.0,0.0,-617.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,513000.0,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,0.028663,-19932,-3038,-4311.0,-3458,,1,1,0,1,0,0,Core staff,1.0,2,2,THURSDAY,11,0,0,0,0,1,1,Religion,,0.322738,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,-1106.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [47]:
# Column roles for the credit dataset:
#   id_cols   - identifier columns, dropped before processing
#   cat_cols  - features to treat as categorical (each name listed exactly once)
#   time_cols - date/time columns, dropped before processing (none here)
cols_dict = {
    "id_cols": ["SK_ID_CURR"],
    "cat_cols": ['NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY',
                 "REGION_RATING_CLIENT", "REGION_RATING_CLIENT_W_CITY",
                 'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE',
                 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE',
                 'WEEKDAY_APPR_PROCESS_START', 'ORGANIZATION_TYPE', 'FONDKAPREMONT_MODE',
                 'HOUSETYPE_MODE', 'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE'],
    "time_cols": []
}

In [48]:
# Build the boolean label from `TARGET`, then drop the raw label plus any time/id columns in place.
data["target"] = data["TARGET"].eq(1)
data.drop(columns=["TARGET", *cols_dict["time_cols"], *cols_dict["id_cols"]], inplace=True)

In [49]:
# Rename features to cat_<i>/num_<i> and cast categoricals to str via process_cols, then preview.
data = process_cols(df=data, cols_dict=cols_dict)
data.head(n=5)

Tot cat columns :  18


Unnamed: 0,cat_0,cat_1,cat_2,cat_3,num_4,num_5,num_6,num_7,num_8,cat_9,cat_10,cat_11,cat_12,cat_13,num_14,num_15,num_16,num_17,num_18,num_19,num_20,num_21,num_22,num_23,num_24,num_25,cat_26,num_27,cat_28,cat_29,cat_30,num_31,num_32,num_33,num_34,num_35,num_36,num_37,cat_38,num_39,num_40,num_41,num_42,num_43,num_44,num_45,num_46,num_47,num_48,num_49,num_50,num_51,num_52,num_53,num_54,num_55,num_56,num_57,num_58,num_59,num_60,num_61,num_62,num_63,num_64,num_65,num_66,num_67,num_68,num_69,num_70,num_71,num_72,num_73,num_74,num_75,num_76,num_77,num_78,num_79,num_80,num_81,num_82,num_83,cat_84,cat_85,num_86,cat_87,cat_88,num_89,num_90,num_91,num_92,num_93,num_94,num_95,num_96,num_97,num_98,num_99,num_100,num_101,num_102,num_103,num_104,num_105,num_106,num_107,num_108,num_109,num_110,num_111,num_112,num_113,num_114,num_115,num_116,num_117,num_118,num_119,target
0,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,351000.0,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,0.018801,-9461,-637,-3648.0,-2120,,1,1,0,1,1,0,Laborers,1.0,2,2,WEDNESDAY,10,0,0,0,0,0,0,Business Entity Type 3,0.083037,0.262949,0.139376,0.0247,0.0369,0.9722,0.6192,0.0143,0.0,0.069,0.0833,0.125,0.0369,0.0202,0.019,0.0,0.0,0.0252,0.0383,0.9722,0.6341,0.0144,0.0,0.069,0.0833,0.125,0.0377,0.022,0.0198,0.0,0.0,0.025,0.0369,0.9722,0.6243,0.0144,0.0,0.069,0.0833,0.125,0.0375,0.0205,0.0193,0.0,0.0,reg oper account,block of flats,0.0149,"Stone, brick",No,2.0,2.0,2.0,2.0,-1134.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0,True
1,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,1129500.0,Family,State servant,Higher education,Married,House / apartment,0.003541,-16765,-1188,-1186.0,-291,,1,1,0,1,1,0,Core staff,2.0,1,1,MONDAY,11,0,0,0,0,0,0,School,0.311267,0.622246,,0.0959,0.0529,0.9851,0.796,0.0605,0.08,0.0345,0.2917,0.3333,0.013,0.0773,0.0549,0.0039,0.0098,0.0924,0.0538,0.9851,0.804,0.0497,0.0806,0.0345,0.2917,0.3333,0.0128,0.079,0.0554,0.0,0.0,0.0968,0.0529,0.9851,0.7987,0.0608,0.08,0.0345,0.2917,0.3333,0.0132,0.0787,0.0558,0.0039,0.01,reg oper account,block of flats,0.0714,Block,No,1.0,0.0,1.0,0.0,-828.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,False
2,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,135000.0,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,0.010032,-19046,-225,-4260.0,-2531,26.0,1,1,1,1,1,0,Laborers,1.0,2,2,MONDAY,9,0,0,0,0,0,0,Government,,0.555912,0.729567,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,new_category_nan,new_category_nan,,new_category_nan,new_category_nan,0.0,0.0,0.0,0.0,-815.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,False
3,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,297000.0,Unaccompanied,Working,Secondary / secondary special,Civil marriage,House / apartment,0.008019,-19005,-3039,-9833.0,-2437,,1,1,0,1,0,0,Laborers,2.0,2,2,WEDNESDAY,17,0,0,0,0,0,0,Business Entity Type 3,,0.650442,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,new_category_nan,new_category_nan,,new_category_nan,new_category_nan,2.0,0.0,2.0,0.0,-617.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,,,,False
4,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,513000.0,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,0.028663,-19932,-3038,-4311.0,-3458,,1,1,0,1,0,0,Core staff,1.0,2,2,THURSDAY,11,0,0,0,0,1,1,Religion,,0.322738,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,new_category_nan,new_category_nan,,new_category_nan,new_category_nan,0.0,0.0,0.0,0.0,-1106.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,False


In [50]:
# Print each column name next to its pandas dtype, one column per line.
for col_name, col_dtype in data.dtypes.items():
    print(col_name, col_dtype)

cat_0 object
cat_1 object
cat_2 object
cat_3 object
num_4 int64
num_5 float64
num_6 float64
num_7 float64
num_8 float64
cat_9 object
cat_10 object
cat_11 object
cat_12 object
cat_13 object
num_14 float64
num_15 int64
num_16 int64
num_17 float64
num_18 int64
num_19 float64
num_20 int64
num_21 int64
num_22 int64
num_23 int64
num_24 int64
num_25 int64
cat_26 object
num_27 float64
cat_28 object
cat_29 object
cat_30 object
num_31 int64
num_32 int64
num_33 int64
num_34 int64
num_35 int64
num_36 int64
num_37 int64
cat_38 object
num_39 float64
num_40 float64
num_41 float64
num_42 float64
num_43 float64
num_44 float64
num_45 float64
num_46 float64
num_47 float64
num_48 float64
num_49 float64
num_50 float64
num_51 float64
num_52 float64
num_53 float64
num_54 float64
num_55 float64
num_56 float64
num_57 float64
num_58 float64
num_59 float64
num_60 float64
num_61 float64
num_62 float64
num_63 float64
num_64 float64
num_65 float64
num_66 float64
num_67 float64
num_68 float64
num_69 float64
num_70 f

In [51]:
# Write the processed frame to "<data_pth><dataset_name>.gz" as gzip-compressed CSV, without the index.
data.to_csv(path_or_buf=f"{data_pth}{dataset_name}.gz", index=False, compression="gzip")

# adult

In [52]:
# https://www.kaggle.com/wenruliu/adult-income-dataset

In [53]:
# Select the adult dataset folder and read the zipped adult CSV from it.
dataset_name = "adult"
data_pth = f"../data/{dataset_name}/"
data = pd.read_csv(filepath_or_buffer=f"{data_pth}adult.csv.zip")
# Show (rows, columns) of the raw frame, then preview the first rows.
print(data.shape)
data.head(n=5)

(48842, 15)


Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


In [54]:
# Column roles for the adult dataset: no id or time columns, eight categorical features.
cols_dict = dict(
    id_cols=[],
    cat_cols=[
        "workclass",
        "education",
        "marital-status",
        "occupation",
        "relationship",
        "race",
        "gender",
        "native-country",
    ],
    time_cols=[],
)

In [55]:
# Label is True where income equals ">50K"; afterwards drop the raw label plus any time/id columns in place.
data["target"] = data["income"].eq(">50K")
data.drop(columns=["income", *cols_dict["time_cols"], *cols_dict["id_cols"]], inplace=True)

In [56]:
# Rename features to cat_<i>/num_<i> and cast categoricals to str via process_cols, then preview.
data = process_cols(df=data, cols_dict=cols_dict)
data.head(n=5)

Tot cat columns :  8


Unnamed: 0,num_0,cat_1,num_2,cat_3,num_4,cat_5,cat_6,cat_7,cat_8,cat_9,num_10,num_11,num_12,cat_13,target
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,False
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,False
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,True
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,True
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,False


In [57]:
# Print each column name next to its pandas dtype, one column per line.
for col_name, col_dtype in data.dtypes.items():
    print(col_name, col_dtype)

num_0 int64
cat_1 object
num_2 int64
cat_3 object
num_4 int64
cat_5 object
cat_6 object
cat_7 object
cat_8 object
cat_9 object
num_10 int64
num_11 int64
num_12 int64
cat_13 object
target bool


In [58]:
# Write the processed frame to "<data_pth><dataset_name>.gz" as gzip-compressed CSV, without the index.
data.to_csv(path_or_buf=f"{data_pth}{dataset_name}.gz", index=False, compression="gzip")

# employee

In [59]:
# https://www.kaggle.com/c/amazon-employee-access-challenge/data

In [60]:
# Select the employee dataset folder and read train.csv from it.
dataset_name = "employee"
data_pth = f"../data/{dataset_name}/"
data = pd.read_csv(filepath_or_buffer=f"{data_pth}train.csv")
# Show (rows, columns) of the raw frame, then preview the first rows.
print(data.shape)
data.head(n=5)

(32769, 10)


Unnamed: 0,ACTION,RESOURCE,MGR_ID,ROLE_ROLLUP_1,ROLE_ROLLUP_2,ROLE_DEPTNAME,ROLE_TITLE,ROLE_FAMILY_DESC,ROLE_FAMILY,ROLE_CODE
0,1,39353,85475,117961,118300,123472,117905,117906,290919,117908
1,1,17183,1540,117961,118343,123125,118536,118536,308574,118539
2,1,36724,14457,118219,118220,117884,117879,267952,19721,117880
3,1,36135,5396,117961,118343,119993,118321,240983,290919,118322
4,1,42680,5905,117929,117930,119569,119323,123932,19793,119325


In [61]:
# Column roles for the employee dataset: every feature column is treated as categorical.
# cat_cols is materialised as a plain list (like the other datasets' cols_dict) and leaves out
# the ACTION label column, instead of holding a reference to the frame's column Index.
cols_dict = {
    "id_cols": [],
    "cat_cols": [col for col in data.columns if col != "ACTION"],
    "time_cols": []
}

In [62]:
# Build the boolean label from `ACTION`, then drop the raw label plus any time/id columns in place.
data["target"] = data["ACTION"].eq(1)
data.drop(columns=["ACTION", *cols_dict["time_cols"], *cols_dict["id_cols"]], inplace=True)

In [63]:
# Rename features to cat_<i>/num_<i> and cast categoricals to str via process_cols, then preview.
data = process_cols(df=data, cols_dict=cols_dict)
data.head(n=5)

Tot cat columns :  9


Unnamed: 0,cat_0,cat_1,cat_2,cat_3,cat_4,cat_5,cat_6,cat_7,cat_8,target
0,39353,85475,117961,118300,123472,117905,117906,290919,117908,True
1,17183,1540,117961,118343,123125,118536,118536,308574,118539,True
2,36724,14457,118219,118220,117884,117879,267952,19721,117880,True
3,36135,5396,117961,118343,119993,118321,240983,290919,118322,True
4,42680,5905,117929,117930,119569,119323,123932,19793,119325,True


In [64]:
# Print each column name next to its pandas dtype, one column per line.
for col_name, col_dtype in data.dtypes.items():
    print(col_name, col_dtype)

cat_0 object
cat_1 object
cat_2 object
cat_3 object
cat_4 object
cat_5 object
cat_6 object
cat_7 object
cat_8 object
target bool


In [65]:
# Write the processed frame to "<data_pth><dataset_name>.gz" as gzip-compressed CSV, without the index.
data.to_csv(path_or_buf=f"{data_pth}{dataset_name}.gz", index=False, compression="gzip")

# kick

In [66]:
# https://www.kaggle.com/c/DontGetKicked/data

In [67]:
# Select the kick dataset folder and read training.csv from it.
dataset_name = "kick"
data_pth = f"../data/{dataset_name}/"
data = pd.read_csv(filepath_or_buffer=f"{data_pth}training.csv")
# Show (rows, columns) of the raw frame, then preview the first rows.
print(data.shape)
data.head(n=5)

(72983, 34)


Unnamed: 0,RefId,IsBadBuy,PurchDate,Auction,VehYear,VehicleAge,Make,Model,Trim,SubModel,Color,Transmission,WheelTypeID,WheelType,VehOdo,Nationality,Size,TopThreeAmericanName,MMRAcquisitionAuctionAveragePrice,MMRAcquisitionAuctionCleanPrice,MMRAcquisitionRetailAveragePrice,MMRAcquisitonRetailCleanPrice,MMRCurrentAuctionAveragePrice,MMRCurrentAuctionCleanPrice,MMRCurrentRetailAveragePrice,MMRCurrentRetailCleanPrice,PRIMEUNIT,AUCGUART,BYRNO,VNZIP1,VNST,VehBCost,IsOnlineSale,WarrantyCost
0,1,0,12/7/2009,ADESA,2006,3,MAZDA,MAZDA3,i,4D SEDAN I,RED,AUTO,1.0,Alloy,89046,OTHER ASIAN,MEDIUM,OTHER,8155.0,9829.0,11636.0,13600.0,7451.0,8552.0,11597.0,12409.0,,,21973,33619,FL,7100.0,0,1113
1,2,0,12/7/2009,ADESA,2004,5,DODGE,1500 RAM PICKUP 2WD,ST,QUAD CAB 4.7L SLT,WHITE,AUTO,1.0,Alloy,93593,AMERICAN,LARGE TRUCK,CHRYSLER,6854.0,8383.0,10897.0,12572.0,7456.0,9222.0,11374.0,12791.0,,,19638,33619,FL,7600.0,0,1053
2,3,0,12/7/2009,ADESA,2005,4,DODGE,STRATUS V6,SXT,4D SEDAN SXT FFV,MAROON,AUTO,2.0,Covers,73807,AMERICAN,MEDIUM,CHRYSLER,3202.0,4760.0,6943.0,8457.0,4035.0,5557.0,7146.0,8702.0,,,19638,33619,FL,4900.0,0,1389
3,4,0,12/7/2009,ADESA,2004,5,DODGE,NEON,SXT,4D SEDAN,SILVER,AUTO,1.0,Alloy,65617,AMERICAN,COMPACT,CHRYSLER,1893.0,2675.0,4658.0,5690.0,1844.0,2646.0,4375.0,5518.0,,,19638,33619,FL,4100.0,0,630
4,5,0,12/7/2009,ADESA,2005,4,FORD,FOCUS,ZX3,2D COUPE ZX3,SILVER,MANUAL,2.0,Covers,69367,AMERICAN,COMPACT,FORD,3913.0,5054.0,7723.0,8707.0,3247.0,4384.0,6739.0,7911.0,,,19638,33619,FL,4000.0,0,1020


In [68]:
# Column roles for the kick dataset: RefId is the row id, PurchDate the only time column,
# and the names below are handled as categorical features.
# NOTE(review): VehOdo shows odometer-like integers in the preview (e.g. 89046) yet is listed
# as categorical here — confirm this is intended before relying on the exported file.
cols_dict = dict(
    id_cols=["RefId"],
    cat_cols=[
        "Auction", "Make", "Model", "Trim", "SubModel", "Color", "Transmission",
        "WheelTypeID", "WheelType", "VehOdo", "Nationality", "Size", "TopThreeAmericanName",
        "PRIMEUNIT", "AUCGUART", "BYRNO", "VNZIP1", "VNST", "IsOnlineSale",
    ],
    time_cols=["PurchDate"],
)

In [69]:
# Build the boolean label from `IsBadBuy`, then drop the raw label plus the time/id columns in place.
data["target"] = data["IsBadBuy"].eq(1)
data.drop(columns=["IsBadBuy", *cols_dict["time_cols"], *cols_dict["id_cols"]], inplace=True)

In [70]:
# Rename features to cat_<i>/num_<i> and cast categoricals to str via process_cols, then preview.
data = process_cols(df=data, cols_dict=cols_dict)
data.head(n=5)

Tot cat columns :  19


Unnamed: 0,cat_0,num_1,num_2,cat_3,cat_4,cat_5,cat_6,cat_7,cat_8,cat_9,cat_10,cat_11,cat_12,cat_13,cat_14,num_15,num_16,num_17,num_18,num_19,num_20,num_21,num_22,cat_23,cat_24,cat_25,cat_26,cat_27,num_28,cat_29,num_30,target
0,ADESA,2006,3,MAZDA,MAZDA3,i,4D SEDAN I,RED,AUTO,1.0,Alloy,89046,OTHER ASIAN,MEDIUM,OTHER,8155.0,9829.0,11636.0,13600.0,7451.0,8552.0,11597.0,12409.0,new_category_nan,new_category_nan,21973,33619,FL,7100.0,0,1113,False
1,ADESA,2004,5,DODGE,1500 RAM PICKUP 2WD,ST,QUAD CAB 4.7L SLT,WHITE,AUTO,1.0,Alloy,93593,AMERICAN,LARGE TRUCK,CHRYSLER,6854.0,8383.0,10897.0,12572.0,7456.0,9222.0,11374.0,12791.0,new_category_nan,new_category_nan,19638,33619,FL,7600.0,0,1053,False
2,ADESA,2005,4,DODGE,STRATUS V6,SXT,4D SEDAN SXT FFV,MAROON,AUTO,2.0,Covers,73807,AMERICAN,MEDIUM,CHRYSLER,3202.0,4760.0,6943.0,8457.0,4035.0,5557.0,7146.0,8702.0,new_category_nan,new_category_nan,19638,33619,FL,4900.0,0,1389,False
3,ADESA,2004,5,DODGE,NEON,SXT,4D SEDAN,SILVER,AUTO,1.0,Alloy,65617,AMERICAN,COMPACT,CHRYSLER,1893.0,2675.0,4658.0,5690.0,1844.0,2646.0,4375.0,5518.0,new_category_nan,new_category_nan,19638,33619,FL,4100.0,0,630,False
4,ADESA,2005,4,FORD,FOCUS,ZX3,2D COUPE ZX3,SILVER,MANUAL,2.0,Covers,69367,AMERICAN,COMPACT,FORD,3913.0,5054.0,7723.0,8707.0,3247.0,4384.0,6739.0,7911.0,new_category_nan,new_category_nan,19638,33619,FL,4000.0,0,1020,False


In [71]:
# Print each column name next to its pandas dtype, one column per line.
for col_name, col_dtype in data.dtypes.items():
    print(col_name, col_dtype)

cat_0 object
num_1 int64
num_2 int64
cat_3 object
cat_4 object
cat_5 object
cat_6 object
cat_7 object
cat_8 object
cat_9 object
cat_10 object
cat_11 object
cat_12 object
cat_13 object
cat_14 object
num_15 float64
num_16 float64
num_17 float64
num_18 float64
num_19 float64
num_20 float64
num_21 float64
num_22 float64
cat_23 object
cat_24 object
cat_25 object
cat_26 object
cat_27 object
num_28 float64
cat_29 object
num_30 int64
target bool


In [72]:
# Write the processed frame to "<data_pth><dataset_name>.gz" as gzip-compressed CSV, without the index.
data.to_csv(path_or_buf=f"{data_pth}{dataset_name}.gz", index=False, compression="gzip")

# promotion

In [73]:
# https://datahack.analyticsvidhya.com/contest/wns-analytics-hackathon-2018/

In [74]:
# Select the promotion dataset folder and read train_LZdllcl.csv from it.
dataset_name = "promotion"
data_pth = f"../data/{dataset_name}/"
data = pd.read_csv(filepath_or_buffer=f"{data_pth}train_LZdllcl.csv")
# Show (rows, columns) of the raw frame, then preview the first rows.
print(data.shape)
data.head(n=5)

(54808, 14)


Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
0,65438,Sales & Marketing,region_7,Master's & above,f,sourcing,1,35,5.0,8,1,0,49,0
1,65141,Operations,region_22,Bachelor's,m,other,1,30,5.0,4,0,0,60,0
2,7513,Sales & Marketing,region_19,Bachelor's,m,sourcing,1,34,3.0,7,0,0,50,0
3,2542,Sales & Marketing,region_23,Bachelor's,m,other,2,39,1.0,10,0,0,50,0
4,48945,Technology,region_26,Bachelor's,m,other,1,45,3.0,2,0,0,73,0


In [75]:
# Column roles for the promotion dataset: employee_id is the row id, five categorical
# features, no time columns.
cols_dict = dict(
    id_cols=["employee_id"],
    cat_cols=[
        "department",
        "region",
        "education",
        "gender",
        "recruitment_channel",
    ],
    time_cols=[],
)

In [76]:
# Build the boolean label from `is_promoted`, then drop the raw label plus any time/id columns in place.
data["target"] = data["is_promoted"].eq(1)
data.drop(columns=["is_promoted", *cols_dict["time_cols"], *cols_dict["id_cols"]], inplace=True)

In [77]:
# Rename features to cat_<i>/num_<i> and cast categoricals to str via process_cols, then preview.
data = process_cols(df=data, cols_dict=cols_dict)
data.head(n=5)

Tot cat columns :  5


Unnamed: 0,cat_0,cat_1,cat_2,cat_3,cat_4,num_5,num_6,num_7,num_8,num_9,num_10,num_11,target
0,Sales & Marketing,region_7,Master's & above,f,sourcing,1,35,5.0,8,1,0,49,False
1,Operations,region_22,Bachelor's,m,other,1,30,5.0,4,0,0,60,False
2,Sales & Marketing,region_19,Bachelor's,m,sourcing,1,34,3.0,7,0,0,50,False
3,Sales & Marketing,region_23,Bachelor's,m,other,2,39,1.0,10,0,0,50,False
4,Technology,region_26,Bachelor's,m,other,1,45,3.0,2,0,0,73,False


In [78]:
# Print each column name next to its pandas dtype, one column per line.
for col_name, col_dtype in data.dtypes.items():
    print(col_name, col_dtype)

cat_0 object
cat_1 object
cat_2 object
cat_3 object
cat_4 object
num_5 int64
num_6 int64
num_7 float64
num_8 int64
num_9 int64
num_10 int64
num_11 int64
target bool


In [79]:
# Write the processed frame to "<data_pth><dataset_name>.gz" as gzip-compressed CSV, without the index.
data.to_csv(path_or_buf=f"{data_pth}{dataset_name}.gz", index=False, compression="gzip")

# kdd_upselling

In [80]:
# https://www.kdd.org/kdd-cup/view/kdd-cup-2009/Data

In [81]:
# Select the kdd_upselling dataset folder and read the tab-separated orange_small_train.data file.
dataset_name = "kdd_upselling"
data_pth = f"../data/{dataset_name}/"
data = pd.read_csv(filepath_or_buffer=f"{data_pth}orange_small_train.data", sep="\t")
# Show (rows, columns) of the raw frame, then preview the first rows.
print(data.shape)
data.head(n=5)

(50000, 230)


Unnamed: 0,Var1,Var2,Var3,Var4,Var5,Var6,Var7,Var8,Var9,Var10,Var11,Var12,Var13,Var14,Var15,Var16,Var17,Var18,Var19,Var20,Var21,Var22,Var23,Var24,Var25,Var26,Var27,Var28,Var29,Var30,Var31,Var32,Var33,Var34,Var35,Var36,Var37,Var38,Var39,Var40,Var41,Var42,Var43,Var44,Var45,Var46,Var47,Var48,Var49,Var50,Var51,Var52,Var53,Var54,Var55,Var56,Var57,Var58,Var59,Var60,Var61,Var62,Var63,Var64,Var65,Var66,Var67,Var68,Var69,Var70,Var71,Var72,Var73,Var74,Var75,Var76,Var77,Var78,Var79,Var80,Var81,Var82,Var83,Var84,Var85,Var86,Var87,Var88,Var89,Var90,Var91,Var92,Var93,Var94,Var95,Var96,Var97,Var98,Var99,Var100,Var101,Var102,Var103,Var104,Var105,Var106,Var107,Var108,Var109,Var110,Var111,Var112,Var113,Var114,Var115,Var116,Var117,Var118,Var119,Var120,Var121,Var122,Var123,Var124,Var125,Var126,Var127,Var128,Var129,Var130,Var131,Var132,Var133,Var134,Var135,Var136,Var137,Var138,Var139,Var140,Var141,Var142,Var143,Var144,Var145,Var146,Var147,Var148,Var149,Var150,Var151,Var152,Var153,Var154,Var155,Var156,Var157,Var158,Var159,Var160,Var161,Var162,Var163,Var164,Var165,Var166,Var167,Var168,Var169,Var170,Var171,Var172,Var173,Var174,Var175,Var176,Var177,Var178,Var179,Var180,Var181,Var182,Var183,Var184,Var185,Var186,Var187,Var188,Var189,Var190,Var191,Var192,Var193,Var194,Var195,Var196,Var197,Var198,Var199,Var200,Var201,Var202,Var203,Var204,Var205,Var206,Var207,Var208,Var209,Var210,Var211,Var212,Var213,Var214,Var215,Var216,Var217,Var218,Var219,Var220,Var221,Var222,Var223,Var224,Var225,Var226,Var227,Var228,Var229,Var230
0,,,,,,1526.0,7.0,,,,,,184.0,,,,,,,,464.0,580.0,,14.0,128.0,,,166.56,,,,,,,0.0,,,3570.0,,,,,,0.0,,,,,,,,,,,,,4.076907,,,,,,,,9.0,,,,,,,,36,35.0,,1350864.0,,0.0,,,7333.11,,5.0,,12.0,,,,,,,,,,,,,,,,,,,,,,,,104.0,,,168.0,117625.6,,,,,,1175.0,,,,6.0,,720.0,8.0,,,,,,0.0,1212385.0,69134.0,,,,,,185.0,,,0.0,9.0,,,,,397579.0,,,,1812252.0,,,,,,,142.0,,,38418.0,,,,,,,,,,0.0,,,,,,,,0.0,,,,,,,,462.0,,,bZkvyxLkBI,RO12,,taul,1K8T,lK27,ka_ns41,nQUveAzAF7,,,dXGu,9_Y1,FbIm,VpdQ,haYg,me75fM6ugJ,kIsH,,uKAI,L84s,XfqtO3UdzaXh_,,,,XTbPUYD,sH5Z,cJvF,FzaX,1YVfGrO,oslk,fXVEsaq,jySVZNlOJy,,,xb3V,RAYp,F2FyR07IdsN7I,,
1,,,,,,525.0,0.0,,,,,,0.0,,,,,,,,168.0,210.0,,2.0,24.0,,,353.52,,,,,,,0.0,,,4764966.0,,,,,,0.0,,,,,,,,,,,,,5.408032,,,,,,,,9.0,,,,,,,3.0,26,0.0,,2872928.0,,3.0,,,151098.9,,25.0,,2.0,,,,,,,,,58158.0,,,,,,,,,,,,,,,40.0,,,40.0,-356411.6,,,,,,590.0,,,,72.0,,0.0,,,,,,,8.0,4136430.0,357038.0,,,,,,0.0,,,0.0,9.0,,,,,278334.0,,,,10439160.0,,,,,,,32.0,,,238572.0,,,,,,,,,,0.0,,,,,,,,0.0,,,,,,,,,,,CEat0G8rTN,RO12,,taul,1K8T,2Ix5,qEdASpP,y2LIM01bE1,,,lg1t,9_Y1,k13i,sJzTlal,zm5i,me75fM6ugJ,kIsH,,uKAI,L84s,NhsEn4L,,,,kZJyVg2,,,FzaX,0AJo2f2,oslk,2Kb5FSF,LM8l689qOp,,,fKCe,RAYp,F2FyR07IdsN7I,,
2,,,,,,5236.0,7.0,,,,,,904.0,,,,,,,,1212.0,1515.0,,26.0,816.0,,,220.08,,,,,,,0.0,,,5883894.0,,,,,,0.0,,,,,,,,,,,,,6.599658,,,,,,,,9.0,,,,,,,,130,518.0,,1675776.0,,0.0,,,16211.58,,40.0,,58.0,,,,,,,,,,,,,,,,,,,,,,,,312.0,,,336.0,405104.0,,,,,,3230.0,,,,114.0,,5967.0,-28.0,,,,,,0.0,3478905.0,248932.0,,,,,,800.0,,,0.0,36.0,,,,,320565.0,,,,9826360.0,,,,,,,206.0,,,434946.0,,,,,,,,,,0.0,,,,,,,,0.0,,,,,,,,,,,eOQt0GoOh3,AERks4l,SEuy,taul,1K8T,ffXs,NldASpP,y4g9XoZ,vynJTq9,smXZ,4bTR,9_Y1,MGOA,VpdQ,haYg,DHn_WUyBhW_whjA88g9bvA64_,kIsH,,uKAI,L84s,UbxQ8lZ,,TTGHfSv,,pMWAe2U,bHR7,UYBR,FzaX,JFM1BiF,Al6ZaUT,NKv4yOc,jySVZNlOJy,,kG3k,Qu4f,02N6s8f,ib5G6X1eUxUn6,am7c,
3,,,,,,,0.0,,,,,,0.0,,,,,,,,,0.0,,,0.0,,,22.08,,,,,,,0.0,,,0.0,,,,,,0.0,,,,,,,,,,,,,1.98825,,,,,,,,9.0,,,,,,,,12,0.0,,0.0,,0.0,,,,,0.0,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,-275703.6,,,,,,,,,,0.0,,0.0,-14.0,,,,,,0.0,0.0,0.0,,,,,,0.0,,,0.0,,,,,,,,,,0.0,,,,,,,0.0,,,0.0,,,,,,,,,,0.0,,,,,,,,0.0,,,,,,,,,,,jg69tYsGvO,RO12,,taul,1K8T,ssAy,_ybO0dd,4hMlgkf58mhwh,,,W8mQ,9_Y1,YULl,VpdQ,,me75fM6ugJ,kIsH,,uKAI,Mtgm,NhsEn4L,,,,kq0dQfu,eKej,UYBR,FzaX,L91KIiz,oslk,CE7uk3u,LM8l689qOp,,,FSa2,RAYp,F2FyR07IdsN7I,,
4,,,,,,1029.0,7.0,,,,,,3216.0,,,,,,,,64.0,80.0,,4.0,64.0,,,200.0,,,,,,,0.0,,,0.0,,,,,,0.0,,,,,,,,,,,,,4.552446,,,,,,,,18.0,,,,,,,3.0,82,224.0,,784448.0,,0.0,,,37423.5,,0.0,,0.0,,,,,,,,,89754.0,,,,,,,,,,,,,,,32.0,,,56.0,10714.84,,,,,,215.0,,,,0.0,,15111.0,58.0,,,,,,0.0,150650.0,66046.0,,,,,,3255.0,,,0.0,9.0,,,,,267162.0,,,,644836.0,,,,,,,2.0,,,0.0,,,,,,,,,,0.0,,,,,,,,0.0,,,,,,,,,,,IXSgUHShse,RO12,SEuy,taul,1K8T,uNkU,EKR938I,ThrHXVS,0v21jmy,smXZ,xklU,9_Y1,RVjC,sJzTlal,6JmL,me75fM6ugJ,kIsH,,uKAI,L84s,XfqtO3UdzaXh_,,SJs3duv,,11p4mKe,H3p7,UYBR,FzaX,OrnLfvc,oslk,1J2cvxe,LM8l689qOp,,kG3k,FSa2,RAYp,F2FyR07IdsN7I,mj86,


In [82]:
# Read the upselling labels file; it has no header row, so pandas assigns integer column names.
targets = pd.read_csv(
    filepath_or_buffer=f"{data_pth}orange_small_train_upselling.labels.txt",
    header=None,
)

In [83]:
# Column roles for kdd_upselling: no id or time columns.
# The categorical features are the last 40 of the 230 raw columns, i.e. Var191..Var230.
# They are named explicitly rather than sliced positionally from data.columns, so the
# selection does not shift if this cell runs after `target` has been appended to `data`.
cols_dict = {
    "id_cols": [],
    "cat_cols": [f"Var{i}" for i in range(191, 231)],
    "time_cols": []
}

In [84]:
# The labels were read with header=None; take the first column as a 1-D array so the
# assignment does not rely on pandas coercing a 2-D (n, 1) array into a single column.
labels = targets.values[:, 0]
# Features and labels come from separate files; make sure they line up row for row.
assert len(labels) == len(data), "label file and feature file must have the same number of rows"
data["target"] = labels == 1
# No raw label column to remove here; drop any time/id columns in place (both lists are empty
# for this dataset, so this is a no-op kept for symmetry with the other datasets).
data.drop(columns=cols_dict["time_cols"] + cols_dict["id_cols"], inplace=True)

In [85]:
# Rename features to cat_<i>/num_<i> and cast categoricals to str via process_cols, then preview.
data = process_cols(df=data, cols_dict=cols_dict)
data.head(n=5)

Tot cat columns :  40


Unnamed: 0,num_0,num_1,num_2,num_3,num_4,num_5,num_6,num_7,num_8,num_9,num_10,num_11,num_12,num_13,num_14,num_15,num_16,num_17,num_18,num_19,num_20,num_21,num_22,num_23,num_24,num_25,num_26,num_27,num_28,num_29,num_30,num_31,num_32,num_33,num_34,num_35,num_36,num_37,num_38,num_39,num_40,num_41,num_42,num_43,num_44,num_45,num_46,num_47,num_48,num_49,num_50,num_51,num_52,num_53,num_54,num_55,num_56,num_57,num_58,num_59,num_60,num_61,num_62,num_63,num_64,num_65,num_66,num_67,num_68,num_69,num_70,num_71,num_72,num_73,num_74,num_75,num_76,num_77,num_78,num_79,num_80,num_81,num_82,num_83,num_84,num_85,num_86,num_87,num_88,num_89,num_90,num_91,num_92,num_93,num_94,num_95,num_96,num_97,num_98,num_99,num_100,num_101,num_102,num_103,num_104,num_105,num_106,num_107,num_108,num_109,num_110,num_111,num_112,num_113,num_114,num_115,num_116,num_117,num_118,num_119,num_120,num_121,num_122,num_123,num_124,num_125,num_126,num_127,num_128,num_129,num_130,num_131,num_132,num_133,num_134,num_135,num_136,num_137,num_138,num_139,num_140,num_141,num_142,num_143,num_144,num_145,num_146,num_147,num_148,num_149,num_150,num_151,num_152,num_153,num_154,num_155,num_156,num_157,num_158,num_159,num_160,num_161,num_162,num_163,num_164,num_165,num_166,num_167,num_168,num_169,num_170,num_171,num_172,num_173,num_174,num_175,num_176,num_177,num_178,num_179,num_180,num_181,num_182,num_183,num_184,num_185,num_186,num_187,num_188,num_189,cat_190,cat_191,cat_192,cat_193,cat_194,cat_195,cat_196,cat_197,cat_198,cat_199,cat_200,cat_201,cat_202,cat_203,cat_204,cat_205,cat_206,cat_207,cat_208,cat_209,cat_210,cat_211,cat_212,cat_213,cat_214,cat_215,cat_216,cat_217,cat_218,cat_219,cat_220,cat_221,cat_222,cat_223,cat_224,cat_225,cat_226,cat_227,cat_228,cat_229,target
0,,,,,,1526.0,7.0,,,,,,184.0,,,,,,,,464.0,580.0,,14.0,128.0,,,166.56,,,,,,,0.0,,,3570.0,,,,,,0.0,,,,,,,,,,,,,4.076907,,,,,,,,9.0,,,,,,,,36,35.0,,1350864.0,,0.0,,,7333.11,,5.0,,12.0,,,,,,,,,,,,,,,,,,,,,,,,104.0,,,168.0,117625.6,,,,,,1175.0,,,,6.0,,720.0,8.0,,,,,,0.0,1212385.0,69134.0,,,,,,185.0,,,0.0,9.0,,,,,397579.0,,,,1812252.0,,,,,,,142.0,,,38418.0,,,,,,,,,,0.0,,,,,,,,0.0,,,,,,,,462.0,,new_category_nan,bZkvyxLkBI,RO12,new_category_nan,taul,1K8T,lK27,ka_ns41,nQUveAzAF7,new_category_nan,new_category_nan,dXGu,9_Y1,FbIm,VpdQ,haYg,me75fM6ugJ,kIsH,new_category_nan,uKAI,L84s,XfqtO3UdzaXh_,new_category_nan,new_category_nan,new_category_nan,XTbPUYD,sH5Z,cJvF,FzaX,1YVfGrO,oslk,fXVEsaq,jySVZNlOJy,new_category_nan,new_category_nan,xb3V,RAYp,F2FyR07IdsN7I,new_category_nan,new_category_nan,False
1,,,,,,525.0,0.0,,,,,,0.0,,,,,,,,168.0,210.0,,2.0,24.0,,,353.52,,,,,,,0.0,,,4764966.0,,,,,,0.0,,,,,,,,,,,,,5.408032,,,,,,,,9.0,,,,,,,3.0,26,0.0,,2872928.0,,3.0,,,151098.9,,25.0,,2.0,,,,,,,,,58158.0,,,,,,,,,,,,,,,40.0,,,40.0,-356411.6,,,,,,590.0,,,,72.0,,0.0,,,,,,,8.0,4136430.0,357038.0,,,,,,0.0,,,0.0,9.0,,,,,278334.0,,,,10439160.0,,,,,,,32.0,,,238572.0,,,,,,,,,,0.0,,,,,,,,0.0,,,,,,,,,,new_category_nan,CEat0G8rTN,RO12,new_category_nan,taul,1K8T,2Ix5,qEdASpP,y2LIM01bE1,new_category_nan,new_category_nan,lg1t,9_Y1,k13i,sJzTlal,zm5i,me75fM6ugJ,kIsH,new_category_nan,uKAI,L84s,NhsEn4L,new_category_nan,new_category_nan,new_category_nan,kZJyVg2,new_category_nan,new_category_nan,FzaX,0AJo2f2,oslk,2Kb5FSF,LM8l689qOp,new_category_nan,new_category_nan,fKCe,RAYp,F2FyR07IdsN7I,new_category_nan,new_category_nan,False
2,,,,,,5236.0,7.0,,,,,,904.0,,,,,,,,1212.0,1515.0,,26.0,816.0,,,220.08,,,,,,,0.0,,,5883894.0,,,,,,0.0,,,,,,,,,,,,,6.599658,,,,,,,,9.0,,,,,,,,130,518.0,,1675776.0,,0.0,,,16211.58,,40.0,,58.0,,,,,,,,,,,,,,,,,,,,,,,,312.0,,,336.0,405104.0,,,,,,3230.0,,,,114.0,,5967.0,-28.0,,,,,,0.0,3478905.0,248932.0,,,,,,800.0,,,0.0,36.0,,,,,320565.0,,,,9826360.0,,,,,,,206.0,,,434946.0,,,,,,,,,,0.0,,,,,,,,0.0,,,,,,,,,,new_category_nan,eOQt0GoOh3,AERks4l,SEuy,taul,1K8T,ffXs,NldASpP,y4g9XoZ,vynJTq9,smXZ,4bTR,9_Y1,MGOA,VpdQ,haYg,DHn_WUyBhW_whjA88g9bvA64_,kIsH,new_category_nan,uKAI,L84s,UbxQ8lZ,new_category_nan,TTGHfSv,new_category_nan,pMWAe2U,bHR7,UYBR,FzaX,JFM1BiF,Al6ZaUT,NKv4yOc,jySVZNlOJy,new_category_nan,kG3k,Qu4f,02N6s8f,ib5G6X1eUxUn6,am7c,new_category_nan,False
3,,,,,,,0.0,,,,,,0.0,,,,,,,,,0.0,,,0.0,,,22.08,,,,,,,0.0,,,0.0,,,,,,0.0,,,,,,,,,,,,,1.98825,,,,,,,,9.0,,,,,,,,12,0.0,,0.0,,0.0,,,,,0.0,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,-275703.6,,,,,,,,,,0.0,,0.0,-14.0,,,,,,0.0,0.0,0.0,,,,,,0.0,,,0.0,,,,,,,,,,0.0,,,,,,,0.0,,,0.0,,,,,,,,,,0.0,,,,,,,,0.0,,,,,,,,,,new_category_nan,jg69tYsGvO,RO12,new_category_nan,taul,1K8T,ssAy,_ybO0dd,4hMlgkf58mhwh,new_category_nan,new_category_nan,W8mQ,9_Y1,YULl,VpdQ,new_category_nan,me75fM6ugJ,kIsH,new_category_nan,uKAI,Mtgm,NhsEn4L,new_category_nan,new_category_nan,new_category_nan,kq0dQfu,eKej,UYBR,FzaX,L91KIiz,oslk,CE7uk3u,LM8l689qOp,new_category_nan,new_category_nan,FSa2,RAYp,F2FyR07IdsN7I,new_category_nan,new_category_nan,False
4,,,,,,1029.0,7.0,,,,,,3216.0,,,,,,,,64.0,80.0,,4.0,64.0,,,200.0,,,,,,,0.0,,,0.0,,,,,,0.0,,,,,,,,,,,,,4.552446,,,,,,,,18.0,,,,,,,3.0,82,224.0,,784448.0,,0.0,,,37423.5,,0.0,,0.0,,,,,,,,,89754.0,,,,,,,,,,,,,,,32.0,,,56.0,10714.84,,,,,,215.0,,,,0.0,,15111.0,58.0,,,,,,0.0,150650.0,66046.0,,,,,,3255.0,,,0.0,9.0,,,,,267162.0,,,,644836.0,,,,,,,2.0,,,0.0,,,,,,,,,,0.0,,,,,,,,0.0,,,,,,,,,,new_category_nan,IXSgUHShse,RO12,SEuy,taul,1K8T,uNkU,EKR938I,ThrHXVS,0v21jmy,smXZ,xklU,9_Y1,RVjC,sJzTlal,6JmL,me75fM6ugJ,kIsH,new_category_nan,uKAI,L84s,XfqtO3UdzaXh_,new_category_nan,SJs3duv,new_category_nan,11p4mKe,H3p7,UYBR,FzaX,OrnLfvc,oslk,1J2cvxe,LM8l689qOp,new_category_nan,kG3k,FSa2,RAYp,F2FyR07IdsN7I,mj86,new_category_nan,False


In [86]:
# List every column of `data` together with its dtype, one "<name> <dtype>" line each.
for col in data.columns:
    print(f"{col} {data[col].dtypes}")

num_0 float64
num_1 float64
num_2 float64
num_3 float64
num_4 float64
num_5 float64
num_6 float64
num_7 float64
num_8 float64
num_9 float64
num_10 float64
num_11 float64
num_12 float64
num_13 float64
num_14 float64
num_15 float64
num_16 float64
num_17 float64
num_18 float64
num_19 float64
num_20 float64
num_21 float64
num_22 float64
num_23 float64
num_24 float64
num_25 float64
num_26 float64
num_27 float64
num_28 float64
num_29 float64
num_30 float64
num_31 float64
num_32 float64
num_33 float64
num_34 float64
num_35 float64
num_36 float64
num_37 float64
num_38 float64
num_39 float64
num_40 float64
num_41 float64
num_42 float64
num_43 float64
num_44 float64
num_45 float64
num_46 float64
num_47 float64
num_48 float64
num_49 float64
num_50 float64
num_51 float64
num_52 float64
num_53 float64
num_54 float64
num_55 float64
num_56 float64
num_57 float64
num_58 float64
num_59 float64
num_60 float64
num_61 float64
num_62 float64
num_63 float64
num_64 float64
num_65 float64
num_66 float64
num_6

In [87]:
# Write `data` as a gzip-compressed CSV named "<dataset_name>.gz" under `data_pth`;
# index=False keeps the row index out of the file.
data.to_csv(
    f"{data_pth}{dataset_name}.gz",
    index=False,
    compression="gzip",
)