## Import modules and libraries

In [56]:
from utils_data_preparation import *
from sklearn.metrics import f1_score,confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier

In [57]:
# Notebook-wide pandas display settings: floats rendered with five decimals,
# and up to 500 columns / 500 rows shown before pandas truncates the output.
display_options = {
    'display.float_format': lambda value: '%.5f' % value,
    'display.max_columns': 500,
    'display.max_rows': 500,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

In [58]:
def timedelta_with_general_date(df, general_date=None, drop = True):
    """Add day/month/year gaps between a reference date column and every other date column.

    For each datetime column ``c`` of ``df`` other than ``general_date``, three
    float columns are added to ``df`` (in place), named
    ``'Diff_days' + general_date + '_bt_' + c``,
    ``'Diff_month' + general_date + '_bt_' + c`` and
    ``'Diff_year' + general_date + '_bt_' + c``.
    Each holds ``df[general_date] - df[c]`` expressed in that unit and rounded
    down (towards negative infinity). Missing dates give ``NaN``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend. It is modified in place.
    general_date : str
        Name of the reference datetime column. The ``None`` default is kept
        only for signature compatibility; a column name must be supplied.
    drop : bool, default True
        If true, the other datetime columns are removed after the differences
        are computed. The reference column is always kept.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Raises
    ------
    TypeError
        If ``general_date`` is ``None``.
    KeyError
        If ``general_date`` is not a column of ``df``.
    """
    if general_date is None:
        raise TypeError("general_date must be the name of a datetime column, not None")

    # An ordered list (not a set) so the new columns are created in the frame's
    # own column order on every run, independent of string hash seeding.
    other_dates = [col for col in df.select_dtypes('datetime').columns
                   if col != general_date]

    # Unit lengths in seconds. Month and year use the average Gregorian lengths
    # (365.2425 days per year, one twelfth of that per month), which is what
    # numpy uses when converting to timedelta64[M] / timedelta64[Y].
    unit_seconds = (('days', 86400), ('month', 2629746), ('year', 31556952))

    for col in other_dates:
        # Going through total_seconds() avoids .astype('timedelta64[D]'),
        # which pandas >= 2.0 rejects as an unsupported resolution.
        elapsed = (df[general_date] - df[col]).dt.total_seconds()
        for unit, seconds in unit_seconds:
            # Floor division keeps the round-down behaviour of the old cast
            # (e.g. -124 days -> -5 months, -1 year).
            df['Diff_' + unit + general_date + '_bt_' + col] = elapsed // seconds

    if drop:
        df.drop(columns=other_dates, inplace=True)
    return df

## Train

### Load data

In [59]:
# Load the raw training data. `pd` and `path` are not defined in this notebook;
# presumably both come from the star import of utils_data_preparation — TODO confirm.
df_train = pd.read_csv(path/'Train.csv')

In [60]:
df_train.head()

Unnamed: 0,ID,Policy Start Date,Policy End Date,Gender,Age,First Transaction Date,No_Pol,Car_Category,Subject_Car_Colour,Subject_Car_Make,LGA_Name,State,ProductName,target
0,ID_0040R73,2010-05-14,2011-05-13,Male,30,2010-05-14,1,Saloon,Black,TOYOTA,,,Car Classic,0
1,ID_0046BNK,2010-11-29,2011-11-28,Female,79,2010-11-29,1,JEEP,Grey,TOYOTA,,,Car Classic,1
2,ID_005QMC3,2010-03-21,2011-03-20,Male,43,2010-03-21,1,Saloon,Red,TOYOTA,,,Car Classic,0
3,ID_0079OHW,2010-08-21,2011-08-20,Male,2,2010-08-21,1,,,,,,CarSafe,0
4,ID_00BRP63,2010-08-29,2010-12-31,Entity,20,2010-08-29,3,,,,Lagos,Lagos,Muuve,1


In [61]:
df_train.shape

(12079, 14)

In [62]:
df_train.describe()

Unnamed: 0,Age,No_Pol,target
count,12079.0,12079.0,12079.0
mean,42.23454,1.30723,0.12046
std,97.49256,0.73308,0.32551
min,-6099.0,1.0,0.0
25%,35.0,1.0,0.0
50%,41.0,1.0,0.0
75%,50.0,1.0,0.0
max,320.0,10.0,1.0


In [63]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12079 entries, 0 to 12078
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   ID                      12079 non-null  object
 1   Policy Start Date       12079 non-null  object
 2   Policy End Date         12079 non-null  object
 3   Gender                  11720 non-null  object
 4   Age                     12079 non-null  int64 
 5   First Transaction Date  12079 non-null  object
 6   No_Pol                  12079 non-null  int64 
 7   Car_Category            8341 non-null   object
 8   Subject_Car_Colour      5117 non-null   object
 9   Subject_Car_Make        9603 non-null   object
 10  LGA_Name                5603 non-null   object
 11  State                   5591 non-null   object
 12  ProductName             12079 non-null  object
 13  target                  12079 non-null  int64 
dtypes: int64(3), object(11)
memory usage: 1.3+ MB


### Data preparation

In [64]:
# The three date columns are read from CSV as strings; parse each into datetimes.
for date_column in ('Policy Start Date', 'Policy End Date', 'First Transaction Date'):
    df_train[date_column] = pd.to_datetime(df_train[date_column])

In [65]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12079 entries, 0 to 12078
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   ID                      12079 non-null  object        
 1   Policy Start Date       12079 non-null  datetime64[ns]
 2   Policy End Date         12079 non-null  datetime64[ns]
 3   Gender                  11720 non-null  object        
 4   Age                     12079 non-null  int64         
 5   First Transaction Date  12079 non-null  datetime64[ns]
 6   No_Pol                  12079 non-null  int64         
 7   Car_Category            8341 non-null   object        
 8   Subject_Car_Colour      5117 non-null   object        
 9   Subject_Car_Make        9603 non-null   object        
 10  LGA_Name                5603 non-null   object        
 11  State                   5591 non-null   object        
 12  ProductName             12079 non-null  object

In [66]:
df_train.head()

Unnamed: 0,ID,Policy Start Date,Policy End Date,Gender,Age,First Transaction Date,No_Pol,Car_Category,Subject_Car_Colour,Subject_Car_Make,LGA_Name,State,ProductName,target
0,ID_0040R73,2010-05-14,2011-05-13,Male,30,2010-05-14,1,Saloon,Black,TOYOTA,,,Car Classic,0
1,ID_0046BNK,2010-11-29,2011-11-28,Female,79,2010-11-29,1,JEEP,Grey,TOYOTA,,,Car Classic,1
2,ID_005QMC3,2010-03-21,2011-03-20,Male,43,2010-03-21,1,Saloon,Red,TOYOTA,,,Car Classic,0
3,ID_0079OHW,2010-08-21,2011-08-20,Male,2,2010-08-21,1,,,,,,CarSafe,0
4,ID_00BRP63,2010-08-29,2010-12-31,Entity,20,2010-08-29,3,,,,Lagos,Lagos,Muuve,1


#### Adding timedelta

In [67]:
# Adds the Diff_days / Diff_month / Diff_year columns relative to 'Policy Start Date'.
# The helper assigns the new columns into the frame it receives, so df_train itself
# is changed even though the result is not assigned; the returned frame is what this
# cell displays. drop=False keeps the raw date columns for now.
timedelta_with_general_date(df_train, 'Policy Start Date',drop = False)

Unnamed: 0,ID,Policy Start Date,Policy End Date,Gender,Age,First Transaction Date,No_Pol,Car_Category,Subject_Car_Colour,Subject_Car_Make,LGA_Name,State,ProductName,target,Diff_daysPolicy Start Date_bt_First Transaction Date,Diff_monthPolicy Start Date_bt_First Transaction Date,Diff_yearPolicy Start Date_bt_First Transaction Date,Diff_daysPolicy Start Date_bt_Policy End Date,Diff_monthPolicy Start Date_bt_Policy End Date,Diff_yearPolicy Start Date_bt_Policy End Date
0,ID_0040R73,2010-05-14,2011-05-13,Male,30,2010-05-14,1,Saloon,Black,TOYOTA,,,Car Classic,0,0.00000,0.00000,0.00000,-364.00000,-12.00000,-1.00000
1,ID_0046BNK,2010-11-29,2011-11-28,Female,79,2010-11-29,1,JEEP,Grey,TOYOTA,,,Car Classic,1,0.00000,0.00000,0.00000,-364.00000,-12.00000,-1.00000
2,ID_005QMC3,2010-03-21,2011-03-20,Male,43,2010-03-21,1,Saloon,Red,TOYOTA,,,Car Classic,0,0.00000,0.00000,0.00000,-364.00000,-12.00000,-1.00000
3,ID_0079OHW,2010-08-21,2011-08-20,Male,2,2010-08-21,1,,,,,,CarSafe,0,0.00000,0.00000,0.00000,-364.00000,-12.00000,-1.00000
4,ID_00BRP63,2010-08-29,2010-12-31,Entity,20,2010-08-29,3,,,,Lagos,Lagos,Muuve,1,0.00000,0.00000,0.00000,-124.00000,-5.00000,-1.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12074,ID_ZZA1SES,2010-05-25,2011-05-24,Female,30,2010-05-25,1,,Black,Range Rover,Ibeju/Lekki,Ibeju-Lekki,Car Classic,1,0.00000,0.00000,0.00000,-364.00000,-12.00000,-1.00000
12075,ID_ZZDAC3K,2010-10-03,2011-10-02,Female,59,2010-10-03,1,,,,,,Car Classic,0,0.00000,0.00000,0.00000,-364.00000,-12.00000,-1.00000
12076,ID_ZZIU2XC,2010-10-10,2011-10-08,Male,34,2010-10-10,1,,,,,,CarSafe,0,0.00000,0.00000,0.00000,-363.00000,-12.00000,-1.00000
12077,ID_ZZRQ1NF,2010-02-27,2011-02-26,,120,2010-02-27,2,,White,TOYOTA,Victoria Island,Lagos,CVTP,0,0.00000,0.00000,0.00000,-364.00000,-12.00000,-1.00000


In [68]:
# Names of the parsed datetime columns, which are removed once the differences exist.
train_datetime_part = df_train.select_dtypes(include=['datetime64[ns]'])
date_cols = train_datetime_part.columns

In [69]:
date_cols

Index(['Policy Start Date', 'Policy End Date', 'First Transaction Date'], dtype='object')

In [70]:
# Remove the raw datetime columns from df_train in place.
df_train.drop(date_cols, axis='columns', inplace=True)

#### Encode categoricals

In [71]:
# Categorical features are the object-typed columns, leaving out the row identifier.
train_without_id = df_train.drop(columns='ID')
cat_cols = train_without_id.select_dtypes(include=['object']).columns

In [72]:
cat_cols

Index(['Gender', 'Car_Category', 'Subject_Car_Colour', 'Subject_Car_Make',
       'LGA_Name', 'State', 'ProductName'],
      dtype='object')

In [73]:
# Build the categorical encoder for the selected column names
# (selecting df_train[cat_cols] and reading .columns yields those same names).
CatPrep = EncodeCategorical(cat_cols)

In [74]:
df_train[cat_cols].head()

Unnamed: 0,Gender,Car_Category,Subject_Car_Colour,Subject_Car_Make,LGA_Name,State,ProductName
0,Male,Saloon,Black,TOYOTA,,,Car Classic
1,Female,JEEP,Grey,TOYOTA,,,Car Classic
2,Male,Saloon,Red,TOYOTA,,,Car Classic
3,Male,,,,,,CarSafe
4,Entity,,,,Lagos,Lagos,Muuve


In [75]:
# Apply the encoder to df_train. The result is not assigned and the next cell shows
# integer codes in the categorical columns, so the call appears to modify df_train
# in place. Missing values seem to be mapped to code 0 — TODO confirm in
# utils_data_preparation.
CatPrep(df_train)

In [76]:
df_train[cat_cols].head()

Unnamed: 0,Gender,Car_Category,Subject_Car_Colour,Subject_Car_Make,LGA_Name,State,ProductName
0,4,9,5,68,0,0,2
1,2,3,27,68,0,0,2
2,4,9,33,68,0,0,2
3,4,0,0,0,0,0,6
4,1,0,0,0,158,74,9


In [77]:
# Persist the encoder next to the data; the test section reloads 'CatPrep.pkl'
# so that test rows are encoded with what was built on the training data.
CatPrep.save(path/'CatPrep.pkl')

#### Fill Missing

In [78]:
# Numeric feature names: every column except the label that is neither
# object-typed nor a datetime.
train_without_target = df_train.drop(columns=['target'])
num_cols = train_without_target.select_dtypes(exclude=['object', 'datetime64[ns]']).columns

In [79]:
num_cols

Index(['Gender', 'Age', 'No_Pol', 'Car_Category', 'Subject_Car_Colour',
       'Subject_Car_Make', 'LGA_Name', 'State', 'ProductName',
       'Diff_daysPolicy Start Date_bt_First Transaction Date',
       'Diff_monthPolicy Start Date_bt_First Transaction Date',
       'Diff_yearPolicy Start Date_bt_First Transaction Date',
       'Diff_daysPolicy Start Date_bt_Policy End Date',
       'Diff_monthPolicy Start Date_bt_Policy End Date',
       'Diff_yearPolicy Start Date_bt_Policy End Date'],
      dtype='object')

In [80]:
# Numeric columns that still contain at least one null value.
numeric_null_counts = df_train[num_cols].isnull().sum()
missing_columns = numeric_null_counts[numeric_null_counts > 0].index

In [81]:
missing_columns

Index([], dtype='object')

In [82]:
df_train.isnull().sum().sum()

0

#### Save ready_train

In [83]:
# Renumber rows 0..n-1 and discard the previous index.
df_train = df_train.reset_index(drop=True)

In [84]:
df_train.head()

Unnamed: 0,ID,Gender,Age,No_Pol,Car_Category,Subject_Car_Colour,Subject_Car_Make,LGA_Name,State,ProductName,target,Diff_daysPolicy Start Date_bt_First Transaction Date,Diff_monthPolicy Start Date_bt_First Transaction Date,Diff_yearPolicy Start Date_bt_First Transaction Date,Diff_daysPolicy Start Date_bt_Policy End Date,Diff_monthPolicy Start Date_bt_Policy End Date,Diff_yearPolicy Start Date_bt_Policy End Date
0,ID_0040R73,4,30,1,9,5,68,0,0,2,0,0.0,0.0,0.0,-364.0,-12.0,-1.0
1,ID_0046BNK,2,79,1,3,27,68,0,0,2,1,0.0,0.0,0.0,-364.0,-12.0,-1.0
2,ID_005QMC3,4,43,1,9,33,68,0,0,2,0,0.0,0.0,0.0,-364.0,-12.0,-1.0
3,ID_0079OHW,4,2,1,0,0,0,0,0,6,0,0.0,0.0,0.0,-364.0,-12.0,-1.0
4,ID_00BRP63,1,20,3,0,0,0,158,74,9,1,0.0,0.0,0.0,-124.0,-5.0,-1.0


In [85]:
# Write the prepared training frame to the current working directory. The row index
# is written as an extra unnamed first column (index=False is not passed here, unlike
# the submission files written later in the notebook).
# NOTE(review): the name differs from the raw 'Train.csv' only by case. If `path` is
# the working directory on a case-insensitive filesystem, this would overwrite the
# raw data — confirm.
df_train.to_csv('train.csv')

## Test

### Load data

In [86]:
# Load the raw test data. `pd` and `path` are not defined in this notebook;
# presumably both come from the star import of utils_data_preparation — TODO confirm.
df_test = pd.read_csv(path/'Test.csv')

In [87]:
df_test.head()

Unnamed: 0,ID,Policy Start Date,Policy End Date,Gender,Age,First Transaction Date,No_Pol,Car_Category,Subject_Car_Colour,Subject_Car_Make,LGA_Name,State,ProductName
0,ID_01QM0NU,2010-10-23,2011-10-22,Female,46,2010-10-23,1,,,Ford,Abuja Municipal,Abuja-Municipal,Car Classic
1,ID_024NJLZ,2010-10-14,2011-10-13,Male,32,2010-10-14,1,,,,Kosofe,Benue,Car Classic
2,ID_02NOVWQ,2010-08-29,2011-08-28,Female,45,2010-08-29,2,Saloon,Black,Honda,Wuse 11,Abuja,Car Classic
3,ID_02VSP68,2010-06-13,2011-06-12,Female,58,2010-06-13,1,Saloon,,TOYOTA,,,CarSafe
4,ID_02YB37K,2010-07-01,2011-06-30,,120,2010-07-01,1,Saloon,Red,Hyundai,Victoria Island,Lagos,Car Classic


In [88]:
df_test.shape

(1202, 13)

In [89]:
df_test.describe()

Unnamed: 0,Age,No_Pol
count,1202.0,1202.0
mean,43.79285,1.2579
std,19.98624,0.61351
min,-26.0,1.0
25%,35.0,1.0
50%,41.0,1.0
75%,50.0,1.0
max,120.0,7.0


In [90]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1202 entries, 0 to 1201
Data columns (total 13 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   ID                      1202 non-null   object
 1   Policy Start Date       1202 non-null   object
 2   Policy End Date         1202 non-null   object
 3   Gender                  1161 non-null   object
 4   Age                     1202 non-null   int64 
 5   First Transaction Date  1202 non-null   object
 6   No_Pol                  1202 non-null   int64 
 7   Car_Category            830 non-null    object
 8   Subject_Car_Colour      505 non-null    object
 9   Subject_Car_Make        954 non-null    object
 10  LGA_Name                546 non-null    object
 11  State                   546 non-null    object
 12  ProductName             1202 non-null   object
dtypes: int64(2), object(11)
memory usage: 122.2+ KB


### Data preparation

In [91]:
# The three date columns are read from CSV as strings; parse each into datetimes.
for date_column in ('Policy Start Date', 'Policy End Date', 'First Transaction Date'):
    df_test[date_column] = pd.to_datetime(df_test[date_column])

In [92]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1202 entries, 0 to 1201
Data columns (total 13 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   ID                      1202 non-null   object        
 1   Policy Start Date       1202 non-null   datetime64[ns]
 2   Policy End Date         1202 non-null   datetime64[ns]
 3   Gender                  1161 non-null   object        
 4   Age                     1202 non-null   int64         
 5   First Transaction Date  1202 non-null   datetime64[ns]
 6   No_Pol                  1202 non-null   int64         
 7   Car_Category            830 non-null    object        
 8   Subject_Car_Colour      505 non-null    object        
 9   Subject_Car_Make        954 non-null    object        
 10  LGA_Name                546 non-null    object        
 11  State                   546 non-null    object        
 12  ProductName             1202 non-null   object  

In [93]:
df_test.head()

Unnamed: 0,ID,Policy Start Date,Policy End Date,Gender,Age,First Transaction Date,No_Pol,Car_Category,Subject_Car_Colour,Subject_Car_Make,LGA_Name,State,ProductName
0,ID_01QM0NU,2010-10-23,2011-10-22,Female,46,2010-10-23,1,,,Ford,Abuja Municipal,Abuja-Municipal,Car Classic
1,ID_024NJLZ,2010-10-14,2011-10-13,Male,32,2010-10-14,1,,,,Kosofe,Benue,Car Classic
2,ID_02NOVWQ,2010-08-29,2011-08-28,Female,45,2010-08-29,2,Saloon,Black,Honda,Wuse 11,Abuja,Car Classic
3,ID_02VSP68,2010-06-13,2011-06-12,Female,58,2010-06-13,1,Saloon,,TOYOTA,,,CarSafe
4,ID_02YB37K,2010-07-01,2011-06-30,,120,2010-07-01,1,Saloon,Red,Hyundai,Victoria Island,Lagos,Car Classic


#### Adding timedelta

In [94]:
# Same feature step as for the training data: adds the Diff_days / Diff_month /
# Diff_year columns relative to 'Policy Start Date'. The helper assigns the new
# columns into the frame it receives, so df_test itself is changed even though the
# result is not assigned; the returned frame is what this cell displays.
timedelta_with_general_date(df_test, 'Policy Start Date',drop = False)

Unnamed: 0,ID,Policy Start Date,Policy End Date,Gender,Age,First Transaction Date,No_Pol,Car_Category,Subject_Car_Colour,Subject_Car_Make,LGA_Name,State,ProductName,Diff_daysPolicy Start Date_bt_First Transaction Date,Diff_monthPolicy Start Date_bt_First Transaction Date,Diff_yearPolicy Start Date_bt_First Transaction Date,Diff_daysPolicy Start Date_bt_Policy End Date,Diff_monthPolicy Start Date_bt_Policy End Date,Diff_yearPolicy Start Date_bt_Policy End Date
0,ID_01QM0NU,2010-10-23,2011-10-22,Female,46,2010-10-23,1,,,Ford,Abuja Municipal,Abuja-Municipal,Car Classic,0.00000,0.00000,0.00000,-364.00000,-12.00000,-1.00000
1,ID_024NJLZ,2010-10-14,2011-10-13,Male,32,2010-10-14,1,,,,Kosofe,Benue,Car Classic,0.00000,0.00000,0.00000,-364.00000,-12.00000,-1.00000
2,ID_02NOVWQ,2010-08-29,2011-08-28,Female,45,2010-08-29,2,Saloon,Black,Honda,Wuse 11,Abuja,Car Classic,0.00000,0.00000,0.00000,-364.00000,-12.00000,-1.00000
3,ID_02VSP68,2010-06-13,2011-06-12,Female,58,2010-06-13,1,Saloon,,TOYOTA,,,CarSafe,0.00000,0.00000,0.00000,-364.00000,-12.00000,-1.00000
4,ID_02YB37K,2010-07-01,2011-06-30,,120,2010-07-01,1,Saloon,Red,Hyundai,Victoria Island,Lagos,Car Classic,0.00000,0.00000,0.00000,-364.00000,-12.00000,-1.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1197,ID_ZTTHC5X,2010-12-05,2011-12-04,Male,67,2010-12-05,2,JEEP,Black,TOYOTA,Victoria Island,Lagos,Car Classic,0.00000,0.00000,0.00000,-364.00000,-12.00000,-1.00000
1198,ID_ZUJAFUP,2010-01-14,2011-01-13,Male,43,2010-01-14,1,Saloon,Silver,Hyundai,Surulere,Lagos,Car Classic,0.00000,0.00000,0.00000,-364.00000,-12.00000,-1.00000
1199,ID_ZWHCTUM,2010-07-26,2011-07-25,Male,30,2010-07-26,1,Truck,White,Iveco,Victoria Island,Lagos,CVTP,0.00000,0.00000,0.00000,-364.00000,-12.00000,-1.00000
1200,ID_ZWQRL8L,2010-02-16,2011-02-15,Male,44,2010-02-16,2,Saloon,,Nissan,Aba North,Aba-North,Car Classic,0.00000,0.00000,0.00000,-364.00000,-12.00000,-1.00000


In [95]:
# Names of the parsed datetime columns, which are removed once the differences exist.
test_datetime_part = df_test.select_dtypes(include=['datetime64[ns]'])
date_cols = test_datetime_part.columns

In [96]:
date_cols

Index(['Policy Start Date', 'Policy End Date', 'First Transaction Date'], dtype='object')

In [97]:
# Remove the raw datetime columns from df_test in place.
df_test.drop(date_cols, axis='columns', inplace=True)

#### Encode categoricals

In [98]:
# Reload the encoder that was saved in the training section.
# NOTE(review): this rebinds the name `EncodeCategorical`, which the training section
# calls to build CatPrep, to whatever the pickle contains (presumably the saved
# encoder instance). After this cell, re-running the training cell without a kernel
# restart would no longer call the original class.
# `pickle` is not imported in this notebook; presumably it comes from the star
# import — TODO confirm. Only unpickle files from a trusted source.
with open(path/'CatPrep.pkl', 'rb') as f:
    EncodeCategorical = pickle.load(f)

In [100]:
df_test.drop(columns ='ID') .select_dtypes(include='object').head()

Unnamed: 0,Gender,Car_Category,Subject_Car_Colour,Subject_Car_Make,LGA_Name,State,ProductName
0,Female,,,Ford,Abuja Municipal,Abuja-Municipal,Car Classic
1,Male,,,,Kosofe,Benue,Car Classic
2,Female,Saloon,Black,Honda,Wuse 11,Abuja,Car Classic
3,Female,Saloon,,TOYOTA,,,CarSafe
4,,Saloon,Red,Hyundai,Victoria Island,Lagos,Car Classic


In [101]:
# Forget encoder entries for columns that df_test does not have, then rebuild
# cat_names from the entries that remain.
absent_names = [name for name in list(EncodeCategorical.categories)
                if name not in df_test.columns]
for name in absent_names:
    EncodeCategorical.categories.pop(name)
EncodeCategorical.cat_names = list(EncodeCategorical.categories)

In [102]:
# Encode df_test with the reloaded encoder. The result is not assigned and the next
# cell shows integer codes, so the call appears to modify df_test in place.
# test=True presumably reuses the categories learned on the training data instead of
# learning new ones — TODO confirm in utils_data_preparation.
EncodeCategorical(df_test,test=True)

In [104]:
df_test[cat_cols].head()

Unnamed: 0,Gender,Car_Category,Subject_Car_Colour,Subject_Car_Make,LGA_Name,State,ProductName
0,2,0,0,22,17,8,2
1,4,0,0,0,154,24,2
2,2,9,5,27,253,7,2
3,2,9,0,68,0,0,6
4,0,9,33,30,248,74,2


#### Fill Missing

In [106]:
# with open(path/'FixMiss.pkl', 'rb') as f:
#     FixMiss = pickle.load(f)

In [107]:
df_test.isnull().sum().sum()

0

#### Save ready_test

In [108]:
# Renumber rows 0..n-1 and discard the previous index.
df_test = df_test.reset_index(drop=True)

In [109]:
df_test.head()

Unnamed: 0,ID,Gender,Age,No_Pol,Car_Category,Subject_Car_Colour,Subject_Car_Make,LGA_Name,State,ProductName,Diff_daysPolicy Start Date_bt_First Transaction Date,Diff_monthPolicy Start Date_bt_First Transaction Date,Diff_yearPolicy Start Date_bt_First Transaction Date,Diff_daysPolicy Start Date_bt_Policy End Date,Diff_monthPolicy Start Date_bt_Policy End Date,Diff_yearPolicy Start Date_bt_Policy End Date
0,ID_01QM0NU,2,46,1,0,0,22,17,8,2,0.0,0.0,0.0,-364.0,-12.0,-1.0
1,ID_024NJLZ,4,32,1,0,0,0,154,24,2,0.0,0.0,0.0,-364.0,-12.0,-1.0
2,ID_02NOVWQ,2,45,2,9,5,27,253,7,2,0.0,0.0,0.0,-364.0,-12.0,-1.0
3,ID_02VSP68,2,58,1,9,0,68,0,0,6,0.0,0.0,0.0,-364.0,-12.0,-1.0
4,ID_02YB37K,0,120,1,9,33,30,248,74,2,0.0,0.0,0.0,-364.0,-12.0,-1.0


In [111]:
# Write the prepared test frame to the current working directory. The row index is
# written as an extra unnamed first column (index=False is not passed here).
# NOTE(review): the name differs from the raw 'Test.csv' only by case. If `path` is
# the working directory on a case-insensitive filesystem, this would overwrite the
# raw data — confirm.
df_test.to_csv('test.csv')

In [117]:
# Label vector, and the feature matrix: everything except the row identifier
# and the label itself.
y_train = df_train['target']
X_train = df_train.drop(columns=['ID', 'target'])

In [118]:
# Test feature matrix: everything except the row identifier. Later cells add a
# 'target' column of predictions to df_test, so it is dropped here as well when
# present. That way re-running this cell never feeds earlier predictions back in as
# a feature, and X_test keeps the same columns as X_train.
X_test = df_test.drop(columns=['ID', 'target'], errors='ignore')

## Modelling

#### Random Forest

In [119]:
# Random forest with 200 trees, entropy splits, and every feature considered at
# each split (max_features=1.0 is the fraction of features to use).
# random_state fixes the bootstrap sampling so that re-running this cell
# reproduces the same predictions and therefore the same submission file.
m = RandomForestClassifier(n_jobs=-1, n_estimators = 200, min_samples_leaf = 1,
                           criterion='entropy',
                           max_features= 1.0,
                           random_state=42)
print('Training model')
m.fit(X_train, y_train)
y_pred =  m.predict(X_test)

Training model


In [122]:
# Attach the predicted labels to the test frame as the 'target' column.
df_test = df_test.assign(target=y_pred)

In [123]:
df_test.head()

Unnamed: 0,ID,Gender,Age,No_Pol,Car_Category,Subject_Car_Colour,Subject_Car_Make,LGA_Name,State,ProductName,Diff_daysPolicy Start Date_bt_First Transaction Date,Diff_monthPolicy Start Date_bt_First Transaction Date,Diff_yearPolicy Start Date_bt_First Transaction Date,Diff_daysPolicy Start Date_bt_Policy End Date,Diff_monthPolicy Start Date_bt_Policy End Date,Diff_yearPolicy Start Date_bt_Policy End Date,target
0,ID_01QM0NU,2,46,1,0,0,22,17,8,2,0.0,0.0,0.0,-364.0,-12.0,-1.0,0
1,ID_024NJLZ,4,32,1,0,0,0,154,24,2,0.0,0.0,0.0,-364.0,-12.0,-1.0,0
2,ID_02NOVWQ,2,45,2,9,5,27,253,7,2,0.0,0.0,0.0,-364.0,-12.0,-1.0,0
3,ID_02VSP68,2,58,1,9,0,68,0,0,6,0.0,0.0,0.0,-364.0,-12.0,-1.0,0
4,ID_02YB37K,0,120,1,9,33,30,248,74,2,0.0,0.0,0.0,-364.0,-12.0,-1.0,0


In [124]:
df_test.target.value_counts()

0    1167
1      35
Name: target, dtype: int64

In [127]:
# Submission file: only the row identifier and the predicted label, no index column.
df_test.to_csv('28042021_v1.csv', columns=['ID', 'target'], index=False)

##### with default params (except `n_estimators=200`)

In [131]:
# Random forest with library defaults apart from the number of trees.
# random_state fixes the bootstrap and feature sampling so that re-running this
# cell reproduces the same predictions and therefore the same submission file.
m = RandomForestClassifier(n_jobs=-1, n_estimators = 200, random_state=42)
print('Training model')
m.fit(X_train, y_train)
y_pred =  m.predict(X_test)

Training model


In [132]:
# Replace the 'target' column with the predictions of the model just trained.
df_test = df_test.assign(target=y_pred)

In [133]:
df_test.target.value_counts()

0    1177
1      25
Name: target, dtype: int64

In [134]:
# Submission file: only the row identifier and the predicted label, no index column.
df_test.to_csv('29042021_v1.csv', columns=['ID', 'target'], index=False)

#### Balanced RF

In [128]:
# Balanced random forest with the same tree settings as the first plain forest:
# 200 trees, entropy splits, all features considered at each split.
# random_state fixes the per-tree resampling so that re-running this cell
# reproduces the same predictions and therefore the same submission file.
m = BalancedRandomForestClassifier(n_jobs=-1, n_estimators = 200, min_samples_leaf = 1,
                           criterion='entropy',
                           max_features= 1.0,
                           random_state=42)
print('Training model')
m.fit(X_train, y_train)
y_pred =  m.predict(X_test)

Training model


In [129]:
# Replace the 'target' column with the predictions of the model just trained.
df_test = df_test.assign(target=y_pred)

In [130]:
# Submission file: only the row identifier and the predicted label, no index column.
df_test.to_csv('28042021_v2.csv', columns=['ID', 'target'], index=False)

##### with default params (except `n_estimators=200` and `criterion='entropy'`)

In [136]:
# Balanced random forest with library defaults apart from the number of trees and
# the entropy split criterion.
# random_state fixes the per-tree resampling and feature sampling so that
# re-running this cell reproduces the same predictions and submission file.
m = BalancedRandomForestClassifier(n_jobs=-1, n_estimators = 200,
                           criterion='entropy',
                           random_state=42)
print('Training model')
m.fit(X_train, y_train)
y_pred =  m.predict(X_test)

Training model


In [137]:
# Replace the 'target' column with the predictions of the model just trained.
df_test = df_test.assign(target=y_pred)

In [138]:
df_test.target.value_counts()

0    719
1    483
Name: target, dtype: int64

In [139]:
# Submission file: only the row identifier and the predicted label, no index column.
df_test.to_csv('29042021_v2.csv', columns=['ID', 'target'], index=False)