# Random Forests

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
import math
import random

%matplotlib inline
pd.set_option('display.max_columns', None)
pd.options.mode.chained_assignment = None 

rand_seed = 123
random.seed(rand_seed)
np.random.seed(rand_seed)

In [2]:
train = pd.read_csv("train.csv")
validation = pd.read_csv("validation.csv")
test = pd.read_csv("test.csv")

## Feature Engineering

In [9]:
def UsertagCategories(df):
    
    df = df["usertag"].dropna().reset_index(drop = True)
    usertags_list = [df[i].split(",") for i in range(df.shape[0])]
    usertags = np.unique(list(itertools.chain.from_iterable(usertags_list)))
    usertags = [tag for tag in usertags if len(tag) > 0]
    
    return usertags

In [10]:
usertags_train = UsertagCategories(train)
usertags_validation = UsertagCategories(validation)

In [12]:
def FeatureEngineering(df):
    
        # Convert numerical to categorical
        df["weekday_cat"] = df["weekday"].map(lambda x: str(x))
        df["hour_cat"] = df["hour"].map(lambda x: str(x))
        df["region_cat"] = df["region"].map(lambda x: str(x))
        df["city_cat"] = df["city"].map(lambda x: str(x))
        df["adexchange_cat"] = df["adexchange"].map(lambda x: str(x))
        df["advertiser_cat"] = df["advertiser"].map(lambda x: str(x))
        
        # Operating system
        df["os"] = df["useragent"].map(lambda x: x.split("_")[0])
        
        # Browser
        df["browser"] = df["useragent"].map(lambda x: x.split("_")[1])
        
        # Slotarea
        df["slotarea"] = df["slotwidth"]*df["slotheight"]   ####.astype("category")
       
        # Slotprice binning
        df["slotprice_cat"] = 0
        
        df.loc[ df["slotprice"] <= 10, "slotprice_cat"] = 0
        df.loc[ (df["slotprice"] > 10) & (df["slotprice"] <= 50), "slotprice_cat"] = 1
        df.loc[ (df["slotprice"] > 50) & (df["slotprice"] <= 100), "slotprice_cat"] = 2
        df.loc[ df["slotprice"] > 100, "slotprice_cat"] = 3

        
        # Usertags
        
        df['usertag'] = df['usertag'].astype(str)
        for tag in usertags_train:
            col_name = "usertag_" + tag
            df[col_name] = df["usertag"].map(lambda x: 1 if tag in x.split(",") else 0)
        
  
        return df


def DropColumns(df):
    
    columns = ["weekday", "hour", "bidid", "userid", "useragent", "IP", "domain", "url", "urlid", "slotid",
               "slotwidth", "slotheight", "keypage", "usertag", "region", "city", "adexchange", "advertiser"]
    df.drop(columns, axis = 1, inplace = True)
    
    return df


def GetDummies(df):
    
    df = pd.get_dummies(df)
    return df

In [13]:
all_data = pd.concat((train, validation), axis = 0)

In [15]:
all_data = FeatureEngineering(all_data)

In [17]:
# Drop useless features
all_data = DropColumns(all_data)

In [18]:
all_data = all_data.drop(columns = ['creative'])

In [19]:
all_data.shape

(2734906, 85)

In [21]:
# Convert categorical features to numerical using dummy variables
all_data_dummy = GetDummies(all_data)

In [22]:
c = all_data_dummy.columns
c

Index(['click', 'slotprice', 'bidprice', 'payprice', 'slotarea',
       'slotprice_cat', 'usertag_10006', 'usertag_10024', 'usertag_10031',
       'usertag_10048',
       ...
       'part_of_the_day_Saturday_Night', 'part_of_the_day_Sunday_Evening',
       'part_of_the_day_Sunday_Morning', 'part_of_the_day_Sunday_Night',
       'part_of_the_day_Tuesday_Evening', 'part_of_the_day_Tuesday_Morning',
       'part_of_the_day_Tuesday_Night', 'part_of_the_day_Wednesday_Evening',
       'part_of_the_day_Wednesday_Morning', 'part_of_the_day_Wednesday_Night'],
      dtype='object', length=572)

In [23]:
all_data_dummy.head()

Unnamed: 0,click,slotprice,bidprice,payprice,slotarea,slotprice_cat,usertag_10006,usertag_10024,usertag_10031,usertag_10048,usertag_10052,usertag_10057,usertag_10059,usertag_10063,usertag_10067,usertag_10074,usertag_10075,usertag_10076,usertag_10077,usertag_10079,usertag_10083,usertag_10093,usertag_10102,usertag_10110,usertag_10111,usertag_10114,usertag_10115,usertag_10116,usertag_10117,usertag_10118,usertag_10120,usertag_10123,usertag_10125,usertag_10126,usertag_10127,usertag_10129,usertag_10130,usertag_10131,usertag_10133,usertag_10138,usertag_10140,usertag_10142,usertag_10145,usertag_10146,usertag_10147,usertag_10148,usertag_10149,usertag_10684,usertag_11092,usertag_11278,usertag_11379,usertag_11423,usertag_11512,usertag_11576,usertag_11632,usertag_11680,usertag_11724,usertag_11944,usertag_13042,usertag_13403,usertag_13496,usertag_13678,usertag_13776,usertag_13800,usertag_13866,usertag_13874,usertag_14273,usertag_15398,usertag_16593,usertag_16617,usertag_16661,usertag_16706,usertag_16751,usertag_16753,slotvisibility_0,slotvisibility_1,slotvisibility_2,slotvisibility_255,slotvisibility_FifthView,slotvisibility_FirstView,slotvisibility_FourthView,slotvisibility_Na,slotvisibility_OtherView,slotvisibility_SecondView,slotvisibility_ThirdView,slotformat_0,slotformat_1,slotformat_5,slotformat_Na,weekday_cat_0,weekday_cat_1,weekday_cat_2,weekday_cat_3,weekday_cat_4,weekday_cat_5,weekday_cat_6,hour_cat_0,hour_cat_1,hour_cat_10,hour_cat_11,hour_cat_12,hour_cat_13,hour_cat_14,hour_cat_15,hour_cat_16,hour_cat_17,hour_cat_18,hour_cat_19,hour_cat_2,hour_cat_20,hour_cat_21,hour_cat_22,hour_cat_23,hour_cat_3,hour_cat_4,hour_cat_5,hour_cat_6,hour_cat_7,hour_cat_8,hour_cat_9,region_cat_0,region_cat_1,region_cat_106,region_cat_124,region_cat_134,region_cat_146,region_cat_15,region_cat_164,region_cat_183,region_cat_2,region_cat_201,region_cat_216,region_cat_238,region_cat_253,region_cat_27,region_cat_275,region_cat_276,region_cat_298,region_cat_3,region_cat_308,region_cat_325,region_cat_333,region_cat_344,region_cat_359,region_cat_368,region_cat_374,region_cat_393,region_cat_394,region_cat_395,region_cat_40,region_cat_55,region_cat_65,region_cat_79,region_cat_80,region_cat_94,city_cat_0,city_cat_1,city_cat_10,city_cat_100,city_cat_101,city_cat_102,city_cat_103,city_cat_104,city_cat_105,city_cat_106,city_cat_107,city_cat_108,city_cat_109,city_cat_11,city_cat_110,city_cat_111,city_cat_112,city_cat_113,city_cat_114,city_cat_115,city_cat_116,city_cat_117,city_cat_118,city_cat_119,city_cat_12,city_cat_120,city_cat_121,city_cat_122,city_cat_123,city_cat_124,city_cat_125,city_cat_126,city_cat_127,city_cat_128,city_cat_129,city_cat_13,city_cat_130,city_cat_131,city_cat_132,city_cat_133,city_cat_134,city_cat_135,city_cat_136,city_cat_137,city_cat_138,city_cat_139,city_cat_14,city_cat_140,city_cat_141,city_cat_142,city_cat_143,city_cat_144,city_cat_145,city_cat_146,city_cat_147,city_cat_148,city_cat_149,city_cat_15,city_cat_150,city_cat_151,city_cat_152,city_cat_153,city_cat_154,city_cat_155,city_cat_156,city_cat_157,city_cat_158,city_cat_159,city_cat_16,city_cat_160,city_cat_161,city_cat_162,city_cat_163,city_cat_164,city_cat_165,city_cat_166,city_cat_167,city_cat_168,city_cat_169,city_cat_17,city_cat_170,city_cat_171,city_cat_172,city_cat_173,city_cat_174,city_cat_175,city_cat_176,city_cat_177,city_cat_178,city_cat_179,city_cat_18,city_cat_180,city_cat_181,city_cat_183,city_cat_184,city_cat_185,city_cat_186,city_cat_187,city_cat_188,city_cat_189,city_cat_19,city_cat_190,city_cat_191,city_cat_192,city_cat_193,city_cat_194,city_cat_195,city_cat_196,city_cat_2,city_cat_20,city_cat_201,city_cat_202,city_cat_203,city_cat_204,city_cat_205,city_cat_206,city_cat_207,city_cat_208,city_cat_209,city_cat_21,city_cat_210,city_cat_211,city_cat_212,city_cat_213,city_cat_214,city_cat_215,city_cat_216,city_cat_217,city_cat_218,city_cat_219,city_cat_22,city_cat_220,city_cat_221,city_cat_222,city_cat_223,city_cat_224,city_cat_225,city_cat_226,city_cat_227,city_cat_228,city_cat_229,city_cat_23,city_cat_230,city_cat_231,city_cat_232,city_cat_233,city_cat_234,city_cat_235,city_cat_236,city_cat_237,city_cat_238,city_cat_239,city_cat_24,city_cat_240,city_cat_241,city_cat_242,city_cat_243,city_cat_244,city_cat_245,city_cat_246,city_cat_247,city_cat_248,city_cat_249,city_cat_25,city_cat_250,city_cat_251,city_cat_252,city_cat_253,city_cat_254,city_cat_255,city_cat_26,city_cat_27,city_cat_275,city_cat_276,city_cat_277,city_cat_278,city_cat_279,city_cat_28,city_cat_280,city_cat_281,city_cat_282,city_cat_283,city_cat_284,city_cat_285,city_cat_286,city_cat_287,city_cat_288,city_cat_289,city_cat_29,city_cat_290,city_cat_291,city_cat_292,city_cat_293,city_cat_294,city_cat_295,city_cat_296,city_cat_297,city_cat_299,city_cat_3,city_cat_30,city_cat_300,city_cat_301,city_cat_302,city_cat_303,city_cat_304,city_cat_305,city_cat_306,city_cat_307,city_cat_308,city_cat_309,city_cat_31,city_cat_310,city_cat_311,city_cat_312,city_cat_313,city_cat_314,city_cat_315,city_cat_316,city_cat_317,city_cat_318,city_cat_319,city_cat_32,city_cat_320,city_cat_321,city_cat_322,city_cat_323,city_cat_324,city_cat_325,city_cat_326,city_cat_327,city_cat_328,city_cat_329,city_cat_33,city_cat_330,city_cat_331,city_cat_332,city_cat_333,city_cat_334,city_cat_335,city_cat_336,city_cat_337,city_cat_338,city_cat_339,city_cat_34,city_cat_340,city_cat_341,city_cat_342,city_cat_343,city_cat_344,city_cat_345,city_cat_346,city_cat_347,city_cat_348,city_cat_349,city_cat_35,city_cat_350,city_cat_351,city_cat_352,city_cat_353,city_cat_354,city_cat_355,city_cat_356,city_cat_357,city_cat_358,city_cat_359,city_cat_36,city_cat_360,city_cat_361,city_cat_362,city_cat_363,city_cat_364,city_cat_365,city_cat_366,city_cat_367,city_cat_368,city_cat_369,city_cat_37,city_cat_370,city_cat_371,city_cat_372,city_cat_373,city_cat_374,city_cat_375,city_cat_376,city_cat_377,city_cat_378,city_cat_379,city_cat_38,city_cat_380,city_cat_381,city_cat_382,city_cat_383,city_cat_384,city_cat_385,city_cat_386,city_cat_387,city_cat_388,city_cat_39,city_cat_393,city_cat_394,city_cat_395,city_cat_396,city_cat_397,city_cat_398,city_cat_399,city_cat_4,city_cat_40,city_cat_41,city_cat_42,city_cat_43,city_cat_44,city_cat_45,city_cat_46,city_cat_47,city_cat_48,city_cat_49,city_cat_5,city_cat_50,city_cat_51,city_cat_52,city_cat_53,city_cat_54,city_cat_56,city_cat_57,city_cat_58,city_cat_59,city_cat_6,city_cat_60,city_cat_61,city_cat_62,city_cat_63,city_cat_64,city_cat_65,city_cat_66,city_cat_67,city_cat_68,city_cat_69,city_cat_7,city_cat_70,city_cat_71,city_cat_72,city_cat_73,city_cat_74,city_cat_75,city_cat_76,city_cat_77,city_cat_78,city_cat_79,city_cat_8,city_cat_80,city_cat_81,city_cat_82,city_cat_83,city_cat_84,city_cat_85,city_cat_86,city_cat_87,city_cat_88,city_cat_89,city_cat_9,city_cat_90,city_cat_91,city_cat_92,city_cat_93,city_cat_94,city_cat_95,city_cat_96,city_cat_97,city_cat_98,city_cat_99,adexchange_cat_1.0,adexchange_cat_2.0,adexchange_cat_3.0,adexchange_cat_4.0,adexchange_cat_nan,advertiser_cat_1458,advertiser_cat_2259,advertiser_cat_2261,advertiser_cat_2821,advertiser_cat_2997,advertiser_cat_3358,advertiser_cat_3386,advertiser_cat_3427,advertiser_cat_3476,os_android,os_ios,os_linux,os_mac,os_other,os_windows,browser_chrome,browser_firefox,browser_ie,browser_maxthon,browser_opera,browser_other,browser_safari,browser_sogou,browser_theworld,part_of_the_day_Friday_Evening,part_of_the_day_Friday_Morning,part_of_the_day_Friday_Night,part_of_the_day_Monday_Evening,part_of_the_day_Monday_Morning,part_of_the_day_Monday_Night,part_of_the_day_Saturday_Evening,part_of_the_day_Saturday_Morning,part_of_the_day_Saturday_Night,part_of_the_day_Sunday_Evening,part_of_the_day_Sunday_Morning,part_of_the_day_Sunday_Night,part_of_the_day_Tuesday_Evening,part_of_the_day_Tuesday_Morning,part_of_the_day_Tuesday_Night,part_of_the_day_Wednesday_Evening,part_of_the_day_Wednesday_Morning,part_of_the_day_Wednesday_Night
0,0,5,238,5,40000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,294,23,75000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,5,238,24,62500,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
3,0,0,300,25,96000,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
4,0,133,277,133,65520,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [24]:
train_dummy = all_data_dummy[:train.shape[0]]
validation_dummy = all_data_dummy[train.shape[0]:]

In [25]:
train_dummy.shape, validation_dummy.shape

((2430981, 572), (303925, 572))

## Downsampling

In [30]:
train_majority = train_dummy[train_dummy.click==0]
train_minority = train_dummy[train_dummy.click==1]
train_minority.shape[0] 

1793

In [31]:
w = 0.0073 # class 0, downsampling rate

train_majority_downsampled = train_majority.sample(n = int(w * train_majority.shape[0]), 
                                                         replace=False, random_state=1)
print(len(train_majority_downsampled))

train = pd.concat([train_majority_downsampled, train_minority]).sample(frac=1)
print(len(train))

17733
19526


In [32]:
train.shape

(19526, 572)

In [33]:

X = train.loc[:, ((train.columns != "click") & 
                          (train.columns != "payprice") &
                          (train.columns != "bidprice") &
                          (train.columns != "slotprice"))].values
y = train["click"].values

X_validation = validation_dummy.loc[:, ((validation_dummy.columns != "click") & 
                                            (validation_dummy.columns != "payprice") & 
                                            (validation_dummy.columns != "bidprice") &
                                            (train.columns != "slotprice"))]
y_validation = validation_dummy["click"]
    


In [34]:
X.shape, y.shape, X_validation.shape, y_validation.shape

((19526, 568), (19526,), (303925, 568), (303925,))

In [35]:
clicks = train.loc[train["click"] == 1].shape[0]


## Random Forests


In [36]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

#### Grid-search for the best parameters

In [37]:
# rfc=RandomForestClassifier(random_state=42)

# param_grid = {    
#     'n_estimators': [100, 200, 500],
#     'max_features': ['auto', 'sqrt', 'log2'],
#     'max_depth' : [4,5,6,7,8],
#     'criterion' :['gini', 'entropy']
# }

# CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5)
# CV_rfc.fit(X, y)

# CV_rfc.best_params_

In [38]:
clf = RandomForestClassifier(random_state=42, max_features='auto', n_estimators= 100, max_depth=7, criterion='entropy', 
                             class_weight= 'balanced')

#Train the model using the training sets y_pred=clf.predict(X_test)
clf.fit(X,y)


RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='entropy', max_depth=7, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=None, oob_score=False,
            random_state=42, verbose=0, warm_start=False)

In [39]:
y_pred=clf.predict(X_validation)

In [40]:
from sklearn import metrics

print (metrics.classification_report(y_validation, y_pred))
print (metrics.confusion_matrix(y_validation, y_pred))
print ('PD RandomForest AU ROC: ', metrics.roc_auc_score(y_validation,  clf.predict_proba(X_validation)[:,1] ))

print("Accuracy:",metrics.accuracy_score(y_validation, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.94      0.97    303723
           1       0.01      0.57      0.01       202

   micro avg       0.94      0.94      0.94    303925
   macro avg       0.50      0.76      0.49    303925
weighted avg       1.00      0.94      0.97    303925

[[286140  17583]
 [    87    115]]
PD RandomForest AU ROC:  0.8276007779756849
Accuracy: 0.9418606564119437


In [41]:
# predict pCTR for each row of the validation set

pCTR = clf.predict_proba(X_validation)[:,1]  # αν εβαζες 0 θα ηταν το συμπληρωματικο το CTR
print(pCTR)

[0.45953433 0.38414397 0.3782576  ... 0.45782526 0.41153867 0.39128649]


In [42]:
# Perform Calibration because of the initial downsampling
predicted_CTR = pCTR/(pCTR+((1-pCTR)/w))
predicted_CTR

array([0.00616858, 0.00453278, 0.00442156, ..., 0.00612653, 0.0050793 ,
       0.00467059])