In [1]:
import pandas as pd
import numpy as np
import os

os.getcwd()

'/Users/toby/Documents/PycharmProjects/Kaggle/microsoft_malware_prediction'

In [2]:
from sklearn.model_selection import train_test_split
from xgboost.sklearn import XGBRegressor
from sklearn import metrics   #Additional scklearn functions
from sklearn.model_selection import cross_validate, GridSearchCV #Perforing grid search
import xgboost as xgb
from sklearn.metrics import r2_score
from sklearn.metrics import roc_auc_score

In [3]:
# !pip install --upgrade scikit-learn

In [4]:
# reference one:
# https://www.kaggle.com/artgor/is-this-malware-eda-fe-and-lgb-updated

### 1. Load data
Let's load data in the clean way: 1. specify their types, 2. reduce memory useage.

In [7]:
#https://www.kaggle.com/theoviel/load-the-totality-of-the-data
dtypes = {
        'MachineIdentifier':                                    'category',
        'ProductName':                                          'category',
        'EngineVersion':                                        'category',
        'AppVersion':                                           'category',
        'AvSigVersion':                                         'category',
        'IsBeta':                                               'int8',
        'RtpStateBitfield':                                     'float16',
        'IsSxsPassiveMode':                                     'int8',
        'DefaultBrowsersIdentifier':                            'float16',
        'AVProductStatesIdentifier':                            'float32',
        'AVProductsInstalled':                                  'float16',
        'AVProductsEnabled':                                    'float16',
        'HasTpm':                                               'int8',
        'CountryIdentifier':                                    'int16',
        'CityIdentifier':                                       'float32',
        'OrganizationIdentifier':                               'float16',
        'GeoNameIdentifier':                                    'float16',
        'LocaleEnglishNameIdentifier':                          'int8',
        'Platform':                                             'category',
        'Processor':                                            'category',
        'OsVer':                                                'category',
        'OsBuild':                                              'int16',
        'OsSuite':                                              'int16',
        'OsPlatformSubRelease':                                 'category',
        'OsBuildLab':                                           'category',
        'SkuEdition':                                           'category',
        'IsProtected':                                          'float16',
        'AutoSampleOptIn':                                      'int8',
        'PuaMode':                                              'category',
        'SMode':                                                'float16',
        'IeVerIdentifier':                                      'float16',
        'SmartScreen':                                          'category',
        'Firewall':                                             'float16',
        'UacLuaenable':                                         'float32',
        'Census_MDC2FormFactor':                                'category',
        'Census_DeviceFamily':                                  'category',
        'Census_OEMNameIdentifier':                             'float16',
        'Census_OEMModelIdentifier':                            'float32',
        'Census_ProcessorCoreCount':                            'float16',
        'Census_ProcessorManufacturerIdentifier':               'float16',
        'Census_ProcessorModelIdentifier':                      'float16',
        'Census_ProcessorClass':                                'category',
        'Census_PrimaryDiskTotalCapacity':                      'float32',
        'Census_PrimaryDiskTypeName':                           'category',
        'Census_SystemVolumeTotalCapacity':                     'float32',
        'Census_HasOpticalDiskDrive':                           'int8',
        'Census_TotalPhysicalRAM':                              'float32',
        'Census_ChassisTypeName':                               'category',
        'Census_InternalPrimaryDiagonalDisplaySizeInInches':    'float16',
        'Census_InternalPrimaryDisplayResolutionHorizontal':    'float16',
        'Census_InternalPrimaryDisplayResolutionVertical':      'float16',
        'Census_PowerPlatformRoleName':                         'category',
        'Census_InternalBatteryType':                           'category',
        'Census_InternalBatteryNumberOfCharges':                'float32',
        'Census_OSVersion':                                     'category',
        'Census_OSArchitecture':                                'category',
        'Census_OSBranch':                                      'category',
        'Census_OSBuildNumber':                                 'int16',
        'Census_OSBuildRevision':                               'int32',
        'Census_OSEdition':                                     'category',
        'Census_OSSkuName':                                     'category',
        'Census_OSInstallTypeName':                             'category',
        'Census_OSInstallLanguageIdentifier':                   'float16',
        'Census_OSUILocaleIdentifier':                          'int16',
        'Census_OSWUAutoUpdateOptionsName':                     'category',
        'Census_IsPortableOperatingSystem':                     'int8',
        'Census_GenuineStateName':                              'category',
        'Census_ActivationChannel':                             'category',
        'Census_IsFlightingInternal':                           'float16',
        'Census_IsFlightsDisabled':                             'float16',
        'Census_FlightRing':                                    'category',
        'Census_ThresholdOptIn':                                'float16',
        'Census_FirmwareManufacturerIdentifier':                'float16',
        'Census_FirmwareVersionIdentifier':                     'float32',
        'Census_IsSecureBootEnabled':                           'int8',
        'Census_IsWIMBootEnabled':                              'float16',
        'Census_IsVirtualDevice':                               'float16',
        'Census_IsTouchEnabled':                                'int8',
        'Census_IsPenCapable':                                  'int8',
        'Census_IsAlwaysOnAlwaysConnectedCapable':              'float16',
        'Wdft_IsGamer':                                         'float16',
        'Wdft_RegionIdentifier':                                'float16',
        'HasDetections':                                        'int8'
        }

def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage(deep=True).sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage(deep=True).sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [8]:
train_df = pd.read_csv('/Users/toby/Downloads/Data/microsoft_malware_prediction/train.csv', dtype=dtypes)
train_df = reduce_mem_usage(train_df)

Mem. usage decreased to 2363.03 Mb (0.0% reduction)


In [9]:
print(len(train_df))
train_df.head()

8921483


Unnamed: 0,MachineIdentifier,ProductName,EngineVersion,AppVersion,AvSigVersion,IsBeta,RtpStateBitfield,IsSxsPassiveMode,DefaultBrowsersIdentifier,AVProductStatesIdentifier,...,Census_FirmwareVersionIdentifier,Census_IsSecureBootEnabled,Census_IsWIMBootEnabled,Census_IsVirtualDevice,Census_IsTouchEnabled,Census_IsPenCapable,Census_IsAlwaysOnAlwaysConnectedCapable,Wdft_IsGamer,Wdft_RegionIdentifier,HasDetections
0,0000028988387b115f69f31a3bf04f09,win8defender,1.1.15100.1,4.18.1807.18075,1.273.1735.0,0,7.0,0,,53447.0,...,36144.0,0,,0.0,0,0,0.0,0.0,10.0,0
1,000007535c3f730efa9ea0b7ef1bd645,win8defender,1.1.14600.4,4.13.17134.1,1.263.48.0,0,7.0,0,,53447.0,...,57858.0,0,,0.0,0,0,0.0,0.0,8.0,0
2,000007905a28d863f6d0d597892cd692,win8defender,1.1.15100.1,4.18.1807.18075,1.273.1341.0,0,7.0,0,,53447.0,...,52682.0,0,,0.0,0,0,0.0,0.0,3.0,0
3,00000b11598a75ea8ba1beea8459149f,win8defender,1.1.15100.1,4.18.1807.18075,1.273.1527.0,0,7.0,0,,53447.0,...,20050.0,0,,0.0,0,0,0.0,0.0,3.0,1
4,000014a5f00daa18e76b81417eeb99fc,win8defender,1.1.15100.1,4.18.1807.18075,1.273.1379.0,0,7.0,0,,53447.0,...,19844.0,0,0.0,0.0,0,0,0.0,0.0,1.0,1


In [10]:
train_df.describe()

Unnamed: 0,IsBeta,RtpStateBitfield,IsSxsPassiveMode,DefaultBrowsersIdentifier,AVProductStatesIdentifier,AVProductsInstalled,AVProductsEnabled,HasTpm,CountryIdentifier,CityIdentifier,...,Census_FirmwareVersionIdentifier,Census_IsSecureBootEnabled,Census_IsWIMBootEnabled,Census_IsVirtualDevice,Census_IsTouchEnabled,Census_IsPenCapable,Census_IsAlwaysOnAlwaysConnectedCapable,Wdft_IsGamer,Wdft_RegionIdentifier,HasDetections
count,8921483.0,8889165.0,8921483.0,433438.0,8885262.0,8885262.0,8885262.0,8921483.0,8921483.0,8596074.0,...,8761350.0,8921483.0,3261780.0,8905530.0,8921483.0,8921483.0,8850140.0,8618032.0,8618032.0,8921483.0
mean,7.509962e-06,,0.01733378,,47840.02,,,0.9879711,108.049,81266.5,...,33027.93,0.4860229,0.0,0.0,0.1255431,0.03807091,,,,0.4997927
std,0.002740421,0.0,0.1305118,,14032.37,0.0,0.0,0.1090149,63.04706,48923.39,...,21206.91,0.4998046,0.0,0.0,0.3313338,0.1913675,0.0,0.0,0.0,0.5
min,0.0,0.0,0.0,1.0,3.0,0.0,0.0,0.0,1.0,5.0,...,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
25%,0.0,7.0,0.0,788.0,49480.0,1.0,1.0,1.0,51.0,36825.0,...,13156.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0
50%,0.0,7.0,0.0,1632.0,53447.0,1.0,1.0,1.0,97.0,82373.0,...,33070.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0
75%,0.0,7.0,0.0,2372.0,53447.0,2.0,1.0,1.0,162.0,123700.0,...,52436.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,11.0,1.0
max,1.0,35.0,1.0,3212.0,70507.0,7.0,5.0,1.0,222.0,167962.0,...,72105.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,15.0,1.0


### 2. EDA Data Exploration

#### 2.1 show the distribution of each feature

We can use the code below, taking advantage of value_counts, finding the missing value percentage, and the majority population.

In [11]:
# example, show distribution of unique values under a feature (and make it to %)
train_df['SmartScreen'].value_counts(normalize=True, dropna=False) * 100

RequireAdmin    48.379658
NaN             35.610795
ExistsNotSet    11.726559
Off              2.091054
Warn             1.518615
Prompt           0.387077
Block            0.252570
off              0.015132
On               0.008194
&#x02;           0.004663
&#x01;           0.003755
on               0.001648
requireadmin     0.000112
OFF              0.000045
0                0.000034
Promt            0.000022
&#x03;           0.000011
Enabled          0.000011
prompt           0.000011
warn             0.000011
00000000         0.000011
requireAdmin     0.000011
Name: SmartScreen, dtype: float64

In [12]:
# next, we do this to each features.
stats = []
for col in train_df.columns:
    stats.append((col, train_df[col].nunique(), train_df[col].isnull().sum() * 100 / train_df.shape[0], 
                  train_df[col].value_counts(normalize=True, dropna=False).values[0] * 100, 
                  train_df[col].dtype))
    
stats_df = pd.DataFrame(stats, columns=['Feature', 'Unique_values', 'Percentage of missing values', 
                                        'Percentage of majority population in this category', 'type'])
stats_df.sort_values('Percentage of missing values', ascending=False)
# stats_df
# del stats_df

Unnamed: 0,Feature,Unique_values,Percentage of missing values,Percentage of majority population in this category,type
28,PuaMode,2,99.974119,99.974119,category
41,Census_ProcessorClass,3,99.589407,99.589407,category
8,DefaultBrowsersIdentifier,1730,95.141637,95.141637,float16
68,Census_IsFlightingInternal,2,83.044030,83.044030,float16
52,Census_InternalBatteryType,78,71.046809,71.046809,category
71,Census_ThresholdOptIn,2,63.524472,63.524472,float16
75,Census_IsWIMBootEnabled,2,63.439038,63.439038,float16
31,SmartScreen,21,35.610795,48.379658,category
15,OrganizationIdentifier,49,30.841487,47.037662,float16
29,SMode,2,6.027686,93.928812,float16


#### 2.2 removing bad quality features.

In [13]:
good_cols = list(train_df.columns)
bad_removed = []
miss_removed = []
for col in train_df.columns:
    missing_rate = train_df[col].isnull().sum() / train_df.shape[0]
    rate = train_df[col].value_counts(normalize=True, dropna=False).values[0]
    # remove any column where the majority of the group is more than 90% of the population
    if rate > 0.9:
        good_cols.remove(col)
        bad_removed.append((col,rate))
        
    # remove any column that more than 80% is missing
    if missing_rate > 0.9:
        try:
            good_cols.remove(col)
            miss_removed.append((col,missing_rate))
        except:
            pass

In [14]:
# print(bad_removed)
# print(miss_removed)
for name, percent in miss_removed:
    print('feature {} is removed due to {} missing value'.format(name, round(percent, 2)))    
print("\n\n")

for name, percent in bad_removed:
    print('feature {} is removed due to majority counts for {}% population'.format(name, round(percent*100, 2)))





feature ProductName is removed due to majority counts for 98.94% population
feature IsBeta is removed due to majority counts for 100.0% population
feature RtpStateBitfield is removed due to majority counts for 96.97% population
feature IsSxsPassiveMode is removed due to majority counts for 98.27% population
feature DefaultBrowsersIdentifier is removed due to majority counts for 95.14% population
feature AVProductsEnabled is removed due to majority counts for 97.0% population
feature HasTpm is removed due to majority counts for 98.8% population
feature Platform is removed due to majority counts for 96.61% population
feature Processor is removed due to majority counts for 90.85% population
feature OsVer is removed due to majority counts for 96.76% population
feature IsProtected is removed due to majority counts for 94.18% population
feature AutoSampleOptIn is removed due to majority counts for 100.0% population
feature PuaMode is removed due to majority counts for 99.97% population
fe

In [15]:
print(good_cols)
train_df = train_df[good_cols]

['MachineIdentifier', 'EngineVersion', 'AppVersion', 'AvSigVersion', 'AVProductStatesIdentifier', 'AVProductsInstalled', 'CountryIdentifier', 'CityIdentifier', 'OrganizationIdentifier', 'GeoNameIdentifier', 'LocaleEnglishNameIdentifier', 'OsBuild', 'OsSuite', 'OsPlatformSubRelease', 'OsBuildLab', 'SkuEdition', 'IeVerIdentifier', 'SmartScreen', 'Census_MDC2FormFactor', 'Census_OEMNameIdentifier', 'Census_OEMModelIdentifier', 'Census_ProcessorCoreCount', 'Census_ProcessorManufacturerIdentifier', 'Census_ProcessorModelIdentifier', 'Census_PrimaryDiskTotalCapacity', 'Census_PrimaryDiskTypeName', 'Census_SystemVolumeTotalCapacity', 'Census_TotalPhysicalRAM', 'Census_ChassisTypeName', 'Census_InternalPrimaryDiagonalDisplaySizeInInches', 'Census_InternalPrimaryDisplayResolutionHorizontal', 'Census_InternalPrimaryDisplayResolutionVertical', 'Census_PowerPlatformRoleName', 'Census_InternalBatteryType', 'Census_InternalBatteryNumberOfCharges', 'Census_OSVersion', 'Census_OSBranch', 'Census_OSBui

In [16]:
# check out target distribution

train_df['HasDetections'].value_counts()

0    4462591
1    4458892
Name: HasDetections, dtype: int64

It turns out that we have a very evenly distributed binary classification case. This is nice.

In [None]:
train_df.info()

In [None]:
train_df.head()

In [None]:
train_df.head()

In [None]:
# # read testing data
# test_dtypes = {k: v for k, v in dtypes.items() if k in good_cols}
# test = pd.read_csv('../input/microsoft-malware-prediction/test.csv', dtype=test_dtypes, usecols=good_cols[:-1])
# test.loc[6529507, 'OsBuildLab'] = '17134.1.amd64fre.rs4_release.180410-1804'
# test = reduce_mem_usage(test)

### 3. Simple Trees
To see how data cleaning and engineering will effect our result, let's apply XGBoost on df with fewer columns and add more info gradually.

In [17]:
numerics = ['int8','int16', 'int32', 'int64', 'float16', 'float32', 'float64']

df1 = train_df.select_dtypes(include=numerics)

##### naive 1 - numerical features only and rows with missing value are dropped.
As you can see, if we drop rows with missing value, abt 90% of the data are removed.

In [18]:
print(len(df1))
df1 = df1.dropna()
print(len(df1))
df1.head()

8921483
911967


Unnamed: 0,AVProductStatesIdentifier,AVProductsInstalled,CountryIdentifier,CityIdentifier,OrganizationIdentifier,GeoNameIdentifier,LocaleEnglishNameIdentifier,OsBuild,OsSuite,IeVerIdentifier,...,Census_IsFlightingInternal,Census_ThresholdOptIn,Census_FirmwareManufacturerIdentifier,Census_FirmwareVersionIdentifier,Census_IsSecureBootEnabled,Census_IsWIMBootEnabled,Census_IsTouchEnabled,Wdft_IsGamer,Wdft_RegionIdentifier,HasDetections
5,53447.0,1.0,97,13598.0,27.0,126.0,124,17134,256,137.0,...,0.0,0.0,93.0,51039.0,0,0.0,0,0.0,15.0,1
7,53447.0,1.0,97,150323.0,27.0,126.0,124,14393,768,94.0,...,0.0,0.0,512.0,63122.0,0,0.0,0,0.0,15.0,0
30,53447.0,1.0,68,59605.0,27.0,276.0,74,14393,256,94.0,...,0.0,0.0,897.0,64640.0,0,0.0,0,1.0,12.0,0
42,53447.0,1.0,207,111352.0,27.0,277.0,75,15063,768,108.0,...,0.0,0.0,355.0,20317.0,0,0.0,0,0.0,13.0,1
58,53447.0,1.0,178,136271.0,27.0,230.0,71,9600,768,323.0,...,0.0,0.0,554.0,33076.0,1,0.0,0,0.0,1.0,0


In [19]:
X = df1.drop('HasDetections',1)
y = df1['HasDetections']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
X_train.head()

Unnamed: 0,AVProductStatesIdentifier,AVProductsInstalled,CountryIdentifier,CityIdentifier,OrganizationIdentifier,GeoNameIdentifier,LocaleEnglishNameIdentifier,OsBuild,OsSuite,IeVerIdentifier,...,Census_OSUILocaleIdentifier,Census_IsFlightingInternal,Census_ThresholdOptIn,Census_FirmwareManufacturerIdentifier,Census_FirmwareVersionIdentifier,Census_IsSecureBootEnabled,Census_IsWIMBootEnabled,Census_IsTouchEnabled,Wdft_IsGamer,Wdft_RegionIdentifier
7883013,53447.0,1.0,51,85471.0,27.0,98.0,103,10586,768,88.0,...,26,0.0,0.0,142.0,9554.0,1,0.0,1,0.0,6.0
6600656,53386.0,2.0,43,75201.0,18.0,53.0,42,10586,768,74.0,...,158,0.0,0.0,142.0,12536.0,0,0.0,1,1.0,7.0
7410758,44095.0,2.0,80,146255.0,27.0,101.0,107,17134,768,137.0,...,28,0.0,0.0,142.0,56982.0,0,0.0,0,0.0,3.0
2782024,53447.0,1.0,169,32408.0,27.0,209.0,-77,17134,768,137.0,...,123,0.0,0.0,556.0,63424.0,1,0.0,0,0.0,3.0
1126753,7945.0,2.0,122,89520.0,27.0,89.0,88,17134,256,137.0,...,49,0.0,0.0,803.0,54535.0,0,0.0,0,0.0,11.0


In [22]:
xgb1 = XGBRegressor()
parameters = {'nthread':[4], #when use hyperthread, xgboost may become slower
              'objective':['reg:linear'],
              'learning_rate': [.08, .07], #so called `eta` value
              'max_depth': [5, 6],
              'min_child_weight': [3, 4],
              'silent': [0],
              'subsample': [0.7,0.75],
              'colsample_bytree': [0.7,0.71],
              'n_estimators': [600]}

xgb_grid = GridSearchCV(xgb1,
                        parameters,
                        cv = 3,
                        n_jobs = 2,
                        verbose=True)

xgb_grid.fit(X_train, y_train)
print(xgb_grid.best_score_)
print(xgb_grid.best_params_)

Fitting 3 folds for each of 32 candidates, totalling 96 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[14:53:30] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 62 extra nodes, 0 pruned nodes, max_depth=5
[14:53:30] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 62 extra nodes, 0 pruned nodes, max_depth=5
[14:53:31] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 62 extra nodes, 0 pruned nodes, max_depth=5
[14:53:32] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 62 extra nodes, 0 pruned nodes, max_depth=5
[14:53:33] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 62 extra nodes, 0 pruned nodes, max_depth=5
[14:53:34] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 62 extra nodes, 0 pruned nodes, max_depth=5
[14:53:35] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 62 extra nodes, 0 pruned nodes, max_depth=5
[14:53:36] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 62 extra nodes, 0 pruned nodes, max_depth=5
[14:53:36] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 62 extra nodes, 0 pruned nodes, max_

[14:54:37] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 60 extra nodes, 0 pruned nodes, max_depth=5
[14:54:38] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 56 extra nodes, 0 pruned nodes, max_depth=5
[14:54:39] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 60 extra nodes, 0 pruned nodes, max_depth=5
[14:54:40] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 56 extra nodes, 0 pruned nodes, max_depth=5
[14:54:41] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 62 extra nodes, 0 pruned nodes, max_depth=5
[14:54:42] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 58 extra nodes, 0 pruned nodes, max_depth=5
[14:54:44] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 62 extra nodes, 0 pruned nodes, max_depth=5
[14:54:45] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 60 extra nodes, 0 pruned nodes, max_depth=5
[14:54:46] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 62 extra nodes, 0 pruned nodes, max_

KeyboardInterrupt: 

[14:55:40] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 62 extra nodes, 0 pruned nodes, max_depth=5
[14:55:41] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 62 extra nodes, 0 pruned nodes, max_depth=5
[14:55:42] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 58 extra nodes, 0 pruned nodes, max_depth=5
[14:55:43] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 42 extra nodes, 0 pruned nodes, max_depth=5
[14:55:44] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 46 extra nodes, 0 pruned nodes, max_depth=5
[14:55:45] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 56 extra nodes, 0 pruned nodes, max_depth=5
[14:55:45] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 60 extra nodes, 0 pruned nodes, max_depth=5
[14:55:46] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 62 extra nodes, 0 pruned nodes, max_depth=5
[14:55:47] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 58 extra nodes, 0 pruned nodes, max_

[14:56:48] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 62 extra nodes, 0 pruned nodes, max_depth=5
[14:56:50] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 62 extra nodes, 0 pruned nodes, max_depth=5
[14:56:51] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 62 extra nodes, 0 pruned nodes, max_depth=5
[14:56:53] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 60 extra nodes, 0 pruned nodes, max_depth=5
[14:56:54] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 60 extra nodes, 0 pruned nodes, max_depth=5
[14:56:55] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 34 extra nodes, 0 pruned nodes, max_depth=5
[14:56:56] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 62 extra nodes, 0 pruned nodes, max_depth=5
[14:56:57] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 32 extra nodes, 0 pruned nodes, max_depth=5
[14:56:58] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 54 extra nodes, 0 pruned nodes, max_

KeyboardInterrupt: 

KeyboardInterrupt: 

[14:57:14] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 60 extra nodes, 0 pruned nodes, max_depth=5
[14:57:15] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 62 extra nodes, 0 pruned nodes, max_depth=5
[14:57:16] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 60 extra nodes, 0 pruned nodes, max_depth=5
[14:57:17] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 62 extra nodes, 0 pruned nodes, max_depth=5
[14:57:18] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 50 extra nodes, 0 pruned nodes, max_depth=5
[14:57:19] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 60 extra nodes, 0 pruned nodes, max_depth=5
[14:57:20] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 32 extra nodes, 0 pruned nodes, max_depth=5
[14:57:21] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 62 extra nodes, 0 pruned nodes, max_depth=5
[14:57:22] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 48 extra nodes, 0 pruned nodes, max_

KeyboardInterrupt: 

In [26]:
gbm = xgb.XGBClassifier(max_depth=3, n_estimators=300, learning_rate=0.05).fit(X_train, y_train)
predictions = gbm.predict(X_test)

In [29]:
r2_score(y_test.values, predictions)


-0.5501845695411574

In [31]:

roc_auc_score(y_test.values, predictions)


0.6131979499805501

In [28]:
# gbm.dump_model('dump.raw.txt')


AttributeError: 'XGBClassifier' object has no attribute 'dump_model'