# Tutorial: how to use [xgboost](https://github.com/dmlc/xgboost)

In [1]:
import pandas as pd
import xgboost as xgb
import numpy as np

from sklearn.dummy import DummyRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor, BaggingRegressor

In [2]:
# Load the data set; the path is relative to the notebook's working directory.
train = pd.read_csv('bike.csv')

In [3]:
# Print the column names, non-null counts and dtypes of the loaded frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10886 entries, 0 to 10885
Data columns (total 12 columns):
datetime      10886 non-null object
season        10886 non-null int64
holiday       10886 non-null int64
workingday    10886 non-null int64
weather       10886 non-null int64
temp          10886 non-null float64
atemp         10886 non-null float64
humidity      10886 non-null int64
windspeed     10886 non-null float64
casual        10886 non-null int64
registered    10886 non-null int64
count         10886 non-null int64
dtypes: float64(3), int64(8), object(1)
memory usage: 1020.6+ KB


There are **10 886** observations and **12** columns.

### High level
1. There are no missing values.
2. Most values are integers, a few are floats, and only one column is an object (it should be a date).
3. `season` and `weather` are categorical variables (each takes four possible values: 1, 2, 3, 4).


### More detailed
1. **datetime** - hourly date + timestamp
2. **season** -  
    1 = spring  
    2 = summer  
    3 = fall  
    4 = winter  
3. **holiday** - whether the day is considered a holiday
4. **workingday** - whether the day is neither a weekend nor holiday
5. **weather** -   
    1: Clear, Few clouds, Partly cloudy  
    2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist  
    3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds  
    4: Heavy Rain + Ice Pellets + Thunderstorm + Mist, Snow + Fog  
6. **temp** - temperature in Celsius
7. **atemp** - "feels like" temperature in Celsius
8. **humidity** - relative humidity
9. **windspeed** - wind speed
10. **casual** - number of non-registered user rentals initiated
11. **registered** - number of registered user rentals initiated
12. **count** - number of total rentals



## Target variable

The goal is to predict **count**.  
Note: **count** = **registered** + **casual**

## Quality function

$$ \sqrt{\frac{1}{n} \sum_{i=1}^n (\ln(p_i + 1) - \ln(a_i+1))^2 }$$

where  
**n** is the number of hours in the test set  
$p_i$ is your predicted count  
$a_i$ is the actual count  
**ln(x)** is the natural logarithm  

## Simple Feature Engineering
Let's extract the day of the month into a separate column (it is also needed for the validation split).

In [4]:
# Parse the raw strings into pandas timestamps so calendar fields can be read off them.
train['datetime'] = pd.to_datetime(train['datetime'])

# Day of month, read element by element from each timestamp.
# NOTE(review): select_features keeps only int64/float64/bool columns, so the
# resulting dtype matters; a vectorised `.dt.day` may yield a different integer
# width depending on the pandas version — confirm before switching.
train['day'] = train['datetime'].map(lambda stamp: stamp.day)

## Modeling

In [48]:
def assing_test_samples(data, last_training_day=0.3, seed=1):
    """Mark a random subset of days as the hold-out (test) set.

    A boolean ``is_test`` column is added to ``data`` in place; nothing is
    returned. Whole days are held out, so all rows sharing a ``day`` value end
    up on the same side of the split.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a ``day`` column.
    last_training_day : float, default 0.3
        Fraction of the distinct ``day`` values assigned to the test set.
        Despite its name, the value is used as a fraction, not as a day number.
    seed : int, default 1
        Seed passed to ``np.random.seed`` before shuffling the days.

    Raises
    ------
    ValueError
        If ``last_training_day`` is not within ``[0, 1]``.
    """
    if not 0 <= last_training_day <= 1:
        raise ValueError(
            'last_training_day must be a fraction in [0, 1], got {0!r}'.format(last_training_day)
        )

    # Work on a private copy so the in-place shuffle cannot touch pandas-owned memory.
    days = np.array(data['day'].unique())

    # The global NumPy RNG is (re)seeded on purpose: this mirrors the original
    # behaviour, and code run afterwards that draws from the global RNG sees
    # the same state as before.
    np.random.seed(seed)
    np.random.shuffle(days)

    # Previously the fraction was hard-coded to 0.3 and the parameter ignored.
    n_test_days = int(len(days) * last_training_day)
    test_days = days[:n_test_days]

    data['is_test'] = data['day'].isin(test_days)


def select_features(data):
    """Return the names of the columns to feed to a model.

    A column is kept when its dtype is ``int64``, ``float64`` or ``bool`` and
    it is neither a target (``count``, ``casual``, ``registered``), nor the
    ``is_test`` split marker, nor a derived column whose name contains
    ``'log'``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are inspected; it is not modified.

    Returns
    -------
    list of str
        Selected column names, in the frame's column order.
    """
    # `np.bool` was removed from NumPy (1.24), so the dtypes are selected by
    # name through pandas instead of by comparing against NumPy aliases.
    candidates = data.select_dtypes(include=['int64', 'float64', 'bool']).columns

    # `is_test` is boolean and would otherwise be picked up as a feature even
    # though it only records which side of the split a row belongs to.
    excluded = {'count', 'casual', 'registered', 'is_test'}

    return [feat for feat in candidates if feat not in excluded and 'log' not in feat]

def get_X_y(data, target_variable):
    """Split a frame into a feature matrix and a target vector.

    The feature columns are chosen by ``select_features``; the target is the
    single column named ``target_variable``. Both are returned as the
    ``.values`` of the corresponding selection, as the tuple ``(X, y)``.
    """
    feature_names = select_features(data)
    return data[feature_names].values, data[target_variable].values

def train_test_split(train, target_variable):
    """Split ``train`` into train/test arrays using its ``is_test`` column.

    Rows where ``is_test`` is False form the training part and rows where it
    is True form the test part. Returns
    ``(X_train, X_test, y_train, y_test)``.
    """
    training_rows = train[train.is_test == False]
    held_out_rows = train[train.is_test == True]

    X_test, y_test = get_X_y(held_out_rows, target_variable)
    X_train, y_train = get_X_y(training_rows, target_variable)

    return X_train, X_test, y_train, y_test



def fit_and_predict(train, model, target_variable):
    """Fit ``model`` on the training rows and predict the held-out rows.

    The split comes from ``train_test_split``. ``model`` is fitted in place
    through its ``fit`` method and then asked to ``predict`` the test features.
    Returns ``(y_test, y_pred)``: the actual test targets and the predictions.
    """
    X_train, X_test, y_train, y_test = train_test_split(train, target_variable)
    model.fit(X_train, y_train)
    return (y_test, model.predict(X_test))

def post_pred(y_pred):
    """Replace negative predictions with 0.

    The input is modified in place and the same object is returned.
    """
    is_negative = y_pred < 0
    y_pred[is_negative] = 0
    return y_pred

def rmsle(y_true, y_pred, y_pred_only_positive=True):
    """Root mean squared logarithmic error between actual and predicted values.

    Computes ``sqrt(mean((log(y_pred + 1) - log(y_true + 1)) ** 2))``.

    When ``y_pred_only_positive`` is true, the predictions are first passed
    through ``post_pred``, which zeroes negative entries in place.
    """
    if y_pred_only_positive:
        y_pred = post_pred(y_pred)

    log_pred = np.log(y_pred + 1)
    log_true = np.log(y_true + 1)
    squared_log_error = np.square(log_pred - log_true)

    return np.sqrt(squared_log_error.mean())

##########

def count_prediction(train, model, target_variable='count'):
    """Fit ``model`` on one target column and return its RMSLE on the test rows.

    For ``target_variable == 'count_log'`` the predictions are mapped back
    with ``np.exp2`` and scored against the raw ``count`` column of the test
    rows. For any other target, the predictions are scored directly against
    that target.
    """
    y_test, y_pred = fit_and_predict(train, model, target_variable)

    if target_variable == 'count_log':
        # Undo the base-2 log so the score is computed on the original scale.
        y_pred = np.exp2(y_pred)
        y_test = train[train.is_test == True]['count']

    return rmsle(y_test, y_pred)

    
def registered_casual_prediction(train, model):
    """Score a model that predicts ``registered`` and ``casual`` separately.

    The same ``model`` object is fitted first on ``registered`` and then on
    ``casual``. The two sets of predictions are added together and compared
    with the ``count`` column of the test rows using ``rmsle``.
    """
    registered_pred = fit_and_predict(train, model, 'registered')[1]
    casual_pred = fit_and_predict(train, model, 'casual')[1]

    actual_count = train[train.is_test == True]['count']
    return rmsle(actual_count, registered_pred + casual_pred)

def log_registered_casual_prediction(train, model):
    """Score a model trained on the ``registered_log`` and ``casual_log`` targets.

    The same ``model`` object is fitted first on ``registered_log`` and then on
    ``casual_log``. Each prediction is mapped back with ``np.exp2(p) - 1``. The
    two results are summed and compared with the ``count`` column of the test
    rows using ``rmsle``.
    """
    registered_log_pred = fit_and_predict(train, model, 'registered_log')[1]
    casual_log_pred = fit_and_predict(train, model, 'casual_log')[1]

    # Back-transform each part separately before adding them up.
    registered_part = np.exp2(registered_log_pred) - 1
    casual_part = np.exp2(casual_log_pred) - 1

    actual_count = train[train.is_test == True]['count']
    return rmsle(actual_count, registered_part + casual_part)
    

# Add the boolean `is_test` column to `train`, using the function's default arguments.
assing_test_samples(train)

## Dummy Regressor
The most simple one :)

In [41]:
# Score the dummy baseline and a default XGBoost regressor on the raw `count` target.
for label, regressor in [('dummy', DummyRegressor()), ('xgboost', xgb.XGBRegressor())]:
    print(label, count_prediction(train, regressor))

('dummy', 1.5758678110942455)
('xgboost', 0.71226527946687523)


## Feature engineering

Let's improve the quality a bit.

In [42]:
def etl_datetime(df):
    """Add calendar columns derived from ``df['datetime']``, in place.

    Columns added, in this order: ``year``, ``month``, ``hour``, ``minute``,
    ``dayofweek`` and ``weekend``. ``weekend`` is true when ``dayofweek`` is
    5 or 6. Nothing is returned.
    """
    # Each field is read per element from the timestamp objects.
    for field in ('year', 'month', 'hour', 'minute', 'dayofweek'):
        df[field] = df['datetime'].map(lambda stamp: getattr(stamp, field))

    df['weekend'] = df['datetime'].map(lambda stamp: stamp.dayofweek in (5, 6))

    
# Add the calendar columns to `train` in place.
etl_datetime(train)

In [43]:
# Score a default XGBoost regressor on the raw `count` target.
print('xgboost', count_prediction(train, xgb.XGBRegressor()))

('xgboost', 0.71226527946687523)


## Predict count = registered + casual

In [44]:
# Predict `registered` and `casual` separately, add them up, and score the sum against `count`.
print('xgboost', registered_casual_prediction(train, xgb.XGBRegressor()))

('xgboost', 0.73669841949326831)


## Logarithm combination - count_log, registered_log, casual_log

In [45]:
# `count_log` is a plain base-2 log (no +1); count_prediction inverts it with np.exp2.
train['count_log'] = train['count'].map(np.log2)

# `registered_log` / `casual_log` use log2(x + 1); log_registered_casual_prediction
# inverts them with np.exp2(p) - 1.
for name in ('registered', 'casual'):
    train[name + '_log'] = train[name].map(lambda value: np.log2(value + 1))

### Predict count = exp2(count_log)

In [49]:
# Train on `count_log`; count_prediction maps predictions back with np.exp2 before scoring.
print('xgboost', count_prediction(train, xgb.XGBRegressor(), 'count_log'))

('xgboost', 0.41322977213972051)


### Predict count = (exp2(registered_log) - 1) + (exp2(casual_log) - 1)

In [51]:
# Train separately on `registered_log` and `casual_log`, then score the back-transformed sum.
print('xgboost', log_registered_casual_prediction(train, xgb.XGBRegressor()))

('xgboost', 0.39339177031871342)


## Compare with other models
- DecisionTreeRegressor
- RandomForestRegressor
- ExtraTreesRegressor
- GradientBoostingRegressor
- AdaBoostRegressor
- BaggingRegressor

In [74]:
# Candidate estimators as (label, estimator) pairs.
models = [
    ('decision_tree', DecisionTreeRegressor()),
##put here other algorithms (mentioned above)
]

# Score every candidate with the log-target registered + casual approach.
for label, estimator in models:
    print(label, log_registered_casual_prediction(train, estimator))


('decision_tree', 0.43716176533139672)


## Tuning hyperparameters

In [71]:
# Small grid over tree depth and number of estimators; one score is printed per combination.
for max_depth in (2, 5, 10):
    for n_estimators in (100, 200, 300):
        params = dict(max_depth=max_depth, n_estimators=n_estimators)
        model = xgb.XGBRegressor(**params)
        print(params, log_registered_casual_prediction(train, model))

({'n_estimators': 100, 'max_depth': 2}, 0.46000367817350141)
({'n_estimators': 200, 'max_depth': 2}, 0.383063126639346)
({'n_estimators': 300, 'max_depth': 2}, 0.36217291062931611)
({'n_estimators': 100, 'max_depth': 5}, 0.33877146331666108)
({'n_estimators': 200, 'max_depth': 5}, 0.3329926658794557)
({'n_estimators': 300, 'max_depth': 5}, 0.33179067955100161)
({'n_estimators': 100, 'max_depth': 10}, 0.33338905009262376)
({'n_estimators': 200, 'max_depth': 10}, 0.33411251538787728)
({'n_estimators': 300, 'max_depth': 10}, 0.33430611210836408)


Let's try playing around with `subsample`, `learning_rate` and others...