## Explore how the size of a grid affects the speed and accuracy of Random Forests

First create a Test and Train set for evaluation

In [2]:
import pandas as pd
data_complete = pd.read_csv("data/train.csv")

In [3]:
data_complete.describe()


Unnamed: 0,row_id,x,y,accuracy,time,place_id
count,29118021.0,29118021.0,29118021.0,29118021.0,29118021.0,29118020.0
mean,14559010.0,4.99977,5.001814,82.849125,417010.364723,5493787000.0
std,8405648.775656,2.857601,2.887505,114.751772,231176.146498,2611088000.0
min,0.0,0.0,0.0,1.0,1.0,1000016000.0
25%,7279505.0,2.5347,2.4967,27.0,203057.0,3222911000.0
50%,14559010.0,5.0091,4.9883,62.0,433922.0,5518573000.0
75%,21838515.0,7.4614,7.5103,75.0,620491.0,7764307000.0
max,29118020.0,10.0,10.0,1033.0,786239.0,9999932000.0


In [4]:
data_complete.dtypes

row_id        int64
x           float64
y           float64
accuracy      int64
time          int64
place_id      int64
dtype: object

In [4]:
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; prefer model_selection and fall back only on old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Store place_id as a string so it is treated as a label rather than a number.
data_complete["place_id"] = data_complete["place_id"].astype(str)
data_complete.dtypes


row_id        int64
x           float64
y           float64
accuracy      int64
time          int64
place_id     object
dtype: object

In [6]:
data_complete.row_id.head()

0    0
1    1
2    2
3    3
4    4
Name: row_id, dtype: int64

In [5]:
# Predictors and target are kept as separate frames; labels stays a one-column
# DataFrame (not a Series) because it is later combined with pd.concat.
features = data_complete[['x', 'y', 'accuracy', 'time']]
labels = data_complete[['place_id']]

# Hold out 33% of the rows for testing.  The seed is fixed so that the split,
# and every row count and score derived from it, is reproducible on re-run.
features_train, features_test, labels_train, labels_test = train_test_split(
    features, labels, test_size=0.33, random_state=42)

In [8]:
features_train.head()

Unnamed: 0,x,y,accuracy,time
10743423,9.1939,3.3142,61,124732
9783816,4.486,2.4591,67,769076
27862384,3.1594,4.8473,63,320115
11681469,7.4505,7.5245,59,24792
21398576,9.2399,3.7571,44,303514


In [9]:
labels_train.head()

Unnamed: 0,place_id
10743423,2150384489
9783816,5230331427
27862384,7862726917
11681469,5639802423
21398576,5448746251


In [10]:
labels_train.count()

place_id    19509074
dtype: int64

In [11]:
labels_test.count()

place_id    9608947
dtype: int64

In [6]:
#A function to take a x and y range and then build and test a random forest
import time
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing

def mlExplore(x_range, y_range, x_origin=5, y_origin=5, random_state=None):
    """Fit and score a default random forest on one rectangular grid cell.

    The cell is the open rectangle whose bottom-left corner is at
    (x_origin, y_origin) and which is x_range wide and y_range tall.  Rows are
    selected from the module-level features_train / labels_train and
    features_test / labels_test frames.

    Parameters
    ----------
    x_range, y_range : float
        Width and height of the grid cell.
    x_origin, y_origin : float, optional
        Bottom-left corner of the grid cell.  Defaults to (5, 5).
    random_state : int or None, optional
        Seed passed to RandomForestClassifier.  None (the default) leaves the
        forest unseeded, so scores vary from run to run.

    Returns
    -------
    tuple
        (num_training_obs, num_labels, seconds_elapsed, score) where
        num_training_obs is the number of training rows inside the cell,
        num_labels is the number of distinct place_ids among the cell's train
        and test rows combined, seconds_elapsed covers label binarizing,
        fitting and scoring (not the filtering), and score is the value
        returned by the fitted forest's score() on the cell's test rows.

    Raises
    ------
    ValueError
        If the cell contains no training rows or no test rows.
    """
    def in_grid(points):
        # Strict inequalities: points lying exactly on an edge are excluded.
        inside_x = (points.x > x_origin) & (points.x < x_origin + x_range)
        inside_y = (points.y > y_origin) & (points.y < y_origin + y_range)
        return inside_x & inside_y

    filter_train = in_grid(features_train)
    filter_test = in_grid(features_test)

    grid_features_train = features_train[filter_train]
    grid_labels_train = labels_train[filter_train]

    grid_features_test = features_test[filter_test]
    grid_labels_test = labels_test[filter_test]

    # Fail early with a readable message instead of an error from deep inside
    # scikit-learn when the cell is too small to hold any data.
    if len(grid_features_train) == 0 or len(grid_features_test) == 0:
        raise ValueError(
            "grid cell contains no training or no test observations")

    # Record how long the machine learning takes (filtering is not timed).
    # perf_counter is monotonic, so the interval cannot be distorted by
    # system clock adjustments.
    time_start = time.perf_counter()

    # Convert the labels to a binary indicator array [n_obs, n_unique_labels].
    # The binarizer is fitted on train and test labels together so that places
    # appearing only in the test rows still get a column.
    # NOTE(review): fitting the forest on an indicator matrix presumably makes
    # this a multi-output problem scored by exact row match - confirm this is
    # intended rather than passing the place_id labels directly.
    lb = preprocessing.LabelBinarizer()
    all_possible_labels = pd.concat([grid_labels_test, grid_labels_train])
    # Labels are handed over as 1-D arrays, the documented input shape.
    lb.fit(all_possible_labels.values.ravel())
    binary_labels_test = lb.transform(grid_labels_test.values.ravel())
    binary_labels_train = lb.transform(grid_labels_train.values.ravel())

    # Count the number of points.  The label count comes from classes_ rather
    # than the indicator width, because the binarizer emits a single column
    # when there are only two classes.
    num_training_obs = binary_labels_train.shape[0]
    num_labels = len(lb.classes_)

    rf = RandomForestClassifier(random_state=random_state)  # default forest
    fitted = rf.fit(grid_features_train, binary_labels_train)
    score = fitted.score(grid_features_test, binary_labels_test)

    seconds_elapsed = time.perf_counter() - time_start

    return (num_training_obs, num_labels, seconds_elapsed, score)
    

In [24]:
mlExplore(0.5, 0.01) # (x_range, y_range)

(815, 147, 0.6701650619506836, 0.1566579634464752)

In [25]:
mlExplore(0.5, 0.05)


(4265, 420, 7.1941258907318115, 0.21037601142313184)

In [26]:
mlExplore(1, 0.1)



(16934, 1030, 80.01214504241943, 0.19116447138361115)

In [27]:
mlExplore(0.2, 0.02)

(901, 126, 0.43268489837646484, 0.20634920634920634)

In [28]:
mlExplore(0.1, 0.01)

(352, 52, 0.09108400344848633, 0.26506024096385544)

In [5]:
mlExplore(2, 0.1) # Range is about 3 std each way

(35068, 1431, 264.20747089385986, 0.20591614014933945)

In [9]:
import numpy as np

# Sweep the grid width while holding the grid height at 0.03.
x_widths = np.arange(0.1, 0.7, 0.1)
for x_range in x_widths:
    outcome = mlExplore(x_range, 0.03)
    print(x_range, outcome)

0.1 (895, 120, 0.4723820686340332, 0.15555555555555556)
0.2 (1233, 174, 0.8287239074707031, 0.16718749999999999)
0.3 (1538, 213, 1.227100133895874, 0.16326530612244897)
0.4 (1809, 256, 1.9787070751190186, 0.16153846153846155)
0.5 (2258, 299, 2.6719021797180176, 0.16932624113475178)
0.6 (2587, 335, 3.608175039291382, 0.15378670788253476)


In [10]:
# Sweep the grid height while holding the grid width at 0.4.
y_heights = np.arange(0.01, 0.05, 0.01)
for y_range in y_heights:
    outcome = mlExplore(0.4, y_range)
    print(y_range, outcome)

0.01 (616, 126, 0.300570011138916, 0.15666666666666668)
0.02 (1195, 191, 0.8511979579925537, 0.17348608837970539)
0.03 (1809, 256, 1.9568290710449219, 0.16153846153846155)
0.04 (2584, 310, 3.080793857574463, 0.18706697459584296)


In [11]:
mlExplore(1, 0.05)

(8482, 638, 22.267502069473267, 0.1765834932821497)

In [12]:
mlExplore(1.5, 0.06)

(15892, 910, 62.901066064834595, 0.21716264751154438)

In [13]:
mlExplore(0.75, 0.025)

(2931, 343, 4.030368089675903, 0.1652542372881356)

In [14]:
mlExplore(1, 0.04)

(6781, 558, 15.852082014083862, 0.16706586826347306)

In [15]:
mlExplore(0.75, 0.03)

(3444, 388, 5.213081121444702, 0.17064439140811455)

Explore in a more robust way.
Exploratory analysis showed the x spread to be 25 times the y spread, so we will fix the grid at this ratio.
Loop over a range of values and plot to see where the score starts to break down.

In [14]:
import numpy as np
xs = [0.75, 0.5, 0.25, 0.1, 0.075, 0.05, 0.04, 0.03]
results = {"range": xs, "obs": [], "places": [], "time": [], "score": []}
for x_range in xs:
    # Keep the grid 25 times wider than it is tall.
    y_range = x_range / 25.0
    num_obs, num_places, seconds, score = mlExplore(x_range, y_range)
    results["obs"].append(num_obs)
    results["places"].append(num_places)
    results["time"].append(round(seconds, 2))
    # Cast to a built-in float before rounding so the stored value displays as
    # e.g. 0.18 rather than a long numpy repr such as 0.17999999999999999.
    results["score"].append(round(float(score), 2))

In [15]:
results

{'obs': [3458, 1543, 527, 132, 71, 22, 12, 6],
 'places': [388, 224, 90, 27, 17, 6, 4, 3],
 'range': [0.75, 0.5, 0.25, 0.1, 0.075, 0.05, 0.04, 0.03],
 'score': [0.17999999999999999,
  0.17000000000000001,
  0.22,
  0.32000000000000001,
  0.31,
  0.42999999999999999,
  0.33000000000000002,
  0.0],
 'time': [5.12, 1.26, 0.19, 0.04, 0.07, 0.02, 0.02, 0.02]}

In [29]:
import numpy as np
# Grid widths to try; the height is always the width divided by 25.
xs = [0.5, 0.3, 0.2, 0.1, 0.09, 0.08, 0.07, 0.06, 0.055, 0.05, 0.045]
results = {"range": xs, "obs": [], "places": [], "time": [], "score": []}
for x_range in xs:
    y_range = x_range / 25.0
    num_obs, num_places, seconds, score = mlExplore(x_range, y_range)
    row = {
        "obs": num_obs,
        "places": num_places,
        "time": round(seconds, 2),
        "score": round(float(score), 2),
    }
    # Append each measurement to its column in the results table.
    for column, value in row.items():
        results[column].append(value)
pd.DataFrame(results)

Unnamed: 0,obs,places,range,score,time
0,1543,224,0.5,0.18,1.28
1,679,121,0.3,0.2,0.31
2,383,71,0.2,0.24,0.12
3,132,27,0.1,0.3,0.04
4,107,24,0.09,0.29,0.03
5,84,19,0.08,0.32,0.03
6,55,14,0.07,0.38,0.02
7,32,11,0.06,0.35,0.02
8,25,9,0.055,0.23,0.02
9,22,6,0.05,0.43,0.02


Consider using lots of small grid cells with an x range of 0.075. Remember that the counts in the table above cover only the training split (67% of the data), so with all the data each grid would hold roughly 50% more observations.

 
 Possibly create the actual evaluation metric too
 
 Possibly make the grid 3D, with accuracy as the third dimension. Again, set the ratio of the sides of the grid equal to the ratio of the features' standard deviations for a particular place. *Accuracy is not linearly distributed.*
 
 If each grid is 0.075*0.003 then the number of grids is (10/0.075)*2 x (10/0.003)*2 = 1.8 million (still fewer than the number of points? The test set has 8 million).
 
 To execute in spark we will round the x and ys to 0.075 and 0.003 and then group the data to get 1.8Million points and associated string of test_row_ids
 
 Building 2 Million Random Forest models might be slow?

## Repeat above but time the whole function (including filtering data)

In [34]:
# Convert 1e6 seconds to hours (presumably ~1 million models at ~1 s each - TODO confirm).
1e6 / 60/60

277.77777777777777

In [35]:
#A function to take a x and y range and then build and test a random forest
import time
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing

def mlExplore(x_range, y_range, x_origin=5, y_origin=5, random_state=None):
    """Fit and score a default random forest on one rectangular grid cell.

    The cell is the open rectangle whose bottom-left corner is at
    (x_origin, y_origin) and which is x_range wide and y_range tall.  Rows are
    selected from the module-level features_train / labels_train and
    features_test / labels_test frames.

    Unlike a timing of the model alone, the reported time here also includes
    filtering the training rows; filtering the test rows is not timed.

    Parameters
    ----------
    x_range, y_range : float
        Width and height of the grid cell.
    x_origin, y_origin : float, optional
        Bottom-left corner of the grid cell.  Defaults to (5, 5).
    random_state : int or None, optional
        Seed passed to RandomForestClassifier.  None (the default) leaves the
        forest unseeded, so scores vary from run to run.

    Returns
    -------
    tuple
        (num_training_obs, num_labels, seconds_elapsed, score) where
        num_training_obs is the number of training rows inside the cell,
        num_labels is the number of distinct place_ids among the cell's train
        and test rows combined, seconds_elapsed covers training-row filtering,
        label binarizing, fitting and scoring, and score is the value returned
        by the fitted forest's score() on the cell's test rows.

    Raises
    ------
    ValueError
        If the cell contains no training rows or no test rows.
    """
    def in_grid(points):
        # Strict inequalities: points lying exactly on an edge are excluded.
        inside_x = (points.x > x_origin) & (points.x < x_origin + x_range)
        inside_y = (points.y > y_origin) & (points.y < y_origin + y_range)
        return inside_x & inside_y

    # Test rows are selected before the timer starts.
    filter_test = in_grid(features_test)
    grid_features_test = features_test[filter_test]
    grid_labels_test = labels_test[filter_test]

    # Fail early with a readable message instead of an error from deep inside
    # scikit-learn when the cell holds no test data.
    if len(grid_features_test) == 0:
        raise ValueError("grid cell contains no test observations")

    # Record how long the training-row filtering and machine learning take.
    # perf_counter is monotonic, so the interval cannot be distorted by
    # system clock adjustments.
    time_start = time.perf_counter()
    filter_train = in_grid(features_train)

    grid_features_train = features_train[filter_train]
    grid_labels_train = labels_train[filter_train]

    if len(grid_features_train) == 0:
        raise ValueError("grid cell contains no training observations")

    # Convert the labels to a binary indicator array [n_obs, n_unique_labels].
    # The binarizer is fitted on train and test labels together so that places
    # appearing only in the test rows still get a column.
    # NOTE(review): fitting the forest on an indicator matrix presumably makes
    # this a multi-output problem scored by exact row match - confirm this is
    # intended rather than passing the place_id labels directly.
    lb = preprocessing.LabelBinarizer()
    all_possible_labels = pd.concat([grid_labels_test, grid_labels_train])
    # Labels are handed over as 1-D arrays, the documented input shape.
    lb.fit(all_possible_labels.values.ravel())
    binary_labels_test = lb.transform(grid_labels_test.values.ravel())
    binary_labels_train = lb.transform(grid_labels_train.values.ravel())

    # Count the number of points.  The label count comes from classes_ rather
    # than the indicator width, because the binarizer emits a single column
    # when there are only two classes.
    num_training_obs = binary_labels_train.shape[0]
    num_labels = len(lb.classes_)

    rf = RandomForestClassifier(random_state=random_state)  # default forest
    fitted = rf.fit(grid_features_train, binary_labels_train)
    score = fitted.score(grid_features_test, binary_labels_test)

    seconds_elapsed = time.perf_counter() - time_start

    return (num_training_obs, num_labels, seconds_elapsed, score)

# Re-run the fixed-ratio sweep (height = width / 25) with the new timing.
xs = [0.5, 0.3, 0.2, 0.1, 0.09, 0.08, 0.07, 0.06, 0.055, 0.05, 0.045]
results = {"range": xs, "obs": [], "places": [], "time": [], "score": []}
for x_range in xs:
    y_range = x_range / 25.0
    num_obs, num_places, seconds, score = mlExplore(x_range, y_range)
    measurements = (
        ("obs", num_obs),
        ("places", num_places),
        ("time", round(seconds, 2)),
        ("score", round(float(score), 2)),
    )
    # Append each measurement to its column in the results table.
    for column, value in measurements:
        results[column].append(value)
pd.DataFrame(results)
    

Unnamed: 0,obs,places,range,score,time
0,1543,224,0.5,0.18,3.47
1,679,121,0.3,0.19,2.12
2,383,71,0.2,0.22,1.7
3,132,27,0.1,0.32,1.53
4,107,24,0.09,0.38,1.46
5,84,19,0.08,0.38,1.34
6,55,14,0.07,0.42,1.32
7,32,11,0.06,0.18,1.54
8,25,9,0.055,0.23,1.36
9,22,6,0.05,0.43,1.37


Expect each model to take about 1.5 seconds
To train 2 Million of them would take 833 hours!
Would need to distribute the problem over a few hundred cores
Would cost £40 to compute on microsoft azure

In [36]:
# Hours needed to fit 2 million models at about 1.5 seconds each (seconds -> hours).
2e6 * 1.5 /(60*60)

833.3333333333334

### Tune parameters of Random Forest for the small tree with about 100 observations and 20 places to choose from
 First need to add in the extra variables created from time