# Refactor R Modeling in Python

Source: https://buzzfeednews.github.io/2017-08-spy-plane-finder/ <br>
Creator: peter.aldhous@buzzfeed.com

## Observation:

The model could not be reproduced in Python. <br>
Will require further study.

---

### 01. Import Python module

In [50]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

### 02. Read data files

In [52]:
x = pd.read_csv('~/Sandbox/DAMG7245_Assignment_01/data/raw/planes_features.csv')
y = pd.read_csv('~/Sandbox/DAMG7245_Assignment_01/data/raw/train.csv')

### 03. Merge the target variable `class` from train.csv 

In [53]:
data = pd.merge(x,y,how='inner', on='adshex')
data

Unnamed: 0,adshex,duration1,duration2,duration3,duration4,duration5,boxes1,boxes2,boxes3,boxes4,...,steer4,steer5,steer6,steer7,steer8,flights,squawk_1,observations,type,class
0,A00002,0.517241,0.103448,0.103448,0.103448,0.172414,0.862069,0.137931,0.000000,0.000000,...,0.244015,0.034070,0.202578,0.021179,0.068140,29,0,1086,SHIP,other
1,A00220,0.000000,0.254902,0.176471,0.313725,0.254902,0.058824,0.372549,0.294118,0.215686,...,0.375998,0.132030,0.120011,0.008611,0.006906,51,0,11149,RV10,other
2,A0041E,0.142857,0.285714,0.000000,0.571429,0.000000,0.285714,0.142857,0.285714,0.285714,...,0.657617,0.090498,0.078431,0.010558,0.019608,7,0,663,SR22,other
3,A00889,0.000000,0.120000,0.200000,0.080000,0.600000,0.000000,0.200000,0.120000,0.280000,...,0.814361,0.065339,0.023907,0.001276,0.001702,25,7760,11754,SR22,other
4,A008BE,0.000000,0.300000,0.200000,0.200000,0.300000,0.000000,0.300000,0.200000,0.300000,...,0.436620,0.092958,0.145070,0.001408,0.009859,10,1200,710,PA24,other
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
592,ADD003,0.000000,0.244898,0.081633,0.122449,0.551020,0.000000,0.020408,0.326531,0.224490,...,0.625548,0.096850,0.082271,0.005097,0.002753,49,1200,9809,BE20,other
593,ADD1C7,0.162791,0.046512,0.209302,0.279070,0.302326,0.162791,0.093023,0.139535,0.302326,...,0.521264,0.025516,0.125152,0.012151,0.040097,43,0,823,BE58,other
594,ADEDB8,0.104167,0.166667,0.291667,0.145833,0.291667,0.104167,0.145833,0.187500,0.250000,...,0.356629,0.085509,0.203700,0.015827,0.013155,48,1340,4865,PA46,other
595,ADEE39,0.348485,0.121212,0.136364,0.227273,0.166667,0.424242,0.257576,0.136364,0.090909,...,0.294544,0.106229,0.193143,0.011830,0.022211,66,0,4142,C172,other


### 04. Check for NaN values

In [5]:
data.isna().sum()

adshex          0
duration1       0
duration2       0
duration3       0
duration4       0
duration5       0
boxes1          0
boxes2          0
boxes3          0
boxes4          0
boxes5          0
speed1          0
speed2          0
speed3          0
speed4          0
speed5          0
altitude1       0
altitude2       0
altitude3       0
altitude4       0
altitude5       0
steer1          0
steer2          0
steer3          0
steer4          0
steer5          0
steer6          0
steer7          0
steer8          0
flights         0
squawk_1        0
observations    0
type            0
class           0
dtype: int64

### 05. Check for datatypes and validate

`type` and `class` are objects, apply LabelEncoding.

In [54]:
data.dtypes

adshex           object
duration1       float64
duration2       float64
duration3       float64
duration4       float64
duration5       float64
boxes1          float64
boxes2          float64
boxes3          float64
boxes4          float64
boxes5          float64
speed1          float64
speed2          float64
speed3          float64
speed4          float64
speed5          float64
altitude1       float64
altitude2       float64
altitude3       float64
altitude4       float64
altitude5       float64
steer1          float64
steer2          float64
steer3          float64
steer4          float64
steer5          float64
steer6          float64
steer7          float64
steer8          float64
flights           int64
squawk_1          int64
observations      int64
type             object
class            object
dtype: object

### 06. Create list of independent variables 

In [55]:
col = list(data.columns)
col.remove('adshex')
col.remove('class')
print(col)

['duration1', 'duration2', 'duration3', 'duration4', 'duration5', 'boxes1', 'boxes2', 'boxes3', 'boxes4', 'boxes5', 'speed1', 'speed2', 'speed3', 'speed4', 'speed5', 'altitude1', 'altitude2', 'altitude3', 'altitude4', 'altitude5', 'steer1', 'steer2', 'steer3', 'steer4', 'steer5', 'steer6', 'steer7', 'steer8', 'flights', 'squawk_1', 'observations', 'type']


### 07. Apply label encoding for object dtype columns and verify

In [56]:
label_encoder = LabelEncoder()
label_encoder.fit(list(set(list(data['type'].values))))
data['type'] = label_encoder.transform(data['type']).astype(str)

In [57]:
label_encoder = LabelEncoder()
label_encoder.fit(list(set(list(y['class'].values))))
y['class'] = label_encoder.transform(y['class'])

In [58]:
data

Unnamed: 0,adshex,duration1,duration2,duration3,duration4,duration5,boxes1,boxes2,boxes3,boxes4,...,steer4,steer5,steer6,steer7,steer8,flights,squawk_1,observations,type,class
0,A00002,0.517241,0.103448,0.103448,0.103448,0.172414,0.862069,0.137931,0.000000,0.000000,...,0.244015,0.034070,0.202578,0.021179,0.068140,29,0,1086,119,other
1,A00220,0.000000,0.254902,0.176471,0.313725,0.254902,0.058824,0.372549,0.294118,0.215686,...,0.375998,0.132030,0.120011,0.008611,0.006906,51,0,11149,108,other
2,A0041E,0.142857,0.285714,0.000000,0.571429,0.000000,0.285714,0.142857,0.285714,0.285714,...,0.657617,0.090498,0.078431,0.010558,0.019608,7,0,663,121,other
3,A00889,0.000000,0.120000,0.200000,0.080000,0.600000,0.000000,0.200000,0.120000,0.280000,...,0.814361,0.065339,0.023907,0.001276,0.001702,25,7760,11754,121,other
4,A008BE,0.000000,0.300000,0.200000,0.200000,0.300000,0.000000,0.300000,0.200000,0.300000,...,0.436620,0.092958,0.145070,0.001408,0.009859,10,1200,710,97,other
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
592,ADD003,0.000000,0.244898,0.081633,0.122449,0.551020,0.000000,0.020408,0.326531,0.224490,...,0.625548,0.096850,0.082271,0.005097,0.002753,49,1200,9809,13,other
593,ADD1C7,0.162791,0.046512,0.209302,0.279070,0.302326,0.162791,0.093023,0.139535,0.302326,...,0.521264,0.025516,0.125152,0.012151,0.040097,43,0,823,18,other
594,ADEDB8,0.104167,0.166667,0.291667,0.145833,0.291667,0.104167,0.145833,0.187500,0.250000,...,0.356629,0.085509,0.203700,0.015827,0.013155,48,1340,4865,103,other
595,ADEE39,0.348485,0.121212,0.136364,0.227273,0.166667,0.424242,0.257576,0.136364,0.090909,...,0.294544,0.106229,0.193143,0.011830,0.022211,66,0,4142,26,other


In [32]:
y

Unnamed: 0,adshex,class
0,A00C4B,1
1,A0AB21,1
2,A0AE77,1
3,A0AE7C,1
4,A0C462,1
...,...,...
592,A647B0,0
593,A40B3B,0
594,A5F2A8,0
595,AA4DA9,0


### 08. Train a model using Random Forest 

In [59]:
model = RandomForestClassifier(n_estimators=2000,criterion='entropy',max_depth=None,random_state=415)
model.fit(data[col], y['class'])

RandomForestClassifier(criterion='entropy', n_estimators=2000, random_state=415)

### 09. List the feature matrix

In [49]:
inputVariables = data[col]
sorted_idx = model.feature_importances_.argsort()
featureImp = pd.DataFrame(columns=['Variables','Importance'])
featureImp['Variables']=list(inputVariables.columns[sorted_idx])
featureImp['Importance']=list(model.feature_importances_[sorted_idx])
featureImp = featureImp.sort_values(by='Importance', ascending=False)
featureImp

Unnamed: 0,Variables,Importance
31,duration1,0.040877
30,steer7,0.036659
29,speed2,0.036198
28,steer2,0.036036
27,observations,0.035969
26,steer4,0.035914
25,speed5,0.035697
24,steer5,0.035331
23,steer3,0.033702
22,speed4,0.033475


## 10. Compare the result with R model import variables

![image](../images/plot.png)

## 11. Observation

R Model:
Important variables:
1. steer2
2. squawk_1
3. steer1

Python:
Important variables:
1. duration1
2. steer7
3. speed2

The top important variables differs in order and cannot come to a conclusion that the model is re-produced.
