## Importing data from Analyse API

In [29]:
import requests as reqs 
import math
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.externals import joblib
import coremltools

```addIndependentVariables``` is a helper function to create the complete URL

In [67]:
def addIndependentVariables(url, maxQuestions, startWithAmpersand = True):
    tmp = url
    if startWithAmpersand:
        tmp += "&"
    tmp += "questions[]=Q11.1note&questions[]=Q11.2note&questions[]=Q11.3note&questions[]=Q11.4note&questions[]=Q11.5note&questions[]=Q11.9note&questions[]=Q11.10note&questions[]=Q11.11note&questions[]=Q11.12note&questions[]=Q11.13note&questions[]=Q11.14note&questions[]=Q11.15note&questions[]=Q11.34note&questions[]=Q11.35note&"
    return tmp

In [187]:
dataset_url = addIndependentVariables('http://localhost:3500/analyse/api/interviews?questions[]=Q11.globnote', 35)
dataset_url += 'output=csv'
dataset_url

'http://10.211.55.3:3500/analyse/api/interviews?questions[]=Q11.globnote&questions[]=Q11.1note&questions[]=Q11.2note&questions[]=Q11.3note&questions[]=Q11.4note&questions[]=Q11.5note&questions[]=Q11.9note&questions[]=Q11.10note&questions[]=Q11.11note&questions[]=Q11.12note&questions[]=Q11.13note&questions[]=Q11.14note&questions[]=Q11.15note&questions[]=Q11.34note&questions[]=Q11.35note&output=csv'

Next call will create the dataframe from analyse raw data 

In [141]:
df = pd.read_csv(dataset_url, low_memory=False, verbose=True)
print df.shape
print df.dtypes

Tokenization took: 3.33 ms
Type conversion took: 2.90 ms
Parser memory cleanup took: 0.01 ms
(3803, 15)
Q11.globnote    float64
Q11.1note       float64
Q11.2note       float64
Q11.3note       float64
Q11.4note       float64
Q11.5note       float64
Q11.9note       float64
Q11.10note      float64
Q11.11note      float64
Q11.12note      float64
Q11.13note      float64
Q11.14note      float64
Q11.15note      float64
Q11.34note      float64
Q11.35note      float64
dtype: object


In [142]:
def display_all(df):
    with pd.option_context("display.max_rows", 1000, "display.max_columns", 1000): 
        display(df)

Let's replace missing values with the mean of the column

In [143]:
df.replace(-888888.88, np.NaN, inplace=True)
df.replace(-999999.99, np.NaN, inplace=True)
df.fillna(df.mean(), inplace=True)
display_all(df)

Unnamed: 0,Q11.globnote,Q11.1note,Q11.2note,Q11.3note,Q11.4note,Q11.5note,Q11.9note,Q11.10note,Q11.11note,Q11.12note,Q11.13note,Q11.14note,Q11.15note,Q11.34note,Q11.35note
0,10.0,10.000000,10.000000,10.000000,10.00000,10.000000,10.0,10.0,10.0,10.0,10.0,10.0,6.684645,7.272727,5.833333
1,7.5,2.500000,2.500000,5.000000,0.00000,5.000000,7.5,7.5,7.5,7.5,7.5,7.5,6.684645,7.272727,5.833333
2,10.0,7.500000,7.500000,10.000000,10.00000,5.000000,10.0,10.0,10.0,10.0,10.0,10.0,6.684645,7.272727,5.833333
3,9.0,5.000000,7.500000,2.500000,0.00000,10.000000,10.0,7.5,7.5,10.0,10.0,10.0,6.684645,7.272727,5.833333
4,8.0,5.000000,7.500000,5.000000,0.00000,7.500000,7.5,7.5,7.5,7.5,10.0,7.5,6.684645,7.272727,5.833333
5,9.5,10.000000,5.000000,5.000000,5.00000,10.000000,10.0,10.0,10.0,10.0,7.5,10.0,6.684645,7.272727,5.833333
6,7.5,5.000000,5.000000,7.500000,2.50000,7.500000,7.5,7.5,7.5,7.5,7.5,7.5,6.684645,7.272727,5.833333
7,10.0,7.500000,7.500000,7.500000,2.50000,7.500000,10.0,10.0,10.0,10.0,10.0,10.0,6.684645,7.272727,5.833333
8,10.0,10.000000,10.000000,10.000000,10.00000,10.000000,10.0,10.0,10.0,10.0,10.0,10.0,6.684645,7.272727,5.833333
9,7.5,5.000000,5.000000,5.000000,5.00000,5.000000,7.5,7.5,7.5,7.5,7.5,7.5,6.684645,7.272727,5.833333


Let's apply some basic maths on the globnote column

In [145]:
print df.describe()

       Q11.globnote    Q11.1note    Q11.2note    Q11.3note    Q11.4note  \
count   3803.000000  3803.000000  3803.000000  3803.000000  3803.000000   
mean       8.430318     7.142557     6.280488     6.496732     4.435450   
std        2.063914     1.884080     2.417139     1.591913     2.578251   
min        0.500000     0.000000     0.000000     0.000000     0.000000   
25%        7.500000     7.142557     5.000000     6.496732     2.500000   
50%        9.500000     7.142557     6.280488     6.496732     4.435450   
75%       10.000000     7.500000     7.500000     6.496732     5.000000   
max       10.000000    10.000000    10.000000    10.000000    10.000000   

         Q11.5note    Q11.9note   Q11.10note   Q11.11note   Q11.12note  \
count  3803.000000  3803.000000  3803.000000  3803.000000  3803.000000   
mean      7.679992     8.543255     8.403234     8.171181     8.651722   
std       1.859590     2.107053     2.263825     2.392099     2.095369   
min       0.000000     0.000

In [146]:
print df.size

57045


In [147]:
#df['Q11.globnote'] = np.log(df['Q11.globnote']) => might be necessary


## Split data into training and test sets

In [148]:
y = df['Q11.globnote']
X = df.drop('Q11.globnote', axis=1)

In [149]:
print X.head() #features (independent variables)

   Q11.1note  Q11.2note  Q11.3note  Q11.4note  Q11.5note  Q11.9note  \
0       10.0       10.0       10.0       10.0       10.0       10.0   
1        2.5        2.5        5.0        0.0        5.0        7.5   
2        7.5        7.5       10.0       10.0        5.0       10.0   
3        5.0        7.5        2.5        0.0       10.0       10.0   
4        5.0        7.5        5.0        0.0        7.5        7.5   

   Q11.10note  Q11.11note  Q11.12note  Q11.13note  Q11.14note  Q11.15note  \
0        10.0        10.0        10.0        10.0        10.0    6.684645   
1         7.5         7.5         7.5         7.5         7.5    6.684645   
2        10.0        10.0        10.0        10.0        10.0    6.684645   
3         7.5         7.5        10.0        10.0        10.0    6.684645   
4         7.5         7.5         7.5        10.0         7.5    6.684645   

   Q11.34note  Q11.35note  
0    7.272727    5.833333  
1    7.272727    5.833333  
2    7.272727    5.833333 

In [150]:
print y.head() #output (dependent variable)

0    10.0
1     7.5
2    10.0
3     9.0
4     8.0
Name: Q11.globnote, dtype: float64


In [164]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=123, stratify=y, shuffle=True)
print X_train.shape, X_test.shape
print y_train.shape, y_test.shape

(3422, 14) (381, 14)
(3422,) (381,)


In [165]:
m = RandomForestRegressor(n_jobs=-1)
m.fit(X_train, y_train)


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

```rmse```is a helper function to calculate the Root Mean Square Error of the regression model

In [166]:
def rmse(x,y): return math.sqrt(((x-y)**2).mean())

In [167]:
print "test score:"
print m.score(X_test, y_test)
print "test rmse:"
print rmse(m.predict(X_test), y_test)

test score:
0.9973968482941529
test rmse:
0.106390308007


In [173]:
print "train score:"
print m.score(X_train, y_train)
print "test rmse:"
print rmse(m.predict(X_train), y_train)

train score:
0.9993862980795044
test rmse:
0.0510626413068


## Convert to coreML

```createIndependentVariablesNamesArray``` is a helper function to create an array of names for all independent variables

In [174]:
def createIndependentVariablesNamesArray(maxQuestions):
    names = []
    for i in range(1, maxQuestions + 1 ):
       names.append("Question" + str(i))

    return names

In [175]:
coreml_model = coremltools.converters.sklearn.convert(m,"Notes", "GlobalNote")

In [176]:
coreml_model.save('OverallRating.mlmodel')

You can manually test you ios app by trying to reproduce results from the dataframe

In [186]:
X_test.loc[1, : ]

Q11.1note     2.500000
Q11.2note     2.500000
Q11.3note     5.000000
Q11.4note     0.000000
Q11.5note     5.000000
Q11.9note     7.500000
Q11.10note    7.500000
Q11.11note    7.500000
Q11.12note    7.500000
Q11.13note    7.500000
Q11.14note    7.500000
Q11.15note    6.684645
Q11.34note    7.272727
Q11.35note    5.833333
Name: 1, dtype: float64

In [181]:
y_test.loc[1]

7.5