# Scikit-Learn

See: http://scikit-learn.org/stable/documentation.html

*Author: Francesco Mosconi*

*Copyright &copy; 2017 CATALIT LLC*

In [None]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report


## Prepare data

In [None]:
# Load the dataset, then separate the 'lat'/'lon' feature columns from the
# 'target' label column.
df = pd.read_csv('../../../data/geoloc.csv')
X, y = df[['lat', 'lon']], df['target']

# Hold out 30% of the rows for testing; random_state=0 keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0)

## Exercise 
1. Change some of the initialization parameters of the decision tree and re-run the code.
    - Does the score change?
    - Does the decision boundary change?


In [None]:
def evaluate_model(model):
    """Fit ``model`` on the training split and print its scores.

    The model is fitted in place on the notebook-level ``X_train`` /
    ``y_train`` and then scored on both the training and the test split.
    Both scores are printed next to a majority-class benchmark: the share
    of ``y`` taken up by its most frequent label, i.e. the score reached by
    always predicting that label.

    Parameters
    ----------
    model : object
        Estimator exposing ``fit(X, y)`` and ``score(X, y)``.

    Returns
    -------
    object
        The same ``model`` instance, after fitting.
    """
    model.fit(X_train, y_train)

    # Take the largest class count instead of ``value_counts()[0]``: on
    # integer labels ``[0]`` is a label lookup, so it reports the share of
    # class 0 even when class 0 is not the majority class.
    bm_score = y.value_counts().max() / len(y)
    train_score = model.score(X_train, y_train)
    test_score = model.score(X_test, y_test)

    print("Accuracy | Benchmark: {:0.3}, Train: {:0.3}, Test: {:0.3}".format(
        bm_score, train_score, test_score))

    return model

def plot_decision_boundary(model):
    """Draw the regions predicted by ``model`` underneath a scatter of ``df``.

    A 101 x 101 grid spanning [-1.5, 1.5] on both axes is fed to
    ``model.predict``; the predictions are drawn as a translucent filled
    contour on top of a scatter plot of the notebook-level ``df`` ('lat' on
    the x axis, 'lon' on the y axis, coloured by 'target').

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict`` for two-feature input.
    """
    hticks = np.linspace(-1.5, 1.5, 101)
    vticks = np.linspace(-1.5, 1.5, 101)
    aa, bb = np.meshgrid(hticks, vticks)
    ab = np.c_[aa.ravel(), bb.ravel()]

    # An estimator fitted on a DataFrame records the column names in
    # ``feature_names_in_``; recent scikit-learn releases warn when such a
    # model is given a bare array. Label the grid columns to match when the
    # attribute exists, and keep the plain array otherwise.
    feature_names = getattr(model, 'feature_names_in_', None)
    if feature_names is not None:
        ab = pd.DataFrame(ab, columns=feature_names)

    c = model.predict(ab)
    # Back to grid shape so each prediction lines up with its (aa, bb) cell.
    cc = c.reshape(aa.shape)

    ax = df.plot(kind='scatter', c='target', x='lat', y='lon', cmap='bwr')
    ax.contourf(aa, bb, cc, cmap='bwr', alpha=0.2)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Tree capped at depth 5 with a fixed seed: fit it and print the scores,
# then draw its decision regions.
model = evaluate_model(DecisionTreeClassifier(max_depth=5, random_state=0))
plot_decision_boundary(model)

2. Try some other models, such as Logistic Regression, SVM, Naive Bayes, or any other model you like from [here](http://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html).
3. What's the highest score you can get?

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with default settings: fit, print scores, plot.
model = evaluate_model(LogisticRegression())
plot_decision_boundary(model)

In [None]:
from sklearn.svm import SVC

# Support vector classifier with default settings: fit, print scores, plot.
model = evaluate_model(SVC())
plot_decision_boundary(model)

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with default settings: fit, print scores, plot.
model = evaluate_model(GaussianNB())
plot_decision_boundary(model)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# random_state is pinned (as for the train/test split and the decision tree)
# so the printed scores and the plotted boundary are the same on every run;
# n_jobs=-1 is kept from the original cell.
model = RandomForestClassifier(n_jobs=-1, random_state=0)
model = evaluate_model(model)
plot_decision_boundary(model)

In [None]:
from sklearn.neural_network import MLPClassifier

# Two hidden layers of 10 units each, up to 500 training iterations.
# random_state is pinned (as for the train/test split and the decision tree)
# so the printed scores and the plotted boundary are the same on every run.
model = MLPClassifier(hidden_layer_sizes=(10, 10), max_iter=500,
                      random_state=0)
model = evaluate_model(model)
plot_decision_boundary(model)