# Scikit-Learn

See: http://scikit-learn.org/stable/documentation.html

*Author: Francesco Mosconi*

*Copyright &copy; 2017 CATALIT LLC*

In [None]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt

## 1. Read data from Files

In [None]:
# Load the geolocation CSV into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='data/geoloc.csv')
df.head(n=5)

In [None]:
# Scatter plot of lat vs. lon, with each point coloured by the 'target' column.
df.plot.scatter(x='lat', y='lon', c='target', cmap='bwr')

## 2. Define features (X) and target (y)

In [None]:
# Feature matrix: the two coordinate columns. Label vector: the 'target' column.
X = df.loc[:, ['lat', 'lon']]
y = df.loc[:, 'target']

## 3. Train/Test split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples as a test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
    test_size=0.3,
)

## 4. Fit a Decision Tree model

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Depth-limited decision tree with a fixed seed, trained on the training split only.
model = DecisionTreeClassifier(random_state=0, max_depth=3)
model.fit(X=X_train, y=y_train)

## 5. Accuracy score on benchmark, train and test sets

In [None]:
# Benchmark: the accuracy obtained by always predicting the most frequent class.
# Take the largest class count explicitly; indexing value_counts() with [0]
# looks up the label 0, which is only the right answer when class 0 happens
# to be the majority class.
bm_score = y.value_counts().max() / len(y)

# Model accuracy on the data it was trained on and on the held-out data.
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)

In [None]:
# Report the three accuracies side by side, each with 3 significant digits.
score_report = "Accuracy | Benchmark: {:0.3}, Train: {:0.3}, Test: {:0.3}"
print(score_report.format(bm_score, train_score, test_score))

## 6. Confusion Matrix and Classification Report

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

# Predicted labels for the held-out test samples.
y_pred = model.predict(X=X_test)

In [None]:
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

# Wrap the raw matrix in a labelled table: rows are the actual classes,
# columns are the predicted classes.
pd.DataFrame(data=cm, index=["Miss", "Hit"], columns=['pred_Miss', 'pred_Hit'])

In [None]:
# Text summary of per-class metrics for the test-set predictions.
print(classification_report(y_true=y_test, y_pred=y_pred))

## 7. Display the tree

In [None]:
# `sklearn.externals.six` is no longer shipped with scikit-learn; the standard
# library's StringIO is a drop-in replacement for the in-memory text buffer.
from io import StringIO
import pydotplus
from sklearn.tree import export_graphviz
from IPython.display import Image

# Write the fitted tree as Graphviz DOT source into an in-memory buffer.
dot_data = StringIO()
export_graphviz(model, out_file=dot_data,
                feature_names=list(X.columns),  # plain list of column names
                class_names=['Miss', 'Hit'],
                filled=True, rounded=True,
                special_characters=True)

# Render the DOT source to a PNG and display it inline.
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())

## 8. Display the decision boundary

In [None]:
# Build a 101 x 101 grid of points covering [-1.5, 1.5] on both axes.
hticks = np.linspace(-1.5, 1.5, 101)
vticks = np.linspace(-1.5, 1.5, 101)
aa, bb = np.meshgrid(hticks, vticks)

# One row per grid point: first column is the horizontal coordinate, second
# the vertical one. Label the columns like the training features so the model
# receives the same feature names it was fitted with (avoids scikit-learn's
# feature-name mismatch warning when predicting on a bare array).
ab = pd.DataFrame(np.c_[aa.ravel(), bb.ravel()], columns=X.columns)

# Predict a class for every grid point, then fold back into the grid shape.
c = model.predict(ab)
cc = c.reshape(aa.shape)

# Overlay the translucent predicted regions on the scatter plot of the data.
ax = df.plot(kind='scatter', c='target', x='lat', y='lon', cmap='bwr')
ax.contourf(aa, bb, cc, cmap='bwr', alpha=0.2)

## Exercise 


Iterate and improve on the decision tree model. Now you have a basic pipeline example. How can you improve the score? Try some of the following:

1. Change some of the initialization parameters of the decision tree and re-run the code.
    - Does the score change?
    - Does the decision boundary change?
2. Try some other model, like Logistic Regression, SVM, Naive Bayes, or any other model you like from [here](http://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html).
3. What's the highest score you can get?