In [2]:
import os
import json
import time
import pickle
import requests

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
CONSTANTS = {
              'redurl': "http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
            , 'whiteurl': "http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv"
            , 'redpath': "./wine_data/winequality-red.csv"
            , 'whitepath': "./wine_data/winequality-white.csv"
            , 'readme': "./wine_data/winequality.names"
    }

print(CONSTANTS['redurl'])

#The data is in a csv file but uses semicolon seperators. It also has header names embedded in the first row of the file. 
#We can use the read_csv method to easily load the data into a dataframe.
dfwine = pd.read_csv(CONSTANTS['redpath'], sep=';')
dfwine.describe()


http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.807569
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


In [4]:
dfwine.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [5]:
#Quality represents our 'expert' scores, i.e. what we are going to try to predict. 
dfwine['quality'].describe()

count    1599.000000
mean        5.636023
std         0.807569
min         3.000000
25%         5.000000
50%         6.000000
75%         6.000000
max         8.000000
Name: quality, dtype: float64

In [6]:
#Let's see how related these are
# Create a scatter matrix of the dataframe features
#from pandas.tools.plotting import scatter_matrix
#scatter_matrix(dfwine, alpha=0.2, figsize=(12, 12), diagonal='kde')
#plt.show()

In [7]:
#It looks like our 'quality' scores are integers, so it's hard to see the trends. 
#But, it does look like there are some relationships we can use to learn from


In [8]:
#Let's prep our data
from sklearn.datasets.base import Bunch

#This function takes a dataframe of our raw data, cleans it and puts it in a Bunch
def prepdata(df):
    
    filenames = CONSTANTS
    
    #Our feature names can be pulled from our dataframe, but we don't want the last column because this is our target
    feature_names = list(df.columns.values)
    del feature_names[-1]
    target_names=["quality"]
    
    #We can use feature_names as a way to convert the dataframe to a numpy array, excluding the last column
    data = df.as_matrix(feature_names)
    #Our target is just the last column
    target = df['quality'].as_matrix()    
    
    with open(filenames['readme'], 'r') as f:
        DESCR = f.read()
    
    return Bunch(
        data=data,
        target=target,
        filenames=filenames,
        target_names=target_names,
        feature_names=feature_names,
        DESCR=DESCR
    )
dataset = prepdata(dfwine)

print(dataset.data.shape)
print(dataset.target.shape)

print(dataset['feature_names'])
print(dataset['data'])
print(dataset['target_names'])

print(dataset['target'])


(1599, 11)
(1599,)
['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol']
[[  7.4     0.7     0.    ...,   3.51    0.56    9.4  ]
 [  7.8     0.88    0.    ...,   3.2     0.68    9.8  ]
 [  7.8     0.76    0.04  ...,   3.26    0.65    9.8  ]
 ..., 
 [  6.3     0.51    0.13  ...,   3.42    0.75   11.   ]
 [  5.9     0.645   0.12  ...,   3.57    0.71   10.2  ]
 [  6.      0.31    0.47  ...,   3.39    0.66   11.   ]]
['quality']
[5 5 5 ..., 6 5 6]


In [11]:
import builtins
from IPython.lib import deepreload
builtins.reload = deepreload.reload

In [12]:
import train_models

modeler = train_models.ManyModels()
modeler.fit(model_list=['KNeighborsClassifier', 'RandomForestClassifier'])

NameError: name 'clean_model_list' is not defined

In [8]:
#Let's train a model!

#Our measurement tools:
from sklearn import metrics
from sklearn import cross_validation
from sklearn.cross_validation import KFold

#Our model:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

models = {}

def train_model(dataset, model, label,  **kwargs):
    
    scores = {'precision':[], 'recall':[], 'accuracy':[], 'f1':[]}
        
    for train, test in KFold(dataset.data.shape[0], n_folds=12, shuffle=True):
        X_train, X_test = dataset.data[train], dataset.data[test]
        y_train, y_test = dataset.target[train], dataset.target[test]
        
        estimator = model(**kwargs)
        estimator.fit(dataset['data'], dataset['target'])

        expected  = y_test
        predicted = estimator.predict(X_test)
        
        # Append our scores to the tracker
        scores['precision'].append(metrics.precision_score(expected, predicted, average="weighted"))
        scores['recall'].append(metrics.recall_score(expected, predicted, average="weighted"))
        scores['accuracy'].append(metrics.accuracy_score(expected, predicted))
        scores['f1'].append(metrics.f1_score(expected, predicted, average="weighted"))
        
    print("Validation scores for {}:\n", label)
    print(pd.DataFrame(scores).mean())
    
    #Save the model for use outside of the function
    models[label]=estimator
    
train_model(dataset, KNeighborsClassifier, "KNeighbors", n_neighbors=12)
train_model(dataset, RandomForestClassifier, "RandomForest")

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


Validation scores for {}:
 KNeighbors
accuracy     0.582856
f1           0.556645
precision    0.571131
recall       0.582856
dtype: float64
Validation scores for {}:
 RandomForest
accuracy     0.987478
f1           0.987424
precision    0.988529
recall       0.987478
dtype: float64


  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


In [34]:
#Since this is a relatively small dataset, let's see the actual values - the actual compared to the predicted for each wine.
wine_ratings = {}
wine_ratings['actual'] = dataset.target.tolist()
wine_ratings['KNeighbors'] = models['KNeighbors'].predict(dataset.data)
wine_ratings['RandomForest'] = models['RandomForest'].predict(dataset.data)

#converting to a dataframe forces our numpy arrays and our lists into same format, and allows us to use Pandas features
wine_ratings_df = pd.DataFrame(wine_ratings)
sort_order = ['actual','KNeighbors','RandomForest']
sorted_df = wine_ratings_df.sort(columns=sort_order)

sorted_df.head(10)



Unnamed: 0,KNeighbors,RandomForest,actual
517,5,3,3
690,5,3,3
832,5,3,3
1299,5,3,3
1478,5,3,3
1505,5,3,3
459,6,3,3
899,6,3,3
1374,6,3,3
1469,6,3,3
