In [1]:
%matplotlib notebook
import pandas as pd
import numpy as np
from sklearn import neighbors
import csv
from sklearn import datasets

# User Study to Evaluate the KnnDecisionBoundariesVisualizer in YellowBrick


In [2]:
# Load the merged_adm_sat_data CSV from the examples folder into a DataFrame.
data = pd.read_csv('./examples/balavenkatesan/merged_adm_sat_data.csv')
# read_csv already returns a DataFrame; this wraps the same data in a second
# DataFrame object bound to `frame`.
frame = pd.DataFrame(data)

#### Function that loads college data

In [3]:
def load_adm_sat_school_data(return_X_y=False,
                             data_path='./examples/balavenkatesan/merged_adm_sat_data.csv'):
    """Load a subset of the College Scorecard dataset (classification).

    Data source: https://collegescorecard.ed.gov/data/.

    Expected file layout (as read by this function):

    * first row: ``n_samples, n_features, <target name>, <target name>, ...``
    * every following row: three feature values followed by an integer
      class label (only the first four columns are read).

    Parameters
    ----------
    return_X_y : bool, default False
        If True, return ``(data, target)`` instead of a ``Bunch``.
    data_path : str, optional
        Location of the CSV file. Defaults to the copy under
        ``./examples/balavenkatesan/`` relative to the working directory.

    Returns
    -------
    sklearn.utils.Bunch or (data, target)
        ``data`` is a float array of shape ``(n_samples, 3)``; ``target`` is
        an integer masked array of shape ``(n_samples,)``. The ``Bunch``
        additionally carries ``target_names``, ``feature_names`` and ``DESCR``.

    Raises
    ------
    ValueError
        If the first row is missing or not numeric, if a label cannot be
        converted to an integer, or if the number of rows/columns read does
        not match the counts declared in the first row.
    """
    with open(data_path, newline='') as csv_file:
        header = next(csv.reader(csv_file), None)

    if not header or len(header) < 2:
        raise ValueError(
            "%s: the first row must contain n_samples and n_features" % data_path)

    n_samples = int(header[0])
    n_features = int(header[1])
    target_names = np.array(header[2:])

    # pandas consumes the first row as the column header, so the frame holds
    # exactly the sample rows.
    df = pd.read_csv(data_path, sep=",", usecols=[0, 1, 2, 3], skiprows=0)

    # Vectorised conversion. Features stay floating point so fractional values
    # are not truncated (the previous integer buffer silently dropped them).
    data = df.iloc[:, :3].to_numpy(dtype=float)
    if data.shape != (n_samples, n_features):
        raise ValueError(
            "%s: header declares shape (%d, %d) but the file contains %r"
            % (data_path, n_samples, n_features, data.shape))

    # Kept as a masked array (with no mask set) so the return type is unchanged.
    target = np.ma.asarray(df.iloc[:, 3].to_numpy(dtype=int))

    feature_names = np.array(['ACT_AVG','SAT_AVG','GRAD_DEBT','REGION'])

    if return_X_y:
        return data, target

    # sklearn.datasets.base was removed from scikit-learn; Bunch is publicly
    # available from sklearn.utils. Imported here so the (data, target) path
    # does not require it.
    from sklearn.utils import Bunch

    return Bunch(data=data, target=target,
                 target_names=target_names,
                 DESCR='School Data set',
                 feature_names=feature_names)

#### Function that renders a Matplotlib Scatter plot

In [4]:
def show_plot(model=neighbors.KNeighborsClassifier, n_neighbors=3, h=0.2):
    """
    Plot k-NN decision boundaries with plain Matplotlib, as a baseline for
    measuring the performance difference between Yellowbrick and Matplotlib.

    One figure is drawn for each weighting scheme ('uniform', 'distance').
    The function reads the notebook-level variables ``X`` (only columns 0 and
    1 are used) and ``y``, and the module-level ``plt``; all three must be
    defined before it is called.

    Parameters
    ----------
    model : estimator class or fitted estimator
        If a class is given (the default), a new estimator is created with
        ``n_neighbors`` and the current weighting scheme and fitted on
        ``X, y`` for each figure. If an instance is given it is used exactly
        as passed and must already be fitted; both figures then show that
        same model and the weights shown in the title are only a label.
    n_neighbors : int
        Neighbour count used when building the estimator and in the title.
    h : float
        Step size of the mesh on which predictions are evaluated.
    """
    # Create color maps
    cmap_light = plt.get_cmap('Pastel1')
    cmap_bold = plt.get_cmap('Dark2')

    # The mesh [x_min, x_max] x [y_min, y_max] depends only on the data, not on
    # the weighting scheme, so it is built once outside the loop.
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    grid_points = np.c_[xx.ravel(), yy.ravel()]

    for weights in ['uniform', 'distance']:
        if isinstance(model, type):
            # A class was passed (e.g. the default): calling predict on the
            # class itself would fail, so create an instance of the
            # classifier and fit the data.
            clf = model(n_neighbors=n_neighbors, weights=weights)
            clf.fit(X, y)
        else:
            clf = model

        # Assign a class to each point of the mesh and put the result into a
        # color plot.
        Z = clf.predict(grid_points).reshape(xx.shape)
        plt.figure()
        plt.pcolormesh(xx, yy, Z, cmap=cmap_light)

        # Plot also the training points
        plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold)
        plt.xlim(xx.min(), xx.max())
        plt.ylim(yy.min(), yy.max())
        plt.title("3-Class classification (k = %i, weights = '%s')"
                  % (n_neighbors, weights))

    plt.show()

#### Loading school data and mapping to X & y

In [5]:
# school = load_adm_sat_school_data()

# X = school.data[:, :2]  # we only take the first two features.
# y = school.target

In [6]:
import yellowbrick as yb
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

In [7]:

# model.fit(X,y)
# show_plot(model,3)

In [8]:
import sys
# sys.path.append('../../yellowbrick/')

# from yellowbrick import mixins
from yellowbrick.neighbors import KnnDecisionBoundariesVisualizer



In [9]:
# Read the same CSV file that cell In [2] loaded, this time bound to `df`,
# which the following cells use.
df = pd.read_csv('./examples/balavenkatesan/merged_adm_sat_data.csv')

In [10]:
# Preview the first rows to check which columns the file contains.
df.head()

Unnamed: 0,ACT,SAT,AVERAGE_DEBT,REGION,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11
0,17,827,33888.0,5,,,,,,,,
1,23,1107,21941.5,5,,,,,,,,
2,26,1219,24097.0,5,,,,,,,,
3,17,851,33118.5,5,,,,,,,,
4,25,1185,23750.0,5,,,,,,,,


In [11]:
# Keep only the rows whose REGION code is below 5.
df_limited = df.loc[df['REGION'] < 5]

In [12]:
# Feature matrix (ACT and SAT columns) and label vector (REGION) as NumPy arrays.
# DataFrame.as_matrix() / Series.as_matrix() were removed in pandas 1.0;
# to_numpy() is the supported replacement.
X = df_limited[['ACT', 'SAT']].to_numpy()
y = df_limited['REGION'].to_numpy()

In [15]:
# k-NN classifier with 10 neighbours, wrapped in the decision-boundary visualizer.
model = neighbors.KNeighborsClassifier(n_neighbors=10)
viz = KnnDecisionBoundariesVisualizer(
    model,
    features=['ACT','SAT'],
    classes=['one', 'two', 'three', 'four'],
)
# Fit on the ACT/SAT features and REGION labels prepared in the previous cell.
viz.fit(X, y)

['one', 'two', 'three', 'four']
fitted
[ 12.   12.2  12.4  12.6  12.8  13.   13.2  13.4  13.6  13.8  14.   14.2
  14.4  14.6  14.8  15.   15.2  15.4  15.6  15.8  16.   16.2  16.4  16.6
  16.8  17.   17.2  17.4  17.6  17.8  18.   18.2  18.4  18.6  18.8  19.
  19.2  19.4  19.6  19.8  20.   20.2  20.4  20.6  20.8  21.   21.2  21.4
  21.6  21.8  22.   22.2  22.4  22.6  22.8  23.   23.2  23.4  23.6  23.8
  24.   24.2  24.4  24.6  24.8  25.   25.2  25.4  25.6  25.8  26.   26.2
  26.4  26.6  26.8  27.   27.2  27.4  27.6  27.8  28.   28.2  28.4  28.6
  28.8  29.   29.2  29.4  29.6  29.8  30.   30.2  30.4  30.6  30.8  31.
  31.2  31.4  31.6  31.8  32.   32.2  32.4  32.6  32.8  33.   33.2  33.4
  33.6  33.8  34.   34.2  34.4  34.6  34.8  35.   35.2  35.4  35.6  35.8]
meshgrid
predicted
shaped


KnnDecisionBoundariesVisualizer(classes=None,
                colors=['#a6cee3', '#1f78b4', '#b2df8a', '#33a02c', '#fb9a99', '#e31a1c', '#cab2d6', '#6a3d9a', '#ffff99', '#b15928', '#fdbf6f', '#ff7f00'],
                features=None, model=None)

In [16]:
# Render the fitted visualizer for the ACT/SAT features and REGION labels.
# NOTE(review): the recorded run of this cell ended with
# "IndexError: list index out of range". Possibly the label values in `y` do
# not line up with the four names passed as `classes` -- TODO confirm against
# the visualizer's implementation.
viz.draw(X, y)

<IPython.core.display.Javascript object>

[1 1 1 1 1 1 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 4 4 4 4 4 4 4 4 4 4 4 4 4 4
 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 1 1 1 1 1
 1 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3
 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 4 4
 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
 4 4 4 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 4 4 4 4 4 4 4 4 3 3 3 3 3 3 3 3 3 3 3 3 3 3
 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 1 1 

IndexError: list index out of range