In [None]:
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import scipy.stats
import seaborn as sns
import sklearn
from scipy.io import loadmat
from sklearn.metrics import confusion_matrix, classification_report

%matplotlib inline

In [None]:
# Read the .mat file and wrap its 'X' array in a two-column DataFrame.
data = loadmat('data/ex8data1.mat')
df = pd.DataFrame(data=data['X'], columns=['latency', 'throughput'])
df.head(n=5)

In [None]:
# Scatter plot of the two features against each other.
df.plot.scatter(x='latency', y='throughput')

In [None]:
# Column-wise mean, then column-wise variance (axis=0 is the DataFrame default).
print(df.mean(), "\n")
print(df.var())

In [None]:
# Fit an independent Gaussian to each feature and freeze it as a
# scipy.stats.norm distribution.
# scipy.stats.norm is parameterised as norm(loc=mean, scale=standard deviation),
# so the spread passed in must be the standard deviation, not the variance.
# The two feature columns are named explicitly so that re-running this cell
# after the density column 'p' has been added to df yields the same list.
norm_estimations = [
    scipy.stats.norm(loc=df[col].mean(), scale=df[col].std())
    for col in ('latency', 'throughput')
]

In [None]:
# Plot the fitted density of the first feature (latency) on 50 evenly spaced
# points between 8 and 20.
a = np.linspace(start=8, stop=20, num=50)
plt.plot(a, norm_estimations[0].pdf(a))

In [None]:
# Joint density of each training row: product of the two per-feature pdfs,
# stored in a new column 'p'. The rows with the smallest density are shown first.
df['p'] = norm_estimations[0].pdf(df['latency']) * norm_estimations[1].pdf(df['throughput'])
df.sort_values(by='p').head()

In [None]:
# Scatter plot of the two features against each other.
df.plot.scatter(x='latency', y='throughput')

In [None]:
# Reference code: http://matplotlib.org/examples/pylab_examples/contour_demo.html
#
# Overlay contour lines of the fitted density on the training scatter plot.
# matplotlib.mlab.bivariate_normal was removed in Matplotlib 3.1, so the density
# is evaluated with scipy.stats.norm instead: with zero covariance the bivariate
# normal equals the product of the two per-feature normal densities.

delta = 0.025  # grid spacing along both axes

# Cover the observed range of each feature plus a margin of 1 on every side.
x = np.arange(df.latency.min()-1, df.latency.max()+1, delta)
y = np.arange(df.throughput.min()-1, df.throughput.max()+1, delta)

X, Y = np.meshgrid(x, y)
# Columns are addressed by name rather than by position in df.std(), so the
# extra 'p' column on df cannot shift which statistics are picked up.
Z = (scipy.stats.norm(loc=df.latency.mean(), scale=df.latency.std()).pdf(X)
     * scipy.stats.norm(loc=df.throughput.mean(), scale=df.throughput.std()).pdf(Y))

fig, ax = plt.subplots(figsize=(12,8))
CS = ax.contour(X, Y, Z)
df.plot(kind='scatter', x='latency', y='throughput', ax=ax)
ax.set_title('Bivariate Normal of Latency and Throughput')


**Thresholding**

In [None]:
# Cross-validation features ('Xval') and labels ('yval') from the same .mat dict.
x_cv = pd.DataFrame(data=data['Xval'], columns=['latency', 'throughput'])
y_cv = pd.DataFrame(data=data['yval'], columns=['label'])

In [None]:
# First five cross-validation feature rows.
x_cv.head(n=5)

In [None]:
# First five cross-validation labels.
y_cv.head(n=5)

In [None]:
# Joint density of each cross-validation row under the per-feature Gaussians,
# stored in a new column 'p'.
x_cv['p'] = norm_estimations[0].pdf(x_cv['latency']) * norm_estimations[1].pdf(x_cv['throughput'])
x_cv.head(n=5)

In [None]:
def precision(y_pred, y_cv):
    """Precision with label 0 treated as the positive class.

    Parameters
    ----------
    y_pred : numpy array of predicted labels (boolean masks and ``reshape``
        are applied to it).
    y_cv : object exposing ``.values`` with one row per prediction; the
        notebook passes a single-column DataFrame of true labels.

    Returns
    -------
    tp / (tp + fp), where tp counts rows predicted 0 whose true label is 0
    and fp counts the remaining rows predicted 0.
    """
    zero_mask = y_pred == 0
    # Column vector of the predictions so it lines up with the (k, 1) labels.
    predicted = y_pred[zero_mask].reshape(-1, 1)
    actual = y_cv.values[zero_mask]
    true_pos = (predicted == actual).sum()
    false_pos = zero_mask.sum() - true_pos
    return true_pos / (true_pos + false_pos)

In [None]:
def recall(y_pred, y_cv):
    """Recall with label 0 treated as the positive class.

    Parameters
    ----------
    y_pred : numpy array of predicted labels.
    y_cv : DataFrame of true labels, one row per prediction (it is indexed
        with a boolean mask and its ``.values`` are read).

    Returns
    -------
    tp / (tp + fn), where tp counts rows predicted 0 whose true label is 0
    and fn counts rows predicted 1 whose true label differs from 1.
    """
    zero_mask = y_pred == 0
    one_mask = y_pred == 1
    true_pos = (y_pred[zero_mask].reshape(-1, 1) == y_cv.values[zero_mask]).sum()
    # Rows predicted 1 whose true label disagrees with the prediction.
    missed = y_pred[one_mask].reshape(-1, 1) != y_cv[one_mask]
    false_neg = np.sum(missed.values)
    return true_pos / (true_pos + false_neg)

In [None]:
def f1(y_pred, y_cv):
    """F1 score: harmonic mean of precision() and recall() for the same inputs.

    Both arguments are forwarded unchanged to precision() and recall().
    Returns 2 * precision * recall / (precision + recall).
    """
    p, r = precision(y_pred, y_cv), recall(y_pred, y_cv)
    numerator = 2 * p * r
    return numerator / (p + r)

In [None]:
# Density threshold: the prediction cell below labels a cross-validation row 1
# when its density p is <= epsilon.
# NOTE(review): the value is hard-coded; presumably it was tuned on the
# cross-validation set — confirm where it comes from.
epsilon = 8.99e-05
# Alternative threshold, left disabled:
#epsilon = 0.05

In [None]:
# Label a row 1 when its density is at or below epsilon, 0 otherwise, then
# evaluate the three hand-written metrics on those predictions.
y_pred = (x_cv['p'] <= epsilon).astype(int).to_numpy()
tuple(metric(y_pred, y_cv) for metric in (precision, recall, f1))

In [None]:
# use sklearn's confusion matrix function
# The signature is confusion_matrix(y_true, y_pred): rows are true labels and
# columns are predicted labels. Passing the predictions first transposes it.
confusion_matrix(y_cv, y_pred)

In [None]:
# how nice, sklearn even has classification metrics printed for us
# classification_report takes (y_true, y_pred); with the arguments swapped the
# reported precision and recall of each class are exchanged.
print(classification_report(y_cv, y_pred))

In [None]:
# For class 1 (label 1 is the positive class)
# confusion_matrix(y_true, y_pred)[i, j] counts rows with true label i and
# predicted label j, so for labels [0, 1] ravel() yields tn, fp, fn, tp.
# labels=[0, 1] keeps the matrix 2x2 even if one label is absent.
tn, fp, fn, tp = confusion_matrix(y_cv, y_pred, labels=[0, 1]).ravel()

print(tn, fp, fn, tp)
# (precision, recall) for class 1
tp / (tp + fp), tp / (tp + fn)

In [None]:
# For class 0 (label 0 is the positive class)
# confusion_matrix(y_true, y_pred)[i, j] counts rows with true label i and
# predicted label j. With label 0 as positive, ravel() over labels [0, 1]
# therefore yields tp, fn, fp, tn.
tp, fn, fp, tn = confusion_matrix(y_cv, y_pred, labels=[0, 1]).ravel()

# (precision, recall) for class 0
tp / (tp + fp), tp / (tp + fn)

**Recommender System - Collaborative Filtering**

In [None]:
# Read the movies .mat file and wrap its 'R' and 'Y' arrays in DataFrames.
# Note: this rebinds `data`, replacing the dict loaded from ex8data1.mat.
# NOTE(review): presumably Y holds ratings and R flags which entries were
# rated — confirm against the dataset description.
data = loadmat('data/ex8_movies.mat')
df_r = pd.DataFrame(data=data['R'])
df_y = pd.DataFrame(data=data['Y'])

In [None]:
# Dimensions of df_r followed by its first five rows.
print(df_r.shape)
df_r.head(n=5)

In [None]:
# Dimensions of df_y followed by its first five rows.
print(df_y.shape)
df_y.head(n=5)

In [None]:
# Column-wise mean of df_y, first ten columns only.
df_y.mean().head(10)