# Part 1: Group Project

This is an analysis of data that is available and described [here](http://www.primaryobjects.com/2016/06/22/identifying-the-gender-of-a-voice-using-machine-learning/) and [here](https://www.kaggle.com/primaryobjects/voicegender). 

### Read in data and examine structure

In [186]:
import pandas as pd

voice_data = pd.read_csv('voice.csv')

print voice_data.shape
voice_data.head(5)

(3168, 21)


Unnamed: 0,meanfreq,sd,median,Q25,Q75,IQR,skew,kurt,sp.ent,sfm,...,centroid,meanfun,minfun,maxfun,meandom,mindom,maxdom,dfrange,modindx,label
0,0.059781,0.064241,0.032027,0.015071,0.090193,0.075122,12.863462,274.402906,0.893369,0.491918,...,0.059781,0.084279,0.015702,0.275862,0.007812,0.007812,0.007812,0.0,0.0,male
1,0.066009,0.06731,0.040229,0.019414,0.092666,0.073252,22.423285,634.613855,0.892193,0.513724,...,0.066009,0.107937,0.015826,0.25,0.009014,0.007812,0.054688,0.046875,0.052632,male
2,0.077316,0.083829,0.036718,0.008701,0.131908,0.123207,30.757155,1024.927705,0.846389,0.478905,...,0.077316,0.098706,0.015656,0.271186,0.00799,0.007812,0.015625,0.007812,0.046512,male
3,0.151228,0.072111,0.158011,0.096582,0.207955,0.111374,1.232831,4.177296,0.963322,0.727232,...,0.151228,0.088965,0.017798,0.25,0.201497,0.007812,0.5625,0.554688,0.247119,male
4,0.13512,0.079146,0.124656,0.07872,0.206045,0.127325,1.101174,4.333713,0.971955,0.783568,...,0.13512,0.106398,0.016931,0.266667,0.712812,0.007812,5.484375,5.476562,0.208274,male


### Split into features and labels

Notice that we use the `train_test_split` function *twice*: the first time is to separate a final, untouchable test set, and later, when training classifiers, we will use it a second is to split into test and validate sets. 

In [187]:
labels = voice_data['label']
features = voice_data.drop('label', 1)


print list(features.columns.values)

from sklearn.cross_validation import train_test_split
features_temp, features_test, labels_temp, labels_test = \
    train_test_split(features, labels, test_size=0.1, random_state=42)
    




['meanfreq', 'sd', 'median', 'Q25', 'Q75', 'IQR', 'skew', 'kurt', 'sp.ent', 'sfm', 'mode', 'centroid', 'meanfun', 'minfun', 'maxfun', 'meandom', 'mindom', 'maxdom', 'dfrange', 'modindx']


### Make a classifier, fit it to some training data, and examine its accuracy via validation data.

Notice that we use the `train_test_split` function *twice*: the first time is to separate a final, untouchable test set, and later, when training classifiers, we will use it a second is to split into test and validate sets. 

In [188]:
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

gnb = GaussianNB()
dct = DecisionTreeClassifier(random_state=1337)

clf = dct
    
from sklearn.cross_validation import train_test_split
features_train, features_val, labels_train, labels_val = \
    train_test_split(features_temp, labels_temp, test_size=0.3, random_state=42)

from time import time

#print len(features_train)
#print len(labels_train)

#print features_train[0]
#print labels_train[0]

# Train classifier on train features and train labels
# Measure and print the time taken
t0 = time()
clf = clf.fit(features_train, labels_train)
print "training time:", "\t\t", round(time()-t0, 3), "s"

pred = clf.predict(features_val)

from sklearn.metrics import confusion_matrix

print confusion_matrix(labels_val, pred, labels=["female", "male"])

# Report accuracy
from sklearn.metrics import accuracy_score
print "accuracy score:", "\t", accuracy_score(labels_val, pred)

# Report precision
#from sklearn.metrics import precision_score
#print "precision score:", "\t", precision_score(labels_val, pred)

# Report recall
#from sklearn.metrics import recall_score
#print "recall score:", "\t\t", recall_score(labels_val, pred)


# from sklearn.metrics import classification_report

# target_names = ["Not POI", "POI"]
# print "Classification Report:"
# print classification_report(y_true=labels_test, y_pred=pred, target_names=target_names)



training time: 		0.024 s
[[412  22]
 [ 17 405]]
accuracy score: 	0.954439252336


In [199]:
from sklearn.feature_selection import SelectKBest

kbest = 7

selector = SelectKBest(k=kbest)
selectedFeatures = selector.fit(features_train, labels_train)

feature_names = [list(features.columns.values)[i] for i in selectedFeatures.get_support(indices=True)]

print feature_names

kbestlabels = voice_data[feature_names]

kbestlabels.head(5)

['sd', 'Q25', 'IQR', 'sp.ent', 'sfm', 'centroid', 'meanfun']


Unnamed: 0,sd,Q25,IQR,sp.ent,sfm,centroid,meanfun
0,0.064241,0.015071,0.075122,0.893369,0.491918,0.059781,0.084279
1,0.06731,0.019414,0.073252,0.892193,0.513724,0.066009,0.107937
2,0.083829,0.008701,0.123207,0.846389,0.478905,0.077316,0.098706
3,0.072111,0.096582,0.111374,0.963322,0.727232,0.151228,0.088965
4,0.079146,0.07872,0.127325,0.971955,0.783568,0.13512,0.106398


In [207]:
t0 = time()
clf=gnb
features_train_best = features_train[['meanfun', 'meanfreq', 'Q25', 'Q75', 'median']]

clf = clf.fit(features_train_best, labels_train)

print "training time:", "\t\t", round(time()-t0, 3), "s"

pred = clf.predict(features_val[['meanfun', 'meanfreq', 'Q25', 'Q75', 'median']])

from sklearn.metrics import confusion_matrix

print confusion_matrix(labels_val, pred, labels=["female", "male"])

# Report accuracy
from sklearn.metrics import accuracy_score
print "accuracy score:", "\t", accuracy_score(labels_val, pred)

training time: 		0.004 s
[[398  36]
 [ 18 404]]
accuracy score: 	0.93691588785


# Part 2: Gaussian Distributions

In [159]:
from pylab import *
import scipy.stats
n = 100000

# A distribution
x1=array([randn(n)+1 , randn(n)+1])
y1=zeros(n)


# B distribution
x2=array([randn(n)+3 , randn(n)+3]) 
y2=zeros(n)+1
x = hstack((x1,x2)).T
y = hstack((y1,y2))

def getPredictionsKnowingFunctions(x):
    preds = zeros(n*2)
    A = scipy.stats.norm(1, 1)
    B = scipy.stats.norm(3, 1)
    i = 0
    for ar in x:
        probA = A.pdf(ar[0]) * A.pdf(ar[1])
        probB = B.pdf(ar[0]) * B.pdf(ar[1])
        if probA > probB:
            preds[i] = 0
        else:
            preds[i] = 1
        i+=1
    return preds

preds = getPredictionsKnowingFunctions(x)        

numErrors = sum(abs(y-preds))
print "Percentage Correct via ttest: " , (n*2 - numErrors) / (n*2)



Percentage Correct via ttest:  0.92025


### How does a Gaussian Naive Bayes classifier do on this data?

In [164]:
from sklearn.cross_validation import train_test_split
features_temp, features_test, labels_temp, labels_test = \
    train_test_split(features, labels, test_size=0.1, random_state=42)
    
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

gnb = GaussianNB()
dct = DecisionTreeClassifier(random_state=1337)

clf = gnb

features_temp, features_test, labels_temp, labels_test = \
    train_test_split(x, y, test_size=0.1, random_state=42)
    
from sklearn.cross_validation import train_test_split
features_train, features_val, labels_train, labels_val = \
    train_test_split(features_temp, labels_temp, test_size=0.3, random_state=42)

from time import time

#print len(features_train)
#print len(labels_train)

#print features_train[0]
#print labels_train[0]

# Train classifier on train features and train labels
# Measure and print the time taken
t0 = time()
clf = clf.fit(features_train, labels_train)
print "training time:", "\t\t", round(time()-t0, 3), "s"

pred = clf.predict(features_val)

from sklearn.metrics import confusion_matrix

print confusion_matrix(labels_val, pred)

# Report accuracy
from sklearn.metrics import accuracy_score
print "accuracy score:", "\t", accuracy_score(labels_val, pred)

training time: 		0.027 s
[[24768  2203]
 [ 2173 24856]]
accuracy score: 	0.918962962963


It seems like Gaussian Naive Bayes does so well compared to the optimal classifier that it makes the assumption of each feature representing a normal distribution with different mean/variance for each class (feature label).

In [156]:
# What is the probability density function of the square of the normal distribution function squared at 2?
x = 2
def probDensSquareNorm(y)
    return exp(-(x-1))

SyntaxError: invalid syntax (<ipython-input-156-1dbb3ac30844>, line 3)

In [96]:
from pylab import *
import scipy.stats
n = 100000

# A distribution
x1=array([randn(n)+1])
y1=zeros(n)


# B distribution
x2=array([randn(n)+3]) 
y2=zeros(n)+1
x = hstack((x1,x2)).T
y = hstack((y1,y2))

def getPredictionsKnowingFunctions(x):
    preds = zeros(n*2)
    A = scipy.stats.norm(1, 1)
    B = scipy.stats.norm(3, 1)
    i = 0
    for ar in x:
        probA = A.pdf(ar[0])
        probB = B.pdf(ar[0])
        if probA > probB:
            preds[i] = 0
        else:
            preds[i] = 1
        i+=1
    return preds

preds = getPredictionsKnowingFunctions(x)        

numErrors = sum(abs(y-preds))
print "Percentage Correct via ttest: " , (n*2 - numErrors) / (n*2)

Percentage Correct via ttest:  0.840295


In [94]:
from pylab import *
import scipy.stats
n = 5000

# A distribution
x1=array([randn(n)+1 , randn(n)+1, randn(n)+1])
y1=zeros(n)


# B distribution
x2=array([randn(n)+3 , randn(n)+3, randn(n)+3]) 
y2=zeros(n)+1
x = hstack((x1,x2)).T
y = hstack((y1,y2))

def getPredictionsKnowingFunctions(x):
    preds = zeros(n*2)
    A = scipy.stats.norm(1, 1)
    B = scipy.stats.norm(3, 1)
    i = 0
    for ar in x:
        probA = A.pdf(ar[0]) * A.pdf(ar[1]) * A.pdf(ar[2]) 
        probB = B.pdf(ar[0]) * B.pdf(ar[1]) * B.pdf(ar[2])
        if probA > probB:
            preds[i] = 0
        else:
            preds[i] = 1
        i+=1
    return preds

preds = getPredictionsKnowingFunctions(x)        

numErrors = sum(abs(y-preds))
print "Percentage Correct via ttest: " , (n*2 - numErrors) / (n*2)

Percentage Correct via ttest:  0.9574
