Challenge 1: Speech Dataset
The following lines of codes were used to test different classifiers and their parameters to find the best classifier for the data set. 
Results: Our tests suggested that the best results were obtained when combining the KNN classifier (1 neightbor) with the PCA data reduction. Since the data is relatively large, reducing it and applying a simple 'force brute' algorithm we manage to get a score of 0.7

In [2]:
!pip install openml

Collecting openml
[31m  Could not find a version that satisfies the requirement openml (from versions: )[0m
[31mNo matching distribution found for openml[0m


In [3]:
from IPython.display import HTML
HTML('''<style>html, body{overflow-y: visible !important} .CodeMirror{min-width:105% !important;} .rise-enabled .CodeMirror, .rise-enabled .output_subarea{font-size:140%; line-height:1.2; overflow: visible;} .output_subarea pre{width:110%}</style>''') # For slides

In [4]:
!pip install git+https://github.com/renatopp/liac-arff@master

Collecting git+https://github.com/renatopp/liac-arff@master
  Cloning https://github.com/renatopp/liac-arff (to master) to /tmp/pip-hal35frj-build
Installing collected packages: liac-arff
  Running setup.py install for liac-arff ... [?25l- done
[?25hSuccessfully installed liac-arff-2.2.0


In [5]:
!pip install git+https://github.com/openml/openml-python.git@develop

Collecting git+https://github.com/openml/openml-python.git@develop
  Cloning https://github.com/openml/openml-python.git (to develop) to /tmp/pip-blvf8a1t-build
Collecting mock (from openml==0.6.0)
  Downloading mock-2.0.0-py2.py3-none-any.whl (56kB)
[K    100% |████████████████████████████████| 61kB 1.9MB/s 
Collecting nose (from openml==0.6.0)
  Downloading nose-1.3.7-py3-none-any.whl (154kB)
[K    100% |████████████████████████████████| 163kB 3.1MB/s 
Collecting oslo.concurrency (from openml==0.6.0)
  Downloading oslo.concurrency-3.26.0-py2.py3-none-any.whl (41kB)
[K    100% |████████████████████████████████| 51kB 8.0MB/s 
Collecting xmltodict (from openml==0.6.0)
  Downloading xmltodict-0.11.0-py2.py3-none-any.whl
Collecting pbr>=0.11 (from mock->openml==0.6.0)
  Downloading pbr-3.1.1-py2.py3-none-any.whl (99kB)
[K    100% |████████████████████████████████| 102kB 3.5MB/s 
Collecting pandocfilters>=1.4.1 (from nbconvert->openml==0.6.0)
  Downloading pandocfilters-1.4.2.tar.gz
C

In [6]:
!pip install mglearn

Collecting mglearn
  Downloading mglearn-0.1.6.tar.gz (541kB)
[K    100% |████████████████████████████████| 542kB 1.7MB/s 
Building wheels for collected packages: mglearn
  Running setup.py bdist_wheel for mglearn ... [?25l- \ | done
[?25h  Stored in directory: /content/.cache/pip/wheels/79/8b/2b/17dcfb9c9b044b216a58daea9787a0637cb1ffc5b4c2a78e50
Successfully built mglearn
Installing collected packages: mglearn
Successfully installed mglearn-0.1.6


In [None]:
from IPython.display import set_matplotlib_formats, display, HTML
from IPython.core.interactiveshell import InteractiveShell
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import openml as oml
import mglearn
import os
from cycler import cycler
from pprint import pprint

set_matplotlib_formats('pdf', 'png')
plt.rcParams['savefig.dpi'] = 300
plt.rcParams['image.cmap'] = "viridis"
plt.rcParams['image.interpolation'] = "none"
plt.rcParams['savefig.bbox'] = "tight"
#plt.rcParams['lines.linewidth'] = 1
plt.rcParams['legend.numpoints'] = 1
plt.rc('axes', prop_cycle=(cycler('color', mglearn.plot_helpers.cm_cycle.colors) +
                           cycler('linestyle', ['-', '--', ':',
                                                '-.', '--'])
                           )
       )

np.set_printoptions(precision=3, suppress=True)

pd.set_option("display.max_columns", 8)
pd.set_option('precision', 2)

np, mglearn

# Prints outputs in cells so that we don't have to write print() every time 
#InteractiveShell.ast_node_interactivity = "all"

# Matplotlib tweaks for presentations
plt.rcParams["figure.figsize"] = (5, 3)
plt.rcParams["figure.max_open_warning"] = -1
plt.rcParams['font.size'] = 8; 
plt.rcParams['lines.linewidth'] = 0.5


# Presentations
from notebook.services.config import ConfigManager
cm = ConfigManager()
cm.update('livereveal', {'width': '95%', 'height': 786, 'scroll': True, 'theme': 'solarized', 'transition': 'fade', 'overflow': 'visible', 'start_slideshow_at': 'selected'})

# Silence warnings
import warnings
warnings.simplefilter(action="ignore", category=FutureWarning)
warnings.simplefilter(action="ignore", category=UserWarning)
warnings.simplefilter(action="ignore", category=RuntimeWarning)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import zero_one_loss
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import openml as oml
from sklearn.neighbors import KNeighborsClassifier


speech = oml.datasets.get_dataset(40910) # Download MoneyBall data

X, y, attribute_names = speech.get_data(target=speech.default_target_attribute, return_attribute_names=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)



In [25]:
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer

for g in [0.001, 0.1, 1, 10, 1000]:
     for c in [0.001, 0.1, 1, 10, 1000]:
            pipe = Pipeline([('preprocessing', StandardScaler()),('test', SelectKBest(k=7)),
                         ('classifier', SVC(C=c, gamma=g))])

            scores_svc = cross_val_score(pipe,X,y,scoring=make_scorer(roc_auc_score),n_jobs=-1,cv=10)
            print('Score SVC: {:.3f}'.format(scores_svc.mean()),'')


Score SVC: 0.500
Score SVC: 0.500
Score SVC: 0.500
Score SVC: 0.500
Score SVC: 0.500
Score SVC: 0.500
Score SVC: 0.500
Score SVC: 0.500
Score SVC: 0.500
Score SVC: 0.497
Score SVC: 0.500
Score SVC: 0.500
Score SVC: 0.500
Score SVC: 0.498
Score SVC: 0.498
Score SVC: 0.500
Score SVC: 0.500
Score SVC: 0.500
Score SVC: 0.500
Score SVC: 0.500
Score SVC: 0.500
Score SVC: 0.500
Score SVC: 0.500
Score SVC: 0.500
Score SVC: 0.500


In [36]:
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer

for g in [0.001, 0.1, 1, 10, 1000]:
     for c in [0.001, 0.1, 1, 10, 1000]:
            pipe = Pipeline([('preprocessing', StandardScaler()),('test', SelectKBest(k=7)),
                         ('classifier', SVC(C=c, gamma=g))])

            scores_svc = cross_val_score(pipe,X,y,scoring=make_scorer(roc_auc_score),n_jobs=-1,cv=3)
            print('Score SVC: {:.3f}'.format(scores_svc.mean()),'G={:.3f}'.format(g), 'C={:.3f}'.format(c))

Score SVC: 0.500 G=0.001 C=0.001
Score SVC: 0.500 G=0.001 C=0.100
Score SVC: 0.500 G=0.001 C=1.000
Score SVC: 0.500 G=0.001 C=10.000
Score SVC: 0.500 G=0.001 C=1000.000
Score SVC: 0.500 G=0.100 C=0.001
Score SVC: 0.500 G=0.100 C=0.100
Score SVC: 0.500 G=0.100 C=1.000
Score SVC: 0.499 G=0.100 C=10.000
Score SVC: 0.486 G=0.100 C=1000.000
Score SVC: 0.500 G=1.000 C=0.001
Score SVC: 0.500 G=1.000 C=0.100
Score SVC: 0.500 G=1.000 C=1.000
Score SVC: 0.506 G=1.000 C=10.000
Score SVC: 0.506 G=1.000 C=1000.000
Score SVC: 0.500 G=10.000 C=0.001
Score SVC: 0.500 G=10.000 C=0.100
Score SVC: 0.500 G=10.000 C=1.000
Score SVC: 0.500 G=10.000 C=10.000
Score SVC: 0.500 G=10.000 C=1000.000
Score SVC: 0.500 G=1000.000 C=0.001
Score SVC: 0.500 G=1000.000 C=0.100
Score SVC: 0.500 G=1000.000 C=1.000
Score SVC: 0.500 G=1000.000 C=10.000
Score SVC: 0.500 G=1000.000 C=1000.000


In [51]:
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer
from sklearn.neighbors import KNeighborsClassifier


for n in [1,2,3,4,5,6,7,8,9,10,20,50,100,200,500,1000]:
            pipe = Pipeline([('preprocessing', StandardScaler()),('test', SelectKBest(k=3)),
                         ('classifier', KNeighborsClassifier(n_neighbors=n))])

            scores_svc = cross_val_score(pipe,X,y,scoring=make_scorer(roc_auc_score),n_jobs=-1,cv=10)
            print('Score: {:.3f}'.format(scores_svc.mean()),'N={:d}'.format(n))






Score: 0.516 N=1
Score: 0.500 N=2
Score: 0.499 N=3
Score: 0.500 N=4
Score: 0.500 N=5
Score: 0.500 N=6
Score: 0.500 N=7
Score: 0.500 N=8
Score: 0.500 N=9
Score: 0.500 N=10
Score: 0.500 N=20
Score: 0.500 N=50
Score: 0.500 N=100
Score: 0.500 N=200
Score: 0.500 N=500
Score: 0.500 N=1000


In [71]:
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA

from sklearn.naive_bayes import GaussianNB

pipe = Pipeline([('preprocessing', StandardScaler()),
                # ('PCA', PCA(n_components=30)),
                 ('classifier', GaussianNB())

                ])

scores_svc = cross_val_score(pipe,X_test,y_test,scoring=make_scorer(roc_auc_score),n_jobs=-1,cv=10)
print('Score: {:.3f}'.format(scores_svc.mean()),'N={:d}'.format(n))




Score: 0.599 N=1000


In [40]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer
from sklearn.linear_model import LogisticRegression


#[1,50,100,150,200,250,300,350,400]

#for k in [1,2]:
for n in np.arange(2,400,2):

#  for n in [1,25,50,75,100,125,150,175,200,225,250,275,300,325,350,375,400]:
        pipe = Pipeline([('preprocessing', StandardScaler()),
                         ('PCA', PCA(n_components=n)),
                         ('classifier', KNeighborsClassifier(n_neighbors=1))])
                         #('classifier', SVC(C=.001, gamma=1000))])


                                                               

        scores_svc = cross_val_score(pipe,X_test,y_test,scoring=make_scorer(roc_auc_score),cv=10)
        print('Score: {:.3f}'.format(scores_svc.mean()),'PCA={:d}'.format(n),'K={:d}'.format(1))
      

Score: 0.489 PCA=2 K=1
Score: 0.536 PCA=4 K=1
Score: 0.535 PCA=6 K=1
Score: 0.531 PCA=8 K=1
Score: 0.575 PCA=10 K=1
Score: 0.573 PCA=12 K=1
Score: 0.596 PCA=14 K=1
Score: 0.592 PCA=16 K=1
Score: 0.596 PCA=18 K=1
Score: 0.564 PCA=20 K=1
Score: 0.640 PCA=22 K=1
Score: 0.560 PCA=24 K=1
Score: 0.631 PCA=26 K=1
Score: 0.606 PCA=28 K=1
Score: 0.602 PCA=30 K=1
Score: 0.631 PCA=32 K=1
Score: 0.579 PCA=34 K=1
Score: 0.583 PCA=36 K=1
Score: 0.563 PCA=38 K=1
Score: 0.587 PCA=40 K=1
Score: 0.634 PCA=42 K=1
Score: 0.581 PCA=44 K=1
Score: 0.638 PCA=46 K=1
Score: 0.590 PCA=48 K=1
Score: 0.578 PCA=50 K=1
Score: 0.636 PCA=52 K=1
Score: 0.607 PCA=54 K=1
Score: 0.582 PCA=56 K=1
Score: 0.609 PCA=58 K=1
Score: 0.656 PCA=60 K=1
Score: 0.617 PCA=62 K=1
Score: 0.598 PCA=64 K=1
Score: 0.588 PCA=66 K=1
Score: 0.606 PCA=68 K=1
Score: 0.610 PCA=70 K=1
Score: 0.680 PCA=72 K=1
Score: 0.637 PCA=74 K=1
Score: 0.675 PCA=76 K=1
Score: 0.655 PCA=78 K=1
Score: 0.661 PCA=80 K=1
Score: 0.656 PCA=82 K=1
Score: 0.680 PCA=84 

KeyboardInterrupt: ignored

In [48]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer
from sklearn.linear_model import LogisticRegression



#for k in [1,2]:
for n in np.arange(2,400,10):

  for c in [0.001, 0.01, 0.1, 1, 10, 100, 1000]:
        pipe = Pipeline([('preprocessing', StandardScaler()),
                         ('PCA', PCA(n_components=n)),
                         ('classifier', LogisticRegression(C=c))])
                         #('classifier', SVC(C=.001, gamma=1000))])


                                                               

        scores_svc = cross_val_score(pipe,X_test,y_test,scoring=make_scorer(roc_auc_score),cv=10,n_jobs=-1)
        print('Score: {:.3f}'.format(scores_svc.mean()),'PCA={:d}'.format(n),'C={:.3f}'.format(c))

Score: 0.500 PCA=2 C=0.001
Score: 0.500 PCA=2 C=0.010
Score: 0.500 PCA=2 C=0.100
Score: 0.500 PCA=2 C=1.000
Score: 0.500 PCA=2 C=10.000
Score: 0.500 PCA=2 C=100.000
Score: 0.500 PCA=2 C=1000.000
Score: 0.500 PCA=12 C=0.001
Score: 0.500 PCA=12 C=0.010
Score: 0.500 PCA=12 C=0.100
Score: 0.500 PCA=12 C=1.000
Score: 0.500 PCA=12 C=10.000
Score: 0.500 PCA=12 C=100.000
Score: 0.500 PCA=12 C=1000.000
Score: 0.500 PCA=22 C=0.001
Score: 0.500 PCA=22 C=0.010
Score: 0.500 PCA=22 C=0.100
Score: 0.500 PCA=22 C=1.000
Score: 0.500 PCA=22 C=10.000
Score: 0.500 PCA=22 C=100.000
Score: 0.500 PCA=22 C=1000.000
Score: 0.500 PCA=32 C=0.001
Score: 0.500 PCA=32 C=0.010
Score: 0.500 PCA=32 C=0.100
Score: 0.500 PCA=32 C=1.000
Score: 0.500 PCA=32 C=10.000
Score: 0.500 PCA=32 C=100.000
Score: 0.500 PCA=32 C=1000.000
Score: 0.500 PCA=42 C=0.001
Score: 0.500 PCA=42 C=0.010
Score: 0.500 PCA=42 C=0.100
Score: 0.500 PCA=42 C=1.000
Score: 0.500 PCA=42 C=10.000
Score: 0.500 PCA=42 C=100.000
Score: 0.500 PCA=42 C=1000.0

In [33]:

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn import svm
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer
from sklearn.linear_model import LogisticRegression


clf1 = LogisticRegression()
clf2 = RandomForestClassifier()
clf3 = GaussianNB()
clf4 = SVC(C=1, gamma=1, probability='True', max_iter=100)


eclf2 = VotingClassifier(estimators=[
        ('lr', clf1), ('rf', clf2), ('gnb', clf3)],
        voting='soft')
#eclf2 = eclf2.fit(X, y)

#for n in np.arange(2,400,10):

 #       pipe = Pipeline([('preprocessing', StandardScaler()),
  #                       ('pca', PCA(n_components=n)),
   #                      ('classifier', eclf2)])
    #                     #('classifier', SVC(C=.001, gamma=1000))])


                                                               

scores_svc = cross_val_score(eclf2,X_test,y_test,scoring=make_scorer(roc_auc_score),cv=10,n_jobs=-1)
print('Score: {:.3f}'.format(scores_svc.mean()),'PCA={:d}'.format(1),'C={:.3f}'.format(10))



  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


Score: 0.500 PCA=1 C=10.000


In [66]:
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB 
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from sklearn.ensemble import VotingClassifier
from sklearn.neighbors import KNeighborsClassifier


clf1 = LogisticRegression(random_state=1)
clf2 = RandomForestClassifier(random_state=1)
clf3 = GaussianNB()
clf4 = SVC()
clf5 = KNeighborsClassifier(n_neighbors=1)


print('10-fold cross validation:\n')

labels = ['Logistic Regression', 'Random Forest', 'Naive Bayes','SVC','KNN']

for clf, label in zip([clf1, clf2, clf3,clf4,clf5], labels):
#for G in [0.001, 0.01, 0.1, 1, 10, 100]:
    scores = model_selection.cross_val_score(clf, X, y, 
                                              cv=3, 
                                              scoring=make_scorer(roc_auc_score))
    print("Accuracy: %0.2f (+/- %0.2f) [%s]"
          % (scores.mean(), scores.std(), label))

eclf = EnsembleVoteClassifier(clfs=[clf1, clf2, clf3,clf4,clf5], weights=[1,1,1,1,1])
scores = model_selection.cross_val_score(eclf, X, y, 
                                              cv=3, 
                                              scoring=make_scorer(roc_auc_score))
print("Accuracy: %0.2f (+/- %0.2f) [%s]"
          % (scores.mean(), scores.std(), 'MERGED'))

10-fold cross validation:

Accuracy: 0.49 (+/- 0.01) [Logistic Regression]
Accuracy: 0.50 (+/- 0.00) [Random Forest]
Accuracy: 0.52 (+/- 0.03) [Naive Bayes]
Accuracy: 0.50 (+/- 0.00) [SVC]
Accuracy: 0.55 (+/- 0.01) [KNN]


  if diff:
  if diff:


Accuracy: 0.50 (+/- 0.00) [MERGED]


  if diff:


In [62]:
from mlxtend.classifier import EnsembleVoteClassifier

eclf = EnsembleVoteClassifier(clfs=[clf1, clf2, clf3,clf4,clf5], weights=[1,1,1,1,1])


labels = ['Logistic Regression', 'Random Forest', 'Naive Bayes','SVC' ,'KNN','Ensemble']

#for clf, label in zip([clf1, clf2, clf3, clf4, eclf], labels):
for n in np.arange(2,400,10):

    pipe = Pipeline([('preprocessing', StandardScaler()),
                         ('pca', PCA(n_components=n)),
                         ('classifier', eclf)])
                         #('classifier', SVC(C=.001, gamma=1000))])
    
    scores = model_selection.cross_val_score(pipe, X, y, 
                                              cv=3, 
                                              scoring=make_scorer(roc_auc_score))
    
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" 
          % (scores.mean(), scores.std(), label))


  if diff:
  if diff:
  if diff:


Accuracy: 0.50 (+/- 0.00) [KNN]


  if diff:
  if diff:
  if diff:


Accuracy: 0.50 (+/- 0.00) [KNN]


  if diff:
  if diff:
  if diff:


Accuracy: 0.50 (+/- 0.00) [KNN]


  if diff:
  if diff:
  if diff:


Accuracy: 0.50 (+/- 0.00) [KNN]


  if diff:
  if diff:
  if diff:


Accuracy: 0.50 (+/- 0.00) [KNN]


  if diff:
  if diff:
  if diff:


Accuracy: 0.50 (+/- 0.00) [KNN]


  if diff:
  if diff:
  if diff:


Accuracy: 0.50 (+/- 0.00) [KNN]


  if diff:
  if diff:
  if diff:


Accuracy: 0.50 (+/- 0.00) [KNN]


  if diff:
  if diff:
  if diff:


Accuracy: 0.50 (+/- 0.00) [KNN]


  if diff:
  if diff:
  if diff:


Accuracy: 0.50 (+/- 0.00) [KNN]


  if diff:
  if diff:
  if diff:


Accuracy: 0.50 (+/- 0.00) [KNN]


  if diff:
  if diff:
  if diff:


Accuracy: 0.50 (+/- 0.00) [KNN]


  if diff:
  if diff:
  if diff:


Accuracy: 0.50 (+/- 0.00) [KNN]


  if diff:
  if diff:
  if diff:


Accuracy: 0.50 (+/- 0.00) [KNN]


  if diff:
  if diff:
  if diff:


Accuracy: 0.50 (+/- 0.00) [KNN]


  if diff:
  if diff:
  if diff:


Accuracy: 0.50 (+/- 0.00) [KNN]


  if diff:
  if diff:
  if diff:


Accuracy: 0.50 (+/- 0.00) [KNN]


  if diff:
  if diff:
  if diff:


Accuracy: 0.50 (+/- 0.00) [KNN]


  if diff:
  if diff:
  if diff:


Accuracy: 0.50 (+/- 0.00) [KNN]


  if diff:
  if diff:
  if diff:


Accuracy: 0.50 (+/- 0.00) [KNN]


  if diff:
  if diff:
  if diff:


Accuracy: 0.50 (+/- 0.00) [KNN]


  if diff:
  if diff:
  if diff:


Accuracy: 0.51 (+/- 0.01) [KNN]


  if diff:


KeyboardInterrupt: ignored