<a href="https://colab.research.google.com/github/fboldt/postre/blob/main/aula7a_ensembles.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [1]:
import random
import warnings
import numpy as np
import pandas as pd

from scipy import stats
from collections import Counter

from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split

from sklearn.impute import SimpleImputer
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.base import BaseEstimator, ClassifierMixin

### Loading Dataset
**Note:** This is a custom CSV based on [this dataset](https://www.kaggle.com/datasets/vinicius150987/titanic3/)

In [2]:
# import dataset
dataset = pd.read_csv('../Data/titanicDataSet.csv')

dataset.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
0,1.0,1.0,"Allen, Miss. Elisabeth Walton",female,29.0,0.0,0.0,24160,211.3375,B5,S
1,1.0,1.0,"Allison, Master. Hudson Trevor",male,0.9167,1.0,2.0,113781,151.55,C22 C26,S
2,1.0,0.0,"Allison, Miss. Helen Loraine",female,2.0,1.0,2.0,113781,151.55,C22 C26,S
3,1.0,0.0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1.0,2.0,113781,151.55,C22 C26,S
4,1.0,0.0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1.0,2.0,113781,151.55,C22 C26,S


### Cleaning the Data

In [3]:
# delete collums
dataset = dataset.drop(['ticket', 'cabin'], axis=1)

dataset.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,fare,embarked
0,1.0,1.0,"Allen, Miss. Elisabeth Walton",female,29.0,0.0,0.0,211.3375,S
1,1.0,1.0,"Allison, Master. Hudson Trevor",male,0.9167,1.0,2.0,151.55,S
2,1.0,0.0,"Allison, Miss. Helen Loraine",female,2.0,1.0,2.0,151.55,S
3,1.0,0.0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1.0,2.0,151.55,S
4,1.0,0.0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1.0,2.0,151.55,S


In [4]:
# deal with name
dataset['title'] = dataset.name.str.extract(' ([A-Za-z]+)\.', expand=False)

dataset['title'] = dataset['title'].replace(['Lady', 'Countess','Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')

dataset['title'] = dataset['title'].replace('Mlle', 'Miss')
dataset['title'] = dataset['title'].replace('Ms', 'Miss')
dataset['title'] = dataset['title'].replace('Mme', 'Mrs')

title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}

dataset['title'] = dataset['title'].map(title_mapping)
dataset['title'] = dataset['title'].fillna(0)

dataset = dataset.drop(['name'], axis=1)
  
dataset.head()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare,embarked,title
0,1.0,1.0,female,29.0,0.0,0.0,211.3375,S,2.0
1,1.0,1.0,male,0.9167,1.0,2.0,151.55,S,4.0
2,1.0,0.0,female,2.0,1.0,2.0,151.55,S,2.0
3,1.0,0.0,male,30.0,1.0,2.0,151.55,S,1.0
4,1.0,0.0,female,25.0,1.0,2.0,151.55,S,3.0


In [5]:
# deal with Numbers
datasetNum = dataset.select_dtypes('number')
imputer = SimpleImputer(strategy="median")
datasetNumLimpo = imputer.fit_transform(datasetNum)

columns = datasetNum.columns  # Retrieve column names
datasetNumLimpoPart = pd.DataFrame(datasetNumLimpo, columns=columns)

datasetNumLimpoPart.head()

Unnamed: 0,pclass,survived,age,sibsp,parch,fare,title
0,1.0,1.0,29.0,0.0,0.0,211.3375,2.0
1,1.0,1.0,0.9167,1.0,2.0,151.55,4.0
2,1.0,0.0,2.0,1.0,2.0,151.55,2.0
3,1.0,0.0,30.0,1.0,2.0,151.55,1.0
4,1.0,0.0,25.0,1.0,2.0,151.55,3.0


In [6]:
# deal with Simple Values
datasetCat = dataset.select_dtypes('object')
imputer = SimpleImputer(strategy="most_frequent")
datasetCatLimpo = imputer.fit_transform(datasetCat)

columns = datasetCat.columns  # Retrieve column names
datasetCatLimpoPart = pd.DataFrame(datasetCatLimpo, columns=columns)

datasetCatLimpoPart['sex'] = datasetCatLimpoPart['sex'].map( {'female': 1, 'male': 0} ).astype(int)
datasetCatLimpoPart['embarked'] = datasetCatLimpoPart['embarked'].map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int)

datasetCatLimpoPart.head()

Unnamed: 0,sex,embarked
0,1,0
1,0,0
2,1,0
3,0,0
4,1,0


In [7]:
# Combine dataset back
dataset = pd.concat([datasetNumLimpoPart, datasetCatLimpoPart], axis=1)

dataset.head()

Unnamed: 0,pclass,survived,age,sibsp,parch,fare,title,sex,embarked
0,1.0,1.0,29.0,0.0,0.0,211.3375,2.0,1,0
1,1.0,1.0,0.9167,1.0,2.0,151.55,4.0,0,0
2,1.0,0.0,2.0,1.0,2.0,151.55,2.0,1,0
3,1.0,0.0,30.0,1.0,2.0,151.55,1.0,0,0
4,1.0,0.0,25.0,1.0,2.0,151.55,3.0,1,0


In [8]:
# Simplify age
dataset.loc[ dataset['age'] <= 16, 'age'] = 0
dataset.loc[(dataset['age'] > 16) & (dataset['age'] <= 32), 'age'] = 1
dataset.loc[(dataset['age'] > 32) & (dataset['age'] <= 48), 'age'] = 2
dataset.loc[(dataset['age'] > 48) & (dataset['age'] <= 64), 'age'] = 3
dataset.loc[ dataset['age'] > 64, 'age']

dataset.head()

Unnamed: 0,pclass,survived,age,sibsp,parch,fare,title,sex,embarked
0,1.0,1.0,1.0,0.0,0.0,211.3375,2.0,1,0
1,1.0,1.0,0.0,1.0,2.0,151.55,4.0,0,0
2,1.0,0.0,0.0,1.0,2.0,151.55,2.0,1,0
3,1.0,0.0,1.0,1.0,2.0,151.55,1.0,0,0
4,1.0,0.0,1.0,1.0,2.0,151.55,3.0,1,0


In [9]:
# Simplify fare
dataset.loc[ dataset['fare'] <= 7.91, 'fare'] = 0
dataset.loc[(dataset['fare'] > 7.91) & (dataset['fare'] <= 14.454), 'fare'] = 1
dataset.loc[(dataset['fare'] > 14.454) & (dataset['fare'] <= 31), 'fare']   = 2
dataset.loc[ dataset['fare'] > 31, 'fare'] = 3

dataset.head()

Unnamed: 0,pclass,survived,age,sibsp,parch,fare,title,sex,embarked
0,1.0,1.0,1.0,0.0,0.0,3.0,2.0,1,0
1,1.0,1.0,0.0,1.0,2.0,3.0,4.0,0,0
2,1.0,0.0,0.0,1.0,2.0,3.0,2.0,1,0
3,1.0,0.0,1.0,1.0,2.0,3.0,1.0,0,0
4,1.0,0.0,1.0,1.0,2.0,3.0,3.0,1,0


**Note:** the random numbers used here are based on a previous exercise!

### Split The Data

In [10]:
y = dataset['survived']
X = dataset.drop('survived', axis=1)

In [11]:
Xtr, Xte, ytr, yte = train_test_split(X, y, random_state=42)
Xtr.shape, Xte.shape, ytr.shape, yte.shape

((982, 8), (328, 8), (982,), (328,))

# Exercise

In [12]:
ClasifierData = {
    'classifier': [],
    'result': []
}

### Clasifiers

In [13]:
modelo = KNeighborsClassifier()
modelo.fit(Xtr, ytr)
knn_pred = modelo.predict(Xte)
knn_hits = knn_pred == yte
result = sum(knn_hits)/len(knn_hits)

ClasifierData['classifier'].append('KNeighborsClassifier')
ClasifierData['result'].append(result)

result

0.8140243902439024

In [14]:
modelo = GaussianNB()
modelo.fit(Xtr, ytr)
gnb_pred = modelo.predict(Xte)
gnb_hits = gnb_pred == yte
result = sum(gnb_hits)/len(gnb_hits)

ClasifierData['classifier'].append('GaussianNB')
ClasifierData['result'].append(result)

result

0.8079268292682927

In [15]:
modelo = Perceptron()
modelo.fit(Xtr, ytr)
per_pred = modelo.predict(Xte)
per_hits = per_pred == yte
result = sum(per_hits)/len(per_hits)

ClasifierData['classifier'].append('Perceptron')
ClasifierData['result'].append(result)

result

0.774390243902439

In [16]:
modelo = DecisionTreeClassifier()
modelo.fit(Xtr, ytr)
dtc_pred = modelo.predict(Xte)
dtc_hits = dtc_pred == yte
result = sum(dtc_hits)/len(dtc_hits)

ClasifierData['classifier'].append('DecisionTreeClassifier')
ClasifierData['result'].append(result)

result

0.774390243902439

In [17]:
scores = cross_validate(DecisionTreeClassifier(), X, y)
np.mean(scores['test_score']), scores['test_score']

(0.6427480916030535,
 array([0.51145038, 0.71374046, 0.66412214, 0.69465649, 0.62977099]))

In [18]:
modelo = DecisionTreeClassifier(splitter='random')
modelo.fit(Xtr, ytr)
dtc_pred = modelo.predict(Xte)
dtc_hits = dtc_pred == yte
result = sum(dtc_hits)/len(dtc_hits)

ClasifierData['classifier'].append('DecisionTreeClassifier (Random)')
ClasifierData['result'].append(result)

result

0.7804878048780488

In [19]:
modelo = BaggingClassifier(DecisionTreeClassifier(splitter='random'),
                           n_estimators=100, max_features=0.15, random_state=42)
modelo.fit(Xtr, ytr)
bag_pred = modelo.predict(Xte)
bag_hits = bag_pred == yte
result = sum(bag_hits)/len(bag_hits)

ClasifierData['classifier'].append('BaggingClassifier')
ClasifierData['result'].append(result)

result

0.7134146341463414

In [20]:
modelo = RandomForestClassifier(random_state=42)
modelo.fit(Xtr, ytr)
rfc_pred = modelo.predict(Xte)
rfc_hits = rfc_pred == yte
result = sum(rfc_hits)/len(rfc_hits)

ClasifierData['classifier'].append('RandomForestClassifier')
ClasifierData['result'].append(result)

result

0.7896341463414634

In [21]:
scores = cross_validate(RandomForestClassifier(random_state=42), X, y)
np.mean(scores['test_score']), scores

(0.6717557251908397,
 {'fit_time': array([0.11752105, 0.11552882, 0.11905527, 0.11954689, 0.11651969]),
  'score_time': array([0.00800371, 0.00799727, 0.00901008, 0.0080018 , 0.00699925]),
  'test_score': array([0.51145038, 0.73664122, 0.7519084 , 0.71374046, 0.64503817])})

In [22]:
modelo = ExtraTreesClassifier(random_state=42)
modelo.fit(Xtr, ytr)
etc_pred = modelo.predict(Xte)
etc_hits = etc_pred == yte
result = sum(etc_hits)/len(etc_hits)

ClasifierData['classifier'].append('ExtraTreesClassifier')
ClasifierData['result'].append(result)

result

0.7957317073170732

In [23]:
def maisFrequente(y):
  return Counter(y.flat).most_common(1)[0][0]

def melhorCaracteristica(X, y):
  caracteristica = random.randint(0, X.shape[1]-1)
  xmin = np.min(X[:,caracteristica])
  xmax = np.max(X[:,caracteristica])
  valor = random.random()*(xmax-xmin)+xmin
  return caracteristica, valor

class Arvore(BaseEstimator, ClassifierMixin):
  def fit(self, X, y):
    self.caracteristica, self.valor = melhorCaracteristica(X, y)
    maiores = X[:,self.caracteristica] > self.valor
    if sum(maiores)>0 and sum(~maiores)>0:
      self.maiores = Arvore()
      self.maiores.fit(X[maiores,:], y[maiores])
      self.menores = Arvore()
      self.menores.fit(X[~maiores,:], y[~maiores])
    else:
      self.resposta = maisFrequente(y)
    return self
  def predict(self, X, y=None):
    y = np.empty((X.shape[0]), dtype=np.int64)
    if hasattr(self, 'resposta'):
      y[:] = self.resposta
    else:
      maiores = X[:,self.caracteristica] > self.valor
      y[maiores] = self.maiores.predict(X[maiores,:])
      y[~maiores] = self.menores.predict(X[~maiores,:])
    return y

modelo = BaggingClassifier(Arvore(),
                           n_estimators=200,
                           max_features=0.1,
                           random_state=42)
modelo.fit(Xtr, ytr)
bag_pred = modelo.predict(Xte)
bag_hits = bag_pred == yte
result = sum(bag_hits)/len(bag_hits)

ClasifierData['classifier'].append('BaggingClassifier (Arvore)')
ClasifierData['result'].append(result)

result

0.7134146341463414

In [24]:
scores = cross_validate(modelo, X, y)
np.mean(scores['test_score']), scores

(0.6809160305343511,
 {'fit_time': array([0.22903705, 0.23003817, 0.23055387, 0.24004126, 0.22267914]),
  'score_time': array([0.03051329, 0.03000093, 0.0300045 , 0.02952003, 0.02900004]),
  'test_score': array([0.82442748, 0.65267176, 0.66030534, 0.63740458, 0.62977099])})

In [25]:
modelo = AdaBoostClassifier(DecisionTreeClassifier(max_depth=25, splitter='random'),
                            learning_rate=0.15, random_state=42)
modelo.fit(Xtr, ytr)
abc_pred = modelo.predict(Xte)
abc_hits = abc_pred == yte
result = sum(abc_hits)/len(abc_hits)

ClasifierData['classifier'].append('AdaBoostClassifier')
ClasifierData['result'].append(result)

result

0.7774390243902439

In [26]:
scores = cross_validate(modelo, X, y)
np.mean(scores['test_score']), scores

(0.6610687022900763,
 {'fit_time': array([0.0905273 , 0.08451819, 0.08551788, 0.08651948, 0.09504509]),
  'score_time': array([0.00700235, 0.00700116, 0.00799823, 0.00700045, 0.00700188]),
  'test_score': array([0.51145038, 0.70992366, 0.73664122, 0.70610687, 0.64122137])})

### Combination Clasifiers

In [27]:
hits = np.stack((knn_hits, gnb_hits, per_hits))
ypred = np.stack((knn_pred, gnb_pred, per_pred))

vote_pred = stats.mode(ypred)[0]
vote_hits = vote_pred == yte
result = sum(vote_hits)/len(vote_hits)

ClasifierData['classifier'].append('Stack (knn, gnb, per)')
ClasifierData['result'].append(result)

result


0.8201219512195121

In [28]:
modelo = VotingClassifier([
    ('knn', KNeighborsClassifier()),
    ('gnb', GaussianNB()),
    ('per', Perceptron())
])
modelo.fit(Xtr, ytr)
vote_pred = modelo.predict(Xte)
vote_hits = vote_pred == yte
result = sum(vote_hits)/len(vote_hits)

ClasifierData['classifier'].append('Voting (knn, gnb, per)')
ClasifierData['result'].append(result)

result

0.8201219512195121

In [29]:
modelo = VotingClassifier([
    ('knn1', KNeighborsClassifier(1)),
    ('knn5', KNeighborsClassifier(5)),
    ('knn9', KNeighborsClassifier(9)),
])
modelo.fit(Xtr, ytr)
vote_pred = modelo.predict(Xte)
vote_hits = vote_pred == yte
result = sum(vote_hits)/len(vote_hits)

ClasifierData['classifier'].append('Voting (knn1, knn5, knn9)')
ClasifierData['result'].append(result)

result

0.8079268292682927

In [30]:
modelo = VotingClassifier([
    ('knn', KNeighborsClassifier()),
    ('gnb', GaussianNB()),
    ('dtc', DecisionTreeClassifier())
])
modelo.fit(Xtr, ytr)
vote_pred = modelo.predict(Xte)
vote_hits = vote_pred == yte
result = sum(vote_hits)/len(vote_hits)

ClasifierData['classifier'].append('Voting (knn, gnb, dtc)')
ClasifierData['result'].append(result)

result

0.8140243902439024

In [31]:
modelo = VotingClassifier([
    ('knn', KNeighborsClassifier()),
    ('gnb', GaussianNB()),
    ('per', Perceptron()),
    ('dtc', DecisionTreeClassifier())
])
modelo.fit(Xtr, ytr)
vote_pred = modelo.predict(Xte)
vote_hits = vote_pred == yte
result = sum(vote_hits)/len(vote_hits)

ClasifierData['classifier'].append('Voting (knn, gnb, per, dtc)')
ClasifierData['result'].append(result)

result

0.8140243902439024

In [32]:
warnings.filterwarnings('ignore')

voting = VotingClassifier([
    ('knn', KNeighborsClassifier()),
    ('gnb', GaussianNB()),
    ('per', Perceptron())
])

modelo = StackingClassifier([
    ('voting', voting),
    ('extrat', ExtraTreesClassifier()),
    ('ranfor', RandomForestClassifier())
], cv=3, passthrough=True)

modelo.fit(Xtr, ytr)
stack_pred = modelo.predict(Xte)
stack_hits = stack_pred == yte
result = sum(stack_hits)/len(stack_hits)

ClasifierData['classifier'].append('Stacking ((knn, gnb, per), extrat, ranfor)')
ClasifierData['result'].append(result)

result

0.7926829268292683

In [33]:
scores = cross_validate(modelo, X, y)
np.mean(scores['test_score']), scores

(0.6969465648854961,
 {'fit_time': array([0.94561958, 0.93574119, 0.92598009, 0.90667486, 0.91166902]),
  'score_time': array([0.02799845, 0.0405221 , 0.03099918, 0.03051734, 0.03000164]),
  'test_score': array([0.44274809, 0.89694656, 0.80916031, 0.69083969, 0.64503817])})

### My combinations

In [34]:
warnings.filterwarnings('ignore')

modelo = VotingClassifier([
    ('knn1', KNeighborsClassifier(1)),
    ('knn5', KNeighborsClassifier(5)),
    ('knn9', KNeighborsClassifier(9)),
])

modelo = StackingClassifier([
    ('voting', voting),
    ('extrat', ExtraTreesClassifier()),
    ('ranfor', RandomForestClassifier())
], cv=3, passthrough=True)

modelo.fit(Xtr, ytr)
stack_pred = modelo.predict(Xte)
stack_hits = stack_pred == yte
result = sum(stack_hits)/len(stack_hits)

ClasifierData['classifier'].append('Stacking ((knn1, knn5, knn9), extrat, ranfor)')
ClasifierData['result'].append(result)

result

0.801829268292683

In [35]:
warnings.filterwarnings('ignore')

from sklearn.ensemble import StackingClassifier

modelo = VotingClassifier([
    ('knn', KNeighborsClassifier()),
    ('gnb', GaussianNB()),
    ('per', Perceptron()),
    ('dtc', DecisionTreeClassifier())
])

modelo = StackingClassifier([
    ('voting', voting),
    ('extrat', ExtraTreesClassifier()),
    ('ranfor', RandomForestClassifier())
], cv=3, passthrough=True)

modelo.fit(Xtr, ytr)
stack_pred = modelo.predict(Xte)
stack_hits = stack_pred == yte
result = sum(stack_hits)/len(stack_hits)

ClasifierData['classifier'].append('Stacking ((knn, gnb, per, dtc), extrat, ranfor)')
ClasifierData['result'].append(result)

result

0.801829268292683

# Results

In [36]:
df = pd.DataFrame(ClasifierData)
df

Unnamed: 0,classifier,result
0,KNeighborsClassifier,0.814024
1,GaussianNB,0.807927
2,Perceptron,0.77439
3,DecisionTreeClassifier,0.77439
4,DecisionTreeClassifier (Random),0.780488
5,BaggingClassifier,0.713415
6,RandomForestClassifier,0.789634
7,ExtraTreesClassifier,0.795732
8,BaggingClassifier (Arvore),0.713415
9,AdaBoostClassifier,0.777439
