In [79]:
# load some relational database
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# this is a dataset of items customers bought over a week period
store_data = pd.read_csv("../../Datasets/store_data.csv", header=None)
store_data.head()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil
1,burgers,meatballs,eggs,,,,,,,,,,,,,,,,,
2,chutney,,,,,,,,,,,,,,,,,,,
3,turkey,avocado,,,,,,,,,,,,,,,,,,
4,mineral water,milk,energy bar,whole wheat rice,green tea,,,,,,,,,,,,,,,


Here i perform this manually  or in the next cell you'll see you can use the transaction encoder from mlxtend

In [80]:
# we need to encode our shopping lists into a one hot encoded dataset. This means we need to get the data into a list of lists format, and find the unique fields.

#convert the data frame to a list of list
data = store_data.values.tolist() 
# remove straggling nans in the lists.
data = [[i for i in j if i == i] for j in data]
from itertools import chain
# get the unique columns from all the strings
unique = np.unique(list(chain(*data)))
# get a true/false matrix of present or not present when compared against the row.
inds = [np.in1d(unique, row).tolist() for row in data]

In [81]:
vector_data = pd.DataFrame(inds, columns=unique)
vector_data.head()

Unnamed: 0,asparagus,almonds,antioxydant juice,asparagus.1,avocado,babies food,bacon,barbecue sauce,black tea,blueberries,...,turkey,vegetables mix,water spray,white wine,whole weat flour,whole wheat pasta,whole wheat rice,yams,yogurt cake,zucchini
0,False,True,True,False,True,False,False,False,False,False,...,False,True,False,False,True,False,False,True,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,True,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False


In [82]:
from mlxtend.frequent_patterns import apriori

apriori(vector_data, min_support=0.01, use_colnames=True)

Unnamed: 0,support,itemsets
0,0.020397,(almonds)
1,0.033329,(avocado)
2,0.010799,(barbecue sauce)
3,0.014265,(black tea)
4,0.011465,(body spray)
...,...,...
252,0.011065,"(milk, ground beef, mineral water)"
253,0.017064,"(ground beef, mineral water, spaghetti)"
254,0.015731,"(milk, mineral water, spaghetti)"
255,0.010265,"(mineral water, spaghetti, olive oil)"


This dataset doesn't appear to have many meanningful associations across the 7,5000 items bought. 

Sometimes this happens!

In [83]:
from mlxtend.preprocessing import TransactionEncoder

dataset = [['Milk', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
           ['Dill', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
           ['Milk', 'Apple', 'Kidney Beans', 'Eggs'],
           ['Milk', 'Unicorn', 'Corn', 'Kidney Beans', 'Yogurt'],
           ['Corn', 'Onion', 'Onion', 'Kidney Beans', 'Ice cream', 'Eggs']]

te = TransactionEncoder()
vdata = pd.DataFrame(te.fit(dataset).transform(dataset), columns=te.columns_)
vdata.head()

Unnamed: 0,Apple,Corn,Dill,Eggs,Ice cream,Kidney Beans,Milk,Nutmeg,Onion,Unicorn,Yogurt
0,False,False,False,True,False,True,True,True,True,False,True
1,False,False,True,True,False,True,False,True,True,False,True
2,True,False,False,True,False,True,True,False,False,False,False
3,False,True,False,False,False,True,True,False,False,True,True
4,False,True,False,True,True,True,False,False,True,False,False


In [84]:
from mlxtend.frequent_patterns import apriori

frequent_items = apriori(vdata, min_support=0.6, use_colnames=True)

frequent_items

Unnamed: 0,support,itemsets
0,0.8,(Eggs)
1,1.0,(Kidney Beans)
2,0.6,(Milk)
3,0.6,(Onion)
4,0.6,(Yogurt)
5,0.8,"(Kidney Beans, Eggs)"
6,0.6,"(Eggs, Onion)"
7,0.6,"(Kidney Beans, Milk)"
8,0.6,"(Kidney Beans, Onion)"
9,0.6,"(Kidney Beans, Yogurt)"


In [85]:
# add a column to the dataset measuring the lenght of the itemset tuples
frequent_items['length'] = frequent_items['itemsets'].apply(len)
frequent_items

Unnamed: 0,support,itemsets,length
0,0.8,(Eggs),1
1,1.0,(Kidney Beans),1
2,0.6,(Milk),1
3,0.6,(Onion),1
4,0.6,(Yogurt),1
5,0.8,"(Kidney Beans, Eggs)",2
6,0.6,"(Eggs, Onion)",2
7,0.6,"(Kidney Beans, Milk)",2
8,0.6,"(Kidney Beans, Onion)",2
9,0.6,"(Kidney Beans, Yogurt)",2


We can query our items to see where there are groupings greater than two, and with >= 0.8 support

In [86]:
frequent_items[ (frequent_items['length'] == 2) &
                   (frequent_items['support'] >= 0.8) ]

Unnamed: 0,support,itemsets,length
5,0.8,"(Kidney Beans, Eggs)",2


We can also use pandas to query for certain item combinations.

Here we search for whether `Kidney Beans` and `Yogurt` are ever bought together

In [87]:
frequent_items[frequent_items['itemsets'] == {'Kidney Beans', 'Yogurt'}]

Unnamed: 0,support,itemsets,length
9,0.6,"(Kidney Beans, Yogurt)",2


# Ensembles

The goal of ensembling is to combine predcictions from several classifiers/regressors to improve generalizability when compared with a single classifier/regressor

Two main groups of ensembling strategies exist

    - Averaging methods, each algorithm independently trains and they average the predcictions
    
    - Boosting Methods, these algorithms are trained sequentially at each stage the next estimator is trying to reduce the combined bias.

In [88]:
#load the data

#In this example I am using one of the built in datasets
#Some classification problems will give you pre defined train and test splits
#in this case, as we don't have a train test split i make one using the functionality in sklearn



iris = pd.read_csv("../../Datasets/Iris.csv")

X = iris[["sepal_width","sepal_length", "petal_length", "petal_width"]]
y = iris["species"]
print(X)
print(y)

     sepal_width  sepal_length  petal_length  petal_width
0            3.5           5.1           1.4          0.2
1            3.0           4.9           1.4          0.2
2            3.2           4.7           1.3          0.2
3            3.1           4.6           1.5          0.2
4            3.6           5.0           1.4          0.2
..           ...           ...           ...          ...
145          3.0           6.7           5.2          2.3
146          2.5           6.3           5.0          1.9
147          3.0           6.5           5.2          2.0
148          3.4           6.2           5.4          2.3
149          3.0           5.9           5.1          1.8

[150 rows x 4 columns]
0         Iris-setosa
1         Iris-setosa
2         Iris-setosa
3         Iris-setosa
4         Iris-setosa
            ...      
145    Iris-virginica
146    Iris-virginica
147    Iris-virginica
148    Iris-virginica
149    Iris-virginica
Name: species, Length: 150, dtype: obj

In [89]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)

## Averaging Methods - Bagging

Bagging is a form of averaging method, where the algorithm builds many instances of the classifier which are trained on random subsets of the original training data. These predictions are then combined to realise a single output. 

Bagging methods are often similar, but differ with respect to how the random samples are drawn.

We can use the `BaggingClassifier` in SkLearn to wrap our base estimator - in this example the `DecisionTree`. 

When training bagging based classifiers, we can estimate the accuracy using the out-of-bag samples.

In [90]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier


bagging = BaggingClassifier(DecisionTreeClassifier(), max_samples=0.5, max_features=0.4)
bagging.fit(X_train, y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(), max_features=0.4,
                  max_samples=0.5)

## AdaBoost

The main principle of AdaBoost is to to train a set of weak learners (models that are marginalyl better than random guessing) on repeteadly modified versions of the data.

The predictions are then combined together using a weighted majority vote to produce the final prediction.

The modified data is produced by taking the data and applying the weights to each of the training samples. These weights are initialised such that the first iteration of the algorithm trains on unmodified data. 

For each subsequent learner the weights are updated to and the algorithm is reapplied. 

Intuitively as the algorithm learns, cases that are difficult to predict have ever increasing influence over the algorith, and those that are easy to predict do not. 

In [91]:
from sklearn.ensemble import AdaBoostClassifier

# we can create an adaboost classifier by importing it from the ensemble module.
# the default weak learner is DecisionTreeClassifier with a depth of 1
# typically for ada boost you will have >= 50 estimators.
ada_boost = AdaBoostClassifier(n_estimators=10)
ada_boost.fit(X_train, y_train)

AdaBoostClassifier(n_estimators=10)

## Gradient Tree Boosting

Gradient tree boosting is a generlisation of boosting, and is a useful off-the-shelf tool to help on a variety of problems. 

In [92]:
from sklearn.ensemble import GradientBoostingClassifier

g_boosting = GradientBoostingClassifier(n_estimators=10, learning_rate=1.0)
g_boosting.fit(X_train, y_train)

GradientBoostingClassifier(learning_rate=1.0, n_estimators=10)

## Warm Start

GradientBoosting classifiers allows you to use a warm start. This means you can partially train your models, and then add more estimators after the intial round of training

In [93]:
# add an additional 10 estimators. 10 -> 20
# allow warm start
g_boosting.set_params(n_estimators=20, warm_start=True)

# use more data - using X_train again is illustrative
g_boosting.fit(X_train, y_train)


GradientBoostingClassifier(learning_rate=1.0, n_estimators=20, warm_start=True)

## Ensemble Voting (Classifiers)

In majority voting the class label with the greatest number of votes is selected as the output

for example:

- classifier 1 -> class 1
- classifier 2 -> class 1
 classifier 3 -> class 2

votes = `[2, 1]`
class 1 has the greatest number. 


In the event of ties, the class will be based on ascending order. 

- classifier 1 -> class 2
- classifier 2 -> class 1

votes = `[1,1]`

class 1 will be selected.



The ensembles we have considered until now use a single base classifier or estimator, these are collectively referred to as homogenous ensembles.

In this next section  we will consider heterogenous ensembles, where we use a mixture of different types of classifier.

In [94]:
from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression


from sklearn.metrics import accuracy_score

clf1 = SVC()
clf2 = RandomForestClassifier()
clf3 = KNeighborsClassifier()
clf4 = LogisticRegression()

# pack the classifiers and string into a list of tuples.
ensemble_clf = VotingClassifier(
    [("SVM", clf1), ("Random Forest", clf2), ("kNN", clf3), ("Logistic Regression", clf4)],
    voting="hard")

# cross val and score each classifier.
for name, clf  in [("SVM", clf1), ("Random Forest", clf2), ("kNN", clf3), ("Logistic Regression", clf4), ("Ensemble", ensemble_clf)]:
    clf.fit(X_train, y_train)
    score = accuracy_score(y_test, clf.predict(X_test))
    print(f"{name} accuracy: {score}")


SVM accuracy: 0.9466666666666667
Random Forest accuracy: 0.9466666666666667
kNN accuracy: 0.96


AttributeError: 'str' object has no attribute 'decode'