<a href="https://colab.research.google.com/github/AndreaBertoglio/MLDM/blob/master/Ensemble_Learner.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

First, let's import a few common modules, ensure Matplotlib plots figures inline, and prepare a function to save the figures. We also check that Python 3.5 or later is installed (although Python 2.x may work, it is deprecated, so we strongly recommend you use Python 3 instead), as well as Scikit-Learn ≥0.20.

In [1]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required.
# The version is compared numerically: a plain string comparison is
# lexicographic, so e.g. "0.3" >= "0.20" would wrongly pass. Only the leading
# "major.minor" digits are parsed, so suffixes such as "rc1" or ".dev0" are fine.
import re
import sklearn
_sklearn_version = tuple(
    int(part) for part in re.match(r"(\d+)\.(\d+)", sklearn.__version__).groups()
)
assert _sklearn_version >= (0, 20)

# Common imports
import numpy as np
import os

# Seed NumPy's global random generator to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures: show Matplotlib output inline and set the font
# sizes used for the axis labels and the x/y tick labels
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures: <PROJECT_ROOT_DIR>/images/<CHAPTER_ID>.
# The directory is created here; exist_ok=True makes re-running the cell safe.
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "ensembles"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current Matplotlib figure under IMAGES_PATH.

    Parameters
    ----------
    fig_id : str
        Base name of the output file (without extension).
    tight_layout : bool
        When true, ``plt.tight_layout()`` is applied before saving.
    fig_extension : str
        File extension, also passed to ``savefig`` as the output format.
    resolution : int
        Value passed to ``savefig`` as ``dpi``.
    """
    file_name = "".join((fig_id, ".", fig_extension))
    destination = os.path.join(IMAGES_PATH, file_name)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(destination, format=fig_extension, dpi=resolution)

In [2]:
import pip
import sys
# NOTE(review): `pip` is only referenced by the disabled install lines below.
# Disabled: install missing packages from inside the notebook.
#if not 'sklearn' in sys.modules.keys():
#    pip.main(['install', 'sklearn'])
#if not 'kaggle' in sys.modules.keys():
#    pip.main(['install', 'kaggle'])
import random

# Only a label is printed here; no random number is drawn in this cell.
print("Random number with seed 2020")
# Seed Python's built-in `random` module (NumPy's generator is seeded
# separately, with 42, in the first cell).
random.seed(2020)

Random number with seed 2020


# Voting classifiers

In [3]:
import numpy as np
import pandas as pd
import graphviz

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

from sklearn.tree import DecisionTreeClassifier, export_graphviz #Da cambiare
from sklearn.impute import SimpleImputer

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

import matplotlib.pyplot as plt

# Load the pre-processed training set straight from the GitHub repository.
# The original course data set remains available as an alternative source:
#   https://raw.githubusercontent.com/serivan/mldmlab/master/Datasets/Kaggle2020/train.csv
train_csv_url = 'https://raw.githubusercontent.com/AndreaBertoglio/MLDM/master/Pre-processing/Data%20Set%20elaborati/trainingSet_Outliers_feature_3_globale_40.csv'
train = pd.read_csv(train_csv_url)


**Note**: to be future-proof, we set `solver="lbfgs"`, `n_estimators=100`, and `gamma="scale"` since these will be the default values in upcoming Scikit-Learn versions.

In [4]:
# Classes: 1 = Good, 0 = Disappointing.
# The label is compared numerically instead of pattern-matching its text:
#  * str.contains("1.0") is a regex search, so "." matches any character and
#    any value merely containing the pattern would be labelled 1;
#  * once this cell has run the column holds integers, whose text is "1"
#    (not "1.0"), so re-running the cell used to turn every label into 0.
# Values that cannot be parsed as numbers become NaN and are encoded as 0.
train["Quality"] = np.where(pd.to_numeric(train["Quality"], errors="coerce") == 1, 1, 0)

In [5]:
# Target (y): the class to predict, i.e. the Quality column
train_y = train["Quality"]

# Predictors (X): the columns of interest; the ID is deliberately left out
predictor_cols = [
    'fixed.acidity',
    'volatile.acidity',
    'citric.acid',
    'residual.sugar',
    'chlorides',
    'free.sulfur.dioxide',
    'total.sulfur.dioxide',
    'density',
    'pH',
    'sulphates',
    'alcohol',
]
train_X = train[predictor_cols]

# Fill missing values with an IterativeImputer (initialised with the column
# means) fitted on the predictors. A plain mean imputer was the earlier choice:
#   imp = SimpleImputer(missing_values=np.nan, strategy='mean')
# NOTE(review): the imputer is fitted on every row before the hold-out split in
# the next cell, so held-out rows influence the imputation — confirm intended.
imp = IterativeImputer(missing_values=np.nan, max_iter=100, initial_strategy='mean').fit(train_X)

# Impute the training predictors; this array is what gets split and trained on
train_X_imp = imp.transform(train_X)


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_moons

# Hold out part of the imputed training data for evaluation. Neither train_size
# nor test_size is passed, so scikit-learn's default proportion applies.
# (A synthetic make_moons data set was used here in an earlier version.)
X_train, X_test, y_train, y_test = train_test_split(
    train_X_imp,
    train_y,
    random_state=42
)

In [7]:
def generateSubmission(myModel, submissionFile, description):
    """Predict the competition test set and write a submission CSV.

    The test CSV is downloaded from GitHub, reduced to ``predictor_cols``,
    imputed with the already-fitted global ``imp`` and passed to
    ``myModel.predict``. The predictions are printed and then written to
    ``submissionFile`` with the columns ``Id`` and ``Quality``.

    Parameters
    ----------
    myModel : fitted estimator exposing ``predict``
    submissionFile : path of the CSV file to write
    description : submission message; currently unused, it is only needed by
        the automatic Kaggle upload, which is disabled (see the end of the body)
    """
    # Alternative source (original course data set):
    #   https://raw.githubusercontent.com/serivan/mldmlab/master/Datasets/Kaggle2020/test.csv
    test_csv_url = 'https://raw.githubusercontent.com/AndreaBertoglio/MLDM/master/Pre-processing/Data%20Set%20elaborati/Test%20Set/test_set_scalato.csv'
    test = pd.read_csv(test_csv_url)

    # Same treatment as the training data: same columns, same imputer
    test_features = imp.transform(test[predictor_cols])

    predicted_q = myModel.predict(test_features)
    # Show the predicted qualities as a quick sanity check
    print(predicted_q)

    submission_columns = {'Id': test.Id, 'Quality': predicted_q}
    pd.DataFrame(submission_columns).to_csv(submissionFile, index=False)

    # Automatic upload (requires Kaggle API authentication), disabled:
    #!kaggle competitions submit -c mldm-classification-competition-2020 -f {submissionFile} -m '{description}'

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier

# xgboost is optional: the ensemble is built with or without it.
try:
    import xgboost
except ImportError:
    print("Error: the xgboost library is not installed.")
    xgboost = None

# xgb_clf is always defined (None when xgboost is missing) so that building the
# ensemble below no longer fails with a NameError.
# min_samples_leaf and tol were dropped: they are scikit-learn options that
# XGBoost does not document as parameters of XGBClassifier.
if xgboost is not None:
    xgb_clf = xgboost.XGBClassifier(random_state=42, max_depth=7)
else:
    xgb_clf = None

ada_clf = AdaBoostClassifier(n_estimators=300, random_state=42, learning_rate=0.2)
rnd_clf = RandomForestClassifier(n_estimators=300, random_state=42, min_samples_leaf=7, oob_score=True)
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(random_state=42), n_estimators=300, bootstrap_features=True, bootstrap=True, oob_score=True, random_state=42)

# Candidates that were tried and are currently disabled:
#dec_clf = DecisionTreeClassifier(random_state=42, max_depth=8, min_samples_leaf=7)
#log_clf = LogisticRegression(solver="lbfgs", max_iter=300, random_state=42)
#svm_clf = SVC(gamma="scale", probability=True, C=0.1, tol=1e-3, random_state=42)

# Members of the ensemble, in voting order; xgboost takes part only if available
ensemble_members = [('bc', bag_clf)]
if xgb_clf is not None:
    ensemble_members.append(('xgb', xgb_clf))
ensemble_members.extend([('ada', ada_clf), ('rf', rnd_clf)])

# Hard voting: the predicted class is the majority vote of the members
voting_clf = VotingClassifier(estimators=ensemble_members, voting='hard')

In [10]:
# Fit the voting ensemble on the training split
voting_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('bc',
                              BaggingClassifier(base_estimator=DecisionTreeClassifier(ccp_alpha=0.0,
                                                                                      class_weight=None,
                                                                                      criterion='gini',
                                                                                      max_depth=None,
                                                                                      max_features=None,
                                                                                      max_leaf_nodes=None,
                                                                                      min_impurity_decrease=0.0,
                                                                                      min_impurity_split=None,
                                                                                      min_samples_leaf=1,
          

In [12]:
from sklearn.metrics import accuracy_score

# Score every base learner on the hold-out split, then the ensemble itself.
# xgboost is an optional import (the name is set to None when it is missing),
# so xgb_clf is only evaluated when the library is available.
classifiers_to_score = [ada_clf, bag_clf, rnd_clf]
if xgboost is not None:
    classifiers_to_score.append(xgb_clf)
classifiers_to_score.append(voting_clf)

for clf in classifiers_to_score:
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

AdaBoostClassifier 0.7514450867052023
BaggingClassifier 0.8080924855491329
RandomForestClassifier 0.7838150289017342
XGBClassifier 0.8011560693641618
VotingClassifier 0.8034682080924855


In [13]:
# Predict the test set with the voting ensemble and write the submission CSV
generateSubmission(
    myModel=voting_clf,
    submissionFile='Ensemble_23.csv',
    description="User Ensemble Classifier"
)

[1 1 1 ... 1 1 1]
