<a href="https://colab.research.google.com/github/AndreaBertoglio/MLDM/blob/master/Bagging_Ensembles.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import re
import sklearn
# Compare the numeric (major, minor) pair instead of the raw strings: string
# comparison is lexicographic, so e.g. "0.9" >= "0.20" evaluates to True even
# though 0.9 is an older release than 0.20.
_sklearn_major_minor = tuple(int(part) for part in re.findall(r"\d+", sklearn.__version__)[:2])
assert _sklearn_major_minor >= (0, 20), "Scikit-Learn >= 0.20 is required, found " + sklearn.__version__

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "ensembles"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure as ``<fig_id>.<fig_extension>`` under IMAGES_PATH.

    fig_id        -- base file name without extension; also echoed in the log line
    tight_layout  -- when true, plt.tight_layout() is called before saving
    fig_extension -- file extension, also handed to savefig as the output format
    resolution    -- handed to savefig as the dpi value
    """
    file_name = "".join((fig_id, ".", fig_extension))
    target = os.path.join(IMAGES_PATH, file_name)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(target, format=fig_extension, dpi=resolution)

In [None]:
import pip
import random
import sys

# Optional dependency bootstrap, left disabled:
#if not 'sklearn' in sys.modules.keys():
#    pip.main(['install', 'sklearn'])
#if not 'kaggle' in sys.modules.keys():
#    pip.main(['install', 'kaggle'])

# Seed Python's built-in random generator and announce the seed in the cell output
random.seed(2020)
print("Random number with seed 2020")

Random number with seed 2020


In [None]:
import numpy as np
import pandas as pd
import graphviz

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

from sklearn.tree import DecisionTreeClassifier, export_graphviz #Da cambiare
from sklearn.impute import SimpleImputer

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

import matplotlib.pyplot as plt

# Load the training set from GitHub (the commented line points at an alternative, pre-processed file)
#train = pd.read_csv('https://raw.githubusercontent.com/AndreaBertoglio/MLDM/master/Pre-processing/Data%20Set%20elaborati/trainingSet_Outliers_feature_3_globale_40.csv')
TRAIN_CSV_URL = 'https://raw.githubusercontent.com/serivan/mldmlab/master/Datasets/Kaggle2020/train.csv'
train = pd.read_csv(TRAIN_CSV_URL)


In [None]:
# Classes: 1 = Good, 0 = Disappointing
_quality_labels = train["Quality"]
# Encode only while the column still holds the raw text labels, so re-running
# this cell does not fail on the already-encoded integers (.str needs strings).
if not pd.api.types.is_numeric_dtype(_quality_labels):
    # na=False: a missing label would otherwise produce NaN, which np.where
    # treats as truthy and would silently encode as 1 (Good).
    train["Quality"] = np.where(_quality_labels.str.contains("Good", na=False), 1, 0)
# Variant kept for a dataset whose labels are stored as numbers:
#train["Quality"] = np.where(train["Quality"].astype(str).str.contains("1.0"), 1, 0)

In [None]:
# Separate the frame into the target (y) and the predictors (X).
# The target is the class label, i.e. the Quality column.
train_y = train["Quality"]

# Columns used as predictors; the ID column is deliberately left out.
predictor_cols = [
    'fixed.acidity',
    'volatile.acidity',
    'citric.acid',
    'residual.sugar',
    'chlorides',
    'free.sulfur.dioxide',
    'total.sulfur.dioxide',
    'density',
    'pH',
    'sulphates',
    'alcohol',
]
train_X = train[predictor_cols]

# Missing values are filled by an IterativeImputer fitted on the predictors
# (the mean-based SimpleImputer below is a disabled alternative).
#imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp = IterativeImputer(missing_values=np.nan, max_iter=40).fit(train_X)

# Apply the fitted imputer to the training predictors; this is what the model is trained on
train_X_imp = imp.transform(train_X)


In [None]:
# Hold out part of the data for evaluation. No train_size is passed, so the split is
# scikit-learn's default of 75% train / 25% test (spelled out below); an 80/20 split
# would need train_size=0.8 instead.
xTrain, xTest, yTrain, yTest = train_test_split(
    train_X_imp,
    train_y,
    test_size=0.25,
    random_state=0,
)

In [None]:
def generateSubmission(myModel, submissionFile, description):
    """Predict the competition test set with ``myModel`` and write a submission CSV.

    myModel        -- fitted estimator exposing ``predict``
    submissionFile -- path of the CSV to write (columns: Id, Quality)
    description    -- submission message; only referenced by the disabled
                      Kaggle command at the end, so it is currently unused

    Relies on the notebook-level ``predictor_cols`` and the fitted imputer ``imp``.
    Returns nothing; the predictions are printed as a quick sanity check.
    """
    # Download the test set
    test_frame = pd.read_csv('https://raw.githubusercontent.com/serivan/mldmlab/master/Datasets/Kaggle2020/test.csv')

    # Treat the test data like the training data: same columns, same fitted imputer
    test_features = imp.transform(test_frame[predictor_cols])

    predictions = myModel.predict(test_features)
    # Echo the predicted qualities to make sure they look sensible
    print(predictions)

    # Write the submission file; any file name works
    submission = pd.DataFrame({'Id': test_frame.Id, 'Quality': predictions})
    submission.to_csv(submissionFile, index=False)

    # Submit automatically; Kaggle API authentication needed
    #!kaggle competitions submit -c mldm-classification-competition-2020 -f {submissionFile} -m '{description}'

In [None]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# Bagging ensemble: 400 decision trees, each trained on a bootstrap sample the size of
# the training set (max_samples=1.0); out-of-bag scoring is enabled.
# An earlier configuration (200 trees, max_samples=2500, bootstrap_features=True) used to
# be constructed here and was immediately overwritten without ever being fitted, so that
# dead assignment has been dropped. Other base estimators that were tried:
# SVC(gamma="scale", tol=1e-3, random_state=42) and RandomForestClassifier(random_state=42).
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(random_state=42, min_samples_split=0.01),
    n_estimators=400,
    max_samples=1.0,
    bootstrap=True,
    oob_score=True,
    random_state=42,
)
bag_clf.fit(xTrain, yTrain)

# Predictions on the held-out split, scored in the next cell
y_pred = bag_clf.predict(xTest)

In [None]:
from sklearn.metrics import accuracy_score

# Accuracy of the bagging ensemble on the held-out split
test_accuracy = accuracy_score(yTest, y_pred)
print(test_accuracy)

0.7880870561282932


In [None]:
# bag_clf has already been fitted on (xTrain, yTrain) in the bagging cell above. Fitting
# it again on the same data with the same fixed random_state would only rebuild the
# identical ensemble, so the redundant retraining is not repeated here.
my_model = bag_clf
# Accuracy on the training split itself (an optimistic figure compared with the held-out accuracy)
my_model.score(xTrain, yTrain)

0.9074923547400612

In [None]:
# Write the submission CSV for the fitted bagging model
generateSubmission(
    myModel=my_model,
    submissionFile='BaggingEnsembles18.csv',
    description="User defined Bagging Ensembles",
)

[1 1 1 ... 1 1 1]
