In [1]:
import pandas as pd
from sklearn.datasets import load_diabetes, load_wine
from sklearn.model_selection import train_test_split as tts

In [2]:
# Load the diabetes Bunch once and reuse it, instead of re-loading the dataset
# for every attribute that is read from it.
diabetes_bunch = load_diabetes()
diabetes = pd.DataFrame(diabetes_bunch.data, columns=diabetes_bunch.feature_names)
# Keep the regression target next to the features in the same frame.
diabetes['target'] = diabetes_bunch.target

In [3]:
# Preview the first rows of the diabetes frame (features plus the 'target' column).
diabetes.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,target
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019908,-0.017646,151.0
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.06833,-0.092204,75.0
2,0.085299,0.05068,0.044451,-0.005671,-0.045599,-0.034194,-0.032356,-0.002592,0.002864,-0.02593,141.0
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022692,-0.009362,206.0
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031991,-0.046641,135.0


In [4]:
# Show the loader's signature and docstring.
help(load_diabetes)

Help on function load_diabetes in module sklearn.datasets._base:

load_diabetes(*, return_X_y=False, as_frame=False)
    Load and return the diabetes dataset (regression).
    
    Samples total    442
    Dimensionality   10
    Features         real, -.2 < x < .2
    Targets          integer 25 - 346
    
    Read more in the :ref:`User Guide <diabetes_dataset>`.
    
    Parameters
    ----------
    return_X_y : bool, default=False.
        If True, returns ``(data, target)`` instead of a Bunch object.
        See below for more information about the `data` and `target` object.
    
        .. versionadded:: 0.18
    
    as_frame : bool, default=False
        If True, the data is a pandas DataFrame including columns with
        appropriate dtypes (numeric). The target is
        a pandas DataFrame or Series depending on the number of target columns.
        If `return_X_y` is True, then (`data`, `target`) will be pandas
        DataFrames or Series as described below.
    
      

In [5]:
# Show the loader's signature and docstring.
help(load_wine)

Help on function load_wine in module sklearn.datasets._base:

load_wine(*, return_X_y=False, as_frame=False)
    Load and return the wine dataset (classification).
    
    .. versionadded:: 0.18
    
    The wine dataset is a classic and very easy multi-class classification
    dataset.
    
    Classes                          3
    Samples per class        [59,71,48]
    Samples total                  178
    Dimensionality                  13
    Features            real, positive
    
    Read more in the :ref:`User Guide <wine_dataset>`.
    
    Parameters
    ----------
    return_X_y : bool, default=False.
        If True, returns ``(data, target)`` instead of a Bunch object.
        See below for more information about the `data` and `target` object.
    
    as_frame : bool, default=False
        If True, the data is a pandas DataFrame including columns with
        appropriate dtypes (numeric). The target is
        a pandas DataFrame or Series depending on the number of ta

In [6]:
# Print the description text bundled with the wine dataset.
print(load_wine().DESCR)

.. _wine_dataset:

Wine recognition dataset
------------------------

**Data Set Characteristics:**

    :Number of Instances: 178 (50 in each of three classes)
    :Number of Attributes: 13 numeric, predictive attributes and the class
    :Attribute Information:
 		- Alcohol
 		- Malic acid
 		- Ash
		- Alcalinity of ash  
 		- Magnesium
		- Total phenols
 		- Flavanoids
 		- Nonflavanoid phenols
 		- Proanthocyanins
		- Color intensity
 		- Hue
 		- OD280/OD315 of diluted wines
 		- Proline

    - class:
            - class_0
            - class_1
            - class_2
		
    :Summary Statistics:
    
                                   Min   Max   Mean     SD
    Alcohol:                      11.0  14.8    13.0   0.8
    Malic Acid:                   0.74  5.80    2.34  1.12
    Ash:                          1.36  3.23    2.36  0.27
    Alcalinity of Ash:            10.6  30.0    19.5   3.3
    Magnesium:                    70.0 162.0    99.7  14.3
    Total Phenols:                0

In [7]:
# Load the wine Bunch once and reuse it, instead of re-loading the dataset
# for every attribute that is read from it.
wine_bunch = load_wine()
wine = pd.DataFrame(wine_bunch.data, columns=wine_bunch.feature_names)
# Keep the class label next to the features in the same frame.
wine['target'] = wine_bunch.target
wine.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0,0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0,0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0,0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0


In [8]:
# Distinct class labels present in the target column.
wine['target'].unique()

array([0, 1, 2])

# Gradiente Descendente Estocástico (SGD)

![](data/grad1.png)

![](data/grad2.jpg)

### Gradiente para regresión

In [9]:
from sklearn.linear_model import SGDRegressor as SGDR

In [10]:
# Hold out a test split of the diabetes data. A fixed random_state makes the
# split (and every score computed from it) reproducible across kernel restarts.
X_train_di, X_test_di, y_train_di, y_test_di = tts(
    diabetes.drop('target', axis=1),
    diabetes['target'],
    random_state=42,
)

In [11]:
# Linear regressor trained with stochastic gradient descent and an L2 penalty.
sgdr = SGDR(max_iter = 10000, penalty = 'l2') 
# max_iter is the number of iterations (steps) the model may take to reach the minimum, i.e. to converge.

sgdr.fit(X_train_di,y_train_di)

SGDRegressor(max_iter=10000)

In [12]:
# "R2": how good the model is, measured on both the training split and the test split.
train_score = sgdr.score(X_train_di, y_train_di)
test_score = sgdr.score(X_test_di, y_test_di)

In [13]:
# Note the order: the test score is printed first, then the train score.
print(test_score,train_score)

0.4047300851594975 0.5362387374552366


### Gradiente para clasificación

In [14]:
from sklearn.linear_model import SGDClassifier as SGDC

In [15]:
# Hold out a test split of the wine data. random_state makes the split
# reproducible; stratify keeps the three class proportions the same in the
# train and test parts.
X_train_vi, X_test_vi, y_train_vi, y_test_vi = tts(
    wine.drop('target', axis=1),
    wine['target'],
    random_state=42,
    stratify=wine['target'],
)

In [17]:
# Linear classifier trained with stochastic gradient descent, allowed up to 8000 iterations.
# NOTE(review): the wine features are on very different scales (e.g. proline in the
# hundreds/thousands vs. hue around 1) and are fed in unscaled; SGD is sensitive to
# feature scaling, so standardising first would likely help — confirm.
sgdc = SGDC(max_iter = 8000)

sgdc.fit(X_train_vi,y_train_vi)

SGDClassifier(max_iter=8000)

In [18]:
# Score the fitted classifier on the training split and on the held-out split,
# then print both (train first, test second).
train_score_vi=sgdc.score(X_train_vi, y_train_vi) # Accuracy
test_score_vi=sgdc.score(X_test_vi, y_test_vi)
print(train_score_vi,test_score_vi)

0.6842105263157895 0.7555555555555555


In [19]:
# Bare class name of the fitted estimator, without its module path.
type(sgdc).__name__

'SGDClassifier'

# Checando modelos

In [20]:
# Naive Bayes classifiers
from sklearn.naive_bayes import GaussianNB as GNB  # Naive Bayes classifier for continuous features
from sklearn.naive_bayes import MultinomialNB as MNB  # Naive Bayes for discrete (categorical/count) features
from sklearn.naive_bayes import ComplementNB as CNB
from sklearn.naive_bayes import BernoulliNB as BNB  # Built on Bernoulli trials: features are successes/failures

# Nearest-neighbour models
from sklearn.neighbors import KNeighborsRegressor as KNNR
from sklearn.neighbors import KNeighborsClassifier as KNNC

# Boosting models
from sklearn.ensemble import GradientBoostingRegressor as GBR
from sklearn.ensemble import GradientBoostingClassifier as GBC

from xgboost import XGBRegressor as XGBR  # Extreme Gradient Boosting
from xgboost import XGBClassifier as XGBC
from catboost import CatBoostClassifier as CATB
from catboost import CatBoostRegressor as CTR
from lightgbm import LGBMRegressor as LGBMR
from lightgbm import LGBMClassifier as LGBMC

# Estimator classes (not instances) to benchmark. CATB was imported but left
# out of the list; it is included so every regressor/classifier pair is present.
lista_m = [GNB, MNB, CNB, BNB, KNNR, KNNC, GBR, GBC, XGBR, XGBC, CTR, CATB, LGBMR, LGBMC]


In [22]:
def modeling_testing(lista_modelos, data, target, random_state=None):
    """Fit each estimator class with default settings and print its scores.

    The data is split once into train and test parts; every model is then
    instantiated with no arguments, fitted on the train part, and scored on
    both parts through its own ``score`` method.

    Parameters
    ----------
    lista_modelos : iterable
        Estimator classes (not instances). Each one is called with no
        arguments and must provide ``fit(X, y)`` and ``score(X, y)``.
    data : pandas.DataFrame
        Frame holding the feature columns and the target column.
    target : str
        Name of the column in ``data`` used as the label; all other
        columns are used as features.
    random_state : optional
        Forwarded to the train/test split. The default ``None`` keeps the
        original unseeded behaviour.

    Returns
    -------
    None
        Results are printed, not returned.

    Notes
    -----
    The printed number is whatever the estimator's ``score`` returns. When
    classifiers and regressors are mixed in ``lista_modelos`` the metrics
    differ (accuracy vs. R2 elsewhere in this notebook), so the values are
    not directly comparable across the two kinds.
    """
    X_train, X_test, y_train, y_test = tts(
        data.loc[:, data.columns != target],
        data[target],
        random_state=random_state,
    )

    for model_cls in lista_modelos:
        modelo = model_cls()
        modelo.fit(X_train, y_train)
        train_score = modelo.score(X_train, y_train)
        test_score = modelo.score(X_test, y_test)
        # Use the class's own name: splitting str(cls) on '.' left a trailing "'>".
        print('Modelo:', getattr(model_cls, '__name__', str(model_cls)))
        print('Train score:', train_score,'\nTest score:',test_score,'\n')


In [23]:
# Fit and score every estimator class in lista_m on the wine frame, using 'target' as the label column.
modeling_testing(lista_m,wine,'target')

Modelo: GaussianNB'>
Train score: 0.9849624060150376 
Test score: 0.9777777777777777 

Modelo: MultinomialNB'>
Train score: 0.8796992481203008 
Test score: 0.8666666666666667 

Modelo: ComplementNB'>
Train score: 0.6541353383458647 
Test score: 0.6888888888888889 

Modelo: BernoulliNB'>
Train score: 0.39849624060150374 
Test score: 0.4 

Modelo: KNeighborsRegressor'>
Train score: 0.6345654648956358 
Test score: 0.6901153212520592 

Modelo: KNeighborsClassifier'>
Train score: 0.7819548872180451 
Test score: 0.7111111111111111 

Modelo: GradientBoostingRegressor'>
Train score: 0.9999999992944921 
Test score: 0.7859362361516179 

Modelo: GradientBoostingClassifier'>
Train score: 1.0 
Test score: 0.9111111111111111 

Modelo: XGBRegressor'>
Train score: 0.9999998285802233 
Test score: 0.7521874246685439 

Modelo: XGBClassifier'>
Train score: 1.0 
Test score: 0.9555555555555556 

Learning rate set to 0.027255
0:	learn: 0.7569168	total: 136ms	remaining: 2m 15s
1:	learn: 0.7466186	total: 137ms

185:	learn: 0.0659407	total: 285ms	remaining: 1.25s
186:	learn: 0.0654730	total: 285ms	remaining: 1.24s
187:	learn: 0.0650300	total: 287ms	remaining: 1.24s
188:	learn: 0.0646221	total: 288ms	remaining: 1.24s
189:	learn: 0.0641374	total: 289ms	remaining: 1.23s
190:	learn: 0.0637899	total: 289ms	remaining: 1.23s
191:	learn: 0.0634642	total: 290ms	remaining: 1.22s
192:	learn: 0.0629795	total: 291ms	remaining: 1.22s
193:	learn: 0.0625375	total: 292ms	remaining: 1.21s
194:	learn: 0.0621272	total: 292ms	remaining: 1.21s
195:	learn: 0.0618257	total: 293ms	remaining: 1.2s
196:	learn: 0.0613860	total: 294ms	remaining: 1.2s
197:	learn: 0.0611597	total: 295ms	remaining: 1.19s
198:	learn: 0.0608023	total: 295ms	remaining: 1.19s
199:	learn: 0.0604223	total: 296ms	remaining: 1.19s
200:	learn: 0.0601157	total: 297ms	remaining: 1.18s
201:	learn: 0.0597987	total: 298ms	remaining: 1.18s
202:	learn: 0.0594946	total: 299ms	remaining: 1.17s
203:	learn: 0.0592624	total: 300ms	remaining: 1.17s
204:	learn: 0.

367:	learn: 0.0339920	total: 434ms	remaining: 745ms
368:	learn: 0.0338986	total: 435ms	remaining: 744ms
369:	learn: 0.0337524	total: 436ms	remaining: 742ms
370:	learn: 0.0336645	total: 436ms	remaining: 740ms
371:	learn: 0.0335370	total: 437ms	remaining: 738ms
372:	learn: 0.0334401	total: 438ms	remaining: 736ms
373:	learn: 0.0333823	total: 439ms	remaining: 734ms
374:	learn: 0.0333076	total: 439ms	remaining: 732ms
375:	learn: 0.0331529	total: 440ms	remaining: 730ms
376:	learn: 0.0329307	total: 441ms	remaining: 728ms
377:	learn: 0.0328752	total: 441ms	remaining: 726ms
378:	learn: 0.0326517	total: 442ms	remaining: 724ms
379:	learn: 0.0325645	total: 443ms	remaining: 723ms
380:	learn: 0.0323352	total: 444ms	remaining: 722ms
381:	learn: 0.0323049	total: 445ms	remaining: 720ms
382:	learn: 0.0322713	total: 446ms	remaining: 718ms
383:	learn: 0.0321883	total: 446ms	remaining: 716ms
384:	learn: 0.0320104	total: 447ms	remaining: 715ms
385:	learn: 0.0319361	total: 448ms	remaining: 713ms
386:	learn: 

539:	learn: 0.0180456	total: 592ms	remaining: 505ms
540:	learn: 0.0179682	total: 593ms	remaining: 503ms
541:	learn: 0.0179209	total: 594ms	remaining: 502ms
542:	learn: 0.0178471	total: 595ms	remaining: 501ms
543:	learn: 0.0177686	total: 596ms	remaining: 500ms
544:	learn: 0.0177049	total: 598ms	remaining: 499ms
545:	learn: 0.0176128	total: 599ms	remaining: 498ms
546:	learn: 0.0175576	total: 600ms	remaining: 497ms
547:	learn: 0.0174517	total: 601ms	remaining: 495ms
548:	learn: 0.0173701	total: 601ms	remaining: 494ms
549:	learn: 0.0173326	total: 603ms	remaining: 494ms
550:	learn: 0.0173067	total: 604ms	remaining: 493ms
551:	learn: 0.0172668	total: 605ms	remaining: 491ms
552:	learn: 0.0171660	total: 606ms	remaining: 490ms
553:	learn: 0.0170807	total: 607ms	remaining: 488ms
554:	learn: 0.0169927	total: 607ms	remaining: 487ms
555:	learn: 0.0168994	total: 608ms	remaining: 485ms
556:	learn: 0.0167685	total: 609ms	remaining: 484ms
557:	learn: 0.0166689	total: 609ms	remaining: 483ms
558:	learn: 

704:	learn: 0.0092103	total: 731ms	remaining: 306ms
705:	learn: 0.0091631	total: 732ms	remaining: 305ms
706:	learn: 0.0091519	total: 733ms	remaining: 304ms
707:	learn: 0.0091209	total: 734ms	remaining: 303ms
708:	learn: 0.0090929	total: 734ms	remaining: 301ms
709:	learn: 0.0090438	total: 735ms	remaining: 300ms
710:	learn: 0.0089933	total: 736ms	remaining: 299ms
711:	learn: 0.0089703	total: 736ms	remaining: 298ms
712:	learn: 0.0089320	total: 737ms	remaining: 297ms
713:	learn: 0.0088874	total: 738ms	remaining: 296ms
714:	learn: 0.0088539	total: 739ms	remaining: 294ms
715:	learn: 0.0088108	total: 739ms	remaining: 293ms
716:	learn: 0.0087800	total: 740ms	remaining: 292ms
717:	learn: 0.0087528	total: 741ms	remaining: 291ms
718:	learn: 0.0087189	total: 741ms	remaining: 290ms
719:	learn: 0.0086794	total: 742ms	remaining: 289ms
720:	learn: 0.0086397	total: 743ms	remaining: 287ms
721:	learn: 0.0086084	total: 743ms	remaining: 286ms
722:	learn: 0.0085703	total: 744ms	remaining: 285ms
723:	learn: 

877:	learn: 0.0047017	total: 880ms	remaining: 122ms
878:	learn: 0.0046936	total: 881ms	remaining: 121ms
879:	learn: 0.0046847	total: 882ms	remaining: 120ms
880:	learn: 0.0046573	total: 883ms	remaining: 119ms
881:	learn: 0.0046392	total: 883ms	remaining: 118ms
882:	learn: 0.0046224	total: 884ms	remaining: 117ms
883:	learn: 0.0046068	total: 885ms	remaining: 116ms
884:	learn: 0.0045864	total: 886ms	remaining: 115ms
885:	learn: 0.0045558	total: 886ms	remaining: 114ms
886:	learn: 0.0045379	total: 887ms	remaining: 113ms
887:	learn: 0.0045138	total: 888ms	remaining: 112ms
888:	learn: 0.0045010	total: 889ms	remaining: 111ms
889:	learn: 0.0044843	total: 889ms	remaining: 110ms
890:	learn: 0.0044628	total: 890ms	remaining: 109ms
891:	learn: 0.0044385	total: 891ms	remaining: 108ms
892:	learn: 0.0044215	total: 892ms	remaining: 107ms
893:	learn: 0.0044067	total: 893ms	remaining: 106ms
894:	learn: 0.0043933	total: 894ms	remaining: 105ms
895:	learn: 0.0043846	total: 894ms	remaining: 104ms
896:	learn: 

# K-Nearest Neighbors

![](data/knn.png)

# Ensemble Models


### Bagging  -  Boosting


![](data/boost.png)

#### CATBOOST

In [24]:
# CatBoost is a framework (library) usable from Python for gradient boosting on decision trees.
import numpy as np

from catboost import CatBoostClassifier, Pool

# Seeded generator so the synthetic data is the same on every run.
rng = np.random.RandomState(42)

# initialize data: 100 rows x 10 integer features in [0, 100), with labels in {0, 1}
train_data = rng.randint(0,
                         100,
                         size=(100, 10))

train_labels = rng.randint(0,
                           2,
                           size=(100))

# NOTE: the "test" pool wraps the training data and labels themselves,
# so the predictions printed below are in-sample.
test_data = catboost_pool = Pool(train_data,
                                 train_labels)

model = CatBoostClassifier(iterations=2,
                           depth=2,
                           learning_rate=1,
                           loss_function='Logloss',
                           silent = True)

model.fit(train_data, train_labels)
# make the prediction using the resulting model
preds_class = model.predict(test_data)
preds_proba = model.predict_proba(test_data)
print("class = ", preds_class)
print("proba = ", preds_proba)

class =  [0 0 1 0 1 1 0 0 1 1 0 1 0 1 1 0 0 0 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 1 0 0
 0 0 1 0 1 1 1 0 1 0 1 1 0 0 1 1 0 0 1 1 1 0 1 0 0 0 0 0 1 0 1 0 0 1 0 1 0
 0 1 1 0 0 0 1 1 0 0 1 0 0 1 1 0 1 0 1 0 1 0 1 0 1 0]
proba =  [[0.54381614 0.45618386]
 [0.67030015 0.32969985]
 [0.412347   0.587653  ]
 [0.54381614 0.45618386]
 [0.412347   0.587653  ]
 [0.38439808 0.61560192]
 [0.54381614 0.45618386]
 [0.54381614 0.45618386]
 [0.38439808 0.61560192]
 [0.412347   0.587653  ]
 [0.54381614 0.45618386]
 [0.412347   0.587653  ]
 [0.54381614 0.45618386]
 [0.44416459 0.55583541]
 [0.38439808 0.61560192]
 [0.67030015 0.32969985]
 [0.54381614 0.45618386]
 [0.5286364  0.4713636 ]
 [0.44416459 0.55583541]
 [0.31389282 0.68610718]
 [0.412347   0.587653  ]
 [0.44416459 0.55583541]
 [0.44416459 0.55583541]
 [0.43733238 0.56266762]
 [0.54381614 0.45618386]
 [0.54381614 0.45618386]
 [0.54381614 0.45618386]
 [0.54381614 0.45618386]
 [0.67030015 0.32969985]
 [0.54381614 0.45618386]
 [0.67030015 0.32969985]
 [0

# H2O

In [25]:
import h2o
from h2o.automl import H2OAutoML

In [26]:
# Connect to an H2O instance on localhost; if none is running, a local server is started.
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
; Java HotSpot(TM) 64-Bit Server VM (build 14.0.2+12-46, mixed mode, sharing)
  Starting server from C:\Users\GralG\AppData\Roaming\Python\Python37\site-packages\h2o\backend\bin\h2o.jar
  Ice root: C:\Users\GralG\AppData\Local\Temp\tmphwtii2r4
  JVM stdout: C:\Users\GralG\AppData\Local\Temp\tmphwtii2r4\h2o_GralG_started_from_python.out
  JVM stderr: C:\Users\GralG\AppData\Local\Temp\tmphwtii2r4\h2o_GralG_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,01 secs
H2O_cluster_timezone:,America/Mexico_City
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.30.1.2
H2O_cluster_version_age:,5 days
H2O_cluster_name:,H2O_from_python_GralG_32je03
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.965 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


In [27]:
# Show the signature and docstring of h2o.frame.
help(h2o.frame)

Help on function frame in module h2o.h2o:

frame(frame_id)
    Retrieve metadata for an id that points to a Frame.
    
    :param frame_id: the key of a Frame in H2O.
    
    :returns: dict containing the frame meta-information.
    
    :examples:
    
    >>> training_data = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")
    >>> frame_summary = h2o.frame(training_data.frame_id)
    >>> frame_summary



In [None]:
# Import the cars CSV from H2O's public test-data bucket (requires network access).
training_data = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv")

In [None]:
# Display the imported frame.
training_data

In [None]:
# AutoML run capped at 20 models, with a fixed seed.
aml=H2OAutoML(max_models=20, seed=1)

In [None]:
# Train AutoML with 'economy_20mpg' as the response and every other column as a predictor.
# NOTE(review): if 'economy_20mpg' was parsed as a numeric column, this presumably runs as
# regression rather than classification; converting it with .asfactor() first may be
# intended — confirm.
aml.train(x=training_data.drop('economy_20mpg',axis = 1).columns, y='economy_20mpg', training_frame = training_data)

# Stacking


![](data/stacking.png)