<a href="https://colab.research.google.com/github/BustamJos3/ia4eng_jdbs_afmp_project/blob/WorkOnModelsLearningPlots%2FBustamJos3/03_modelos.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Contents
* loading from 02-preprocesado.ipynb
* partition train_test

# Modules

In [1]:
#modules
import numpy as np
import pandas as pd
import sklearn as sk
from sklearn.model_selection import train_test_split

# Loading

In [3]:
#load dataset from github in .json
df_prepro=pd.read_json("https://raw.githubusercontent.com/BustamJos3/ia4eng_jdbs_afmp_project/main/02preprocesado.json",orient='index')
df_prepro.head()

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,Wilderness_Area4
0,2596.0,51.0,3.0,258,0,510,221.0,148.0,6279,0
1,2590.0,56.0,2.0,212,-6,390,220.0,151.0,6225,0
2,2804.0,139.0,9.0,268,65,3180,234.0,135.0,6121,0
3,2785.0,155.0,18.0,242,118,3090,238.0,122.0,6211,0
4,2595.0,45.0,2.0,153,-1,391,220.0,150.0,6172,0


In [4]:
#load dataset from github in .json
df_explor=pd.read_json("https://raw.githubusercontent.com/BustamJos3/ia4eng_jdbs_afmp_project/WorkOnModelsLearningPlots/BustamJos3/01exploracion.json",orient='index')['Cover_Type']
df_explor.head()

0    5
1    5
2    2
3    2
4    5
Name: Cover_Type, dtype: int64

# TrainTest Splitting

In [5]:
# get x,y
X,y=df_prepro, df_explor
#TrainTestSplit 1/3 of data, shuffle data before splitting
X_train, X_testValidation, y_train, y_testValidation = train_test_split(X, y, test_size=0.33, random_state=42, shuffle=True)

# Hyperparameters


In [6]:
#hyperparameter selection method
from sklearn.model_selection import GridSearchCV

## Supervised

### Random forest

In [None]:
#import model
from sklearn.ensemble import RandomForestClassifier
#create model
RForest=RandomForestClassifier()

In [None]:
#which are the parameters in this case?
RForest.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

#### What are the chosen?
For this model, 3 parameters will be chosen:
* ```'n_estimators'```
* ```'max_samples'```
* ```'criterion'```

In [None]:
#dictionary with selected parameters to begin the hyperparemeter search
parameters={'criterion':('gini','entropy'),
            'n_estimators':(10,20,30),
            'max_samples':(1/4,2/4,3/4)
            }
#create gridSearch variable
gridCV_RF=GridSearchCV(RForest,parameters,scoring='accuracy')

In [None]:
#fit the grid
#fit works with numpy arrays, so .values on df's
gridCV_RF.fit(X_train.values,y_train.values)

GridSearchCV(estimator=RandomForestClassifier(),
             param_grid={'criterion': ('gini', 'entropy'),
                         'max_samples': (0.25, 0.5, 0.75),
                         'n_estimators': (10, 20, 30)},
             scoring='accuracy')

In [None]:
#ranking of combinations
print(gridCV_RF.cv_results_['rank_test_score'])
#accuracy value of combinations
print(gridCV_RF.cv_results_['mean_test_score'])

[18 16 12 14  8  4 10  5  2 17 13 11 15  7  6  9  3  1]
[0.72744324 0.74728529 0.75666338 0.75281343 0.77176703 0.78173741
 0.76327739 0.78144126 0.79121422 0.73050346 0.75538006 0.75903258
 0.75251728 0.77295163 0.77956565 0.76821323 0.78262586 0.7946693 ]


In [None]:
#best accuracy result
print(gridCV_RF.best_score_)
#best hyperparameter
print(gridCV_RF.best_params_)

0.7946692991115499
{'criterion': 'entropy', 'max_samples': 0.75, 'n_estimators': 30}


### K near neaighbors (KNN)

In [None]:
#import model
from sklearn.neighbors import KNeighborsClassifier
#because KNN is suceptible to the data scales
from sklearn.preprocessing import MinMaxScaler
#create model
#amount of neighbors=sqrt of amount of data
Knn=KNeighborsClassifier()

In [None]:
#what are the parameters?
Knn.get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}

#### What are the chosen?
For this model, 2 parameters will be chosen:
* ```'algorithm'```
* ```'n_neighbors'```

In [None]:
# amount of neighbors
neighbors=int(np.sqrt(len(X_train)))
#dictionary with selected parameters to begin the hyperparemeter search
parameters={'algorithm':('ball_tree','kd_tree','brute'),
            'n_neighbors':(neighbors,neighbors-30,neighbors-40)
            }
#create gridSearch variable
gridCV_knn=GridSearchCV(Knn,parameters,scoring='accuracy')

In [None]:
#fit the grid
#fit works with numpy arrays, so .values on df's
#apply scaler 'cause of suceptibility of model to data scales
scaler=MinMaxScaler().fit(X_train.values)
gridCV_knn.fit(scaler.transform(X_train.values),y_train.values)

GridSearchCV(estimator=KNeighborsClassifier(),
             param_grid={'algorithm': ('ball_tree', 'kd_tree', 'brute'),
                         'n_neighbors': (100, 70, 60)},
             scoring='accuracy')

In [None]:
#ranking of combinations
print(gridCV_knn.cv_results_['rank_test_score'])
#accuracy value of combinations
print(gridCV_knn.cv_results_['mean_test_score'])

[7 4 1 7 4 1 7 4 1]
[0.62931885 0.64067127 0.64827246 0.62931885 0.64067127 0.64827246
 0.62931885 0.64067127 0.64827246]


In [None]:
#best accuracy result
print(gridCV_knn.best_score_)
#best hyperparameter
print(gridCV_knn.best_params_)

0.6482724580454098
{'algorithm': 'ball_tree', 'n_neighbors': 60}


## Unsupervised models

### K-means

In [76]:
from sklearn.cluster import KMeans
kMeans=KMeans(n_clusters=7)

In [None]:
kMeans.get_params()

{'algorithm': 'auto',
 'copy_x': True,
 'init': 'k-means++',
 'max_iter': 300,
 'n_clusters': 7,
 'n_init': 10,
 'random_state': None,
 'tol': 0.0001,
 'verbose': 0}

#### What are the chosen?
For this model, 3 parameters will be chosen:
* ```'init'```
* ```'tol'```

In [None]:
#dictionary with selected parameters to begin the hyperparemeter search
parameters={'init':('k-means++', 'random'),
            'tol': [0.001,0.0001],
            }
#create gridSearch variable
gridCV_KMeans=GridSearchCV(kMeans,parameters)

In [None]:
#fitting
gridCV_KMeans.fit(X_train.values)

GridSearchCV(estimator=KMeans(n_clusters=7),
             param_grid={'init': ('k-means++', 'random'),
                         'tol': [0.001, 0.0001]})

In [None]:
#ranking of combinations
print(gridCV_KMeans.cv_results_['rank_test_score'])

[2 4 1 3]


In [None]:
#best hyperparameter
print(gridCV_KMeans.best_params_)

{'init': 'random', 'tol': 0.001}


In [96]:
#lets check its properties
bestKmeans=KMeans(n_clusters=7,init='random',tol=0.001)

In [97]:
# fitting
bestKmeans.fit(X_train.values)

KMeans(init='random', n_clusters=7, tol=0.001)

In [98]:
#syntethic accuracy train
len((bestKmeans.labels_+1==y_train)[(bestKmeans.labels_+1==y_train)==True])/len(X_train)

0.21628825271470878

In [99]:
#syntethic accuracy test
len((bestKmeans.predict(X_testValidation.values)+1==y_testValidation.values)[(bestKmeans.predict(X_testValidation.values)+1==y_testValidation.values)==True])/len(X_testValidation.values)

0.21062124248496994

### MiniBatchKMeans

In [65]:
from sklearn.cluster import MiniBatchKMeans

In [66]:
miniKMeansCluster=MiniBatchKMeans(n_clusters=7)

In [67]:
miniKMeansCluster.get_params()

{'batch_size': 1024,
 'compute_labels': True,
 'init': 'k-means++',
 'init_size': None,
 'max_iter': 100,
 'max_no_improvement': 10,
 'n_clusters': 7,
 'n_init': 3,
 'random_state': None,
 'reassignment_ratio': 0.01,
 'tol': 0.0,
 'verbose': 0}

#### What are the chosen?
For this model, 3 parameters will be chosen:
* ```'affinity'```
* ```'linkage'```

In [69]:
#dictionary with selected parameters to begin the hyperparemeter search
parameters={'init':('k-means++', 'random'),
            'tol': [0.001,0.0001],
            }
gridCV_miniKC=GridSearchCV(miniKMeansCluster,parameters)

In [102]:
#fitting
gridCV_miniKC.fit(X_train.values)

GridSearchCV(estimator=MiniBatchKMeans(n_clusters=7),
             param_grid={'init': ('k-means++', 'random'),
                         'tol': [0.001, 0.0001]})

In [103]:
#ranking of combinations
print(gridCV_miniKC.cv_results_['rank_test_score'])

[2 1 4 3]


In [104]:
#best hyperparameter
print(gridCV_miniKC.best_params_)

{'init': 'k-means++', 'tol': 0.0001}


In [105]:
#lets check its properties
bestminiKmeans=MiniBatchKMeans(n_clusters=7,init='k-means++',tol=0.0001)

In [115]:
# fitting
bestminiKmeans.fit(X_train.values)

MiniBatchKMeans(n_clusters=7, tol=0.0001)

In [116]:
#syntethic accuracy train
len((bestminiKmeans.labels_+1==y_train)[(bestminiKmeans.labels_+1==y_train)==True])/len(X_train)

0.18084896347482723

In [117]:
#syntethic accuracy test
len((bestminiKmeans.predict(X_testValidation.values)+1==y_testValidation.values)[(bestminiKmeans.predict(X_testValidation.values)+1==y_testValidation.values)==True])/len(X_testValidation.values)

0.18056112224448898