## Projeto Glass Identification

1. Id number: 1 to 214
2. RI: refractive index
3. Na: Sodium (unit measurement: weight percent in corresponding oxide, as are attributes 4-10)
4. Mg: Magnesium
5. Al: Aluminum
6. Si: Silicon
7. K: Potassium
8. Ca: Calcium
9. Ba: Barium
10. Fe: Iron
11. Type of glass: (class attribute)
     -- 1 building_windows_float_processed
     -- 2 building_windows_non_float_processed
     -- 3 vehicle_windows_float_processed
     -- 4 vehicle_windows_non_float_processed (none in this database)
     -- 5 containers
     -- 6 tableware
     -- 7 headlamps

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
!pip install -q huggingface_hub

In [4]:
!pip install -q joblib

In [5]:
!pip install ucimlrepo



In [6]:
import pandas as pd

In [7]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
glass_identification = fetch_ucirepo(id=42) 
  
# data (as pandas dataframes) 
X = glass_identification.data.features 
Y = glass_identification.data.targets 
  
# metadata 
#print(glass_identification.metadata) 
  
# variable information 
print(glass_identification.variables) 

             name     role         type demographic       description  \
0       Id_number       ID      Integer        None              None   
1              RI  Feature   Continuous        None  refractive index   
2              Na  Feature   Continuous        None            Sodium   
3              Mg  Feature   Continuous        None         Magnesium   
4              Al  Feature   Continuous        None          Aluminum   
5              Si  Feature   Continuous        None           Silicon   
6               K  Feature   Continuous        None         Potassium   
7              Ca  Feature   Continuous        None           Calcium   
8              Ba  Feature   Continuous        None            Barium   
9              Fe  Feature   Continuous        None              Iron   
10  Type_of_glass   Target  Categorical        None              None   

                                    units missing_values  
0                                    None             no  
1    

In [8]:
X.head()

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe
0,1.52101,13.64,4.49,1.1,71.78,0.06,8.75,0.0,0.0
1,1.51761,13.89,3.6,1.36,72.73,0.48,7.83,0.0,0.0
2,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.0,0.0
3,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.0,0.0
4,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.0,0.0


In [9]:
Y.head()

Unnamed: 0,Type_of_glass
0,1
1,1
2,1
3,1
4,1


## Montagem de Interface

In [10]:
df = X.copy()
df.head()

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe
0,1.52101,13.64,4.49,1.1,71.78,0.06,8.75,0.0,0.0
1,1.51761,13.89,3.6,1.36,72.73,0.48,7.83,0.0,0.0
2,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.0,0.0
3,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.0,0.0
4,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.0,0.0


In [11]:
df.columns.values

array(['RI', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe'], dtype=object)

In [12]:
len(df.columns.values)

9

In [13]:
atributos = list(df.columns.values)
print(atributos)

['RI', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe']


In [14]:
#st.sidebar.title("Informe os dados do vinho")

# guardar os valores
atributos_valores = {}
for atributo in atributos:
    minimo, media, maximo = df[atributo].min(), df[atributo].mean(), df[atributo].max()
    atributos_valores[atributo] = {"min": minimo, "media": media, "max": maximo }

atributos_valores

{'RI': {'min': 1.51115, 'media': 1.5183654205607477, 'max': 1.53393},
 'Na': {'min': 10.73, 'media': 13.407850467289718, 'max': 17.38},
 'Mg': {'min': 0.0, 'media': 2.684532710280374, 'max': 4.49},
 'Al': {'min': 0.29, 'media': 1.444906542056075, 'max': 3.5},
 'Si': {'min': 69.81, 'media': 72.65093457943925, 'max': 75.41},
 'K': {'min': 0.0, 'media': 0.4970560747663551, 'max': 6.21},
 'Ca': {'min': 5.43, 'media': 8.95696261682243, 'max': 16.19},
 'Ba': {'min': 0.0, 'media': 0.17504672897196263, 'max': 3.15},
 'Fe': {'min': 0.0, 'media': 0.05700934579439253, 'max': 0.51}}

In [15]:
# As Classes
Y['Type_of_glass'].unique()

array([1, 2, 3, 5, 6, 7], dtype=int64)

In [16]:
Y['Type_of_glass'].value_counts() / len(Y) * 100

Type_of_glass
2    35.514019
1    32.710280
7    13.551402
3     7.943925
5     6.074766
6     4.205607
Name: count, dtype: float64

# Criando o modelo

## Separação dos dados em Treino e Teste

In [17]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=65) #20% em teste

x_train.shape, x_test.shape

((171, 9), (43, 9))

# O Algortimo XGBoost - Algoritmo Ensemble baseado em Árvores de Decisão

In [18]:
# Install
!pip install -q xgboost

In [19]:
import xgboost as xgb

# Preparação dos dados de Saída

In [20]:
set(y_train)
y_train.head()

Unnamed: 0,Type_of_glass
208,7
17,1
92,2
165,5
196,7


# Treinar o modelo com dados de entrada (X) e atributo de saída (y)

In [21]:
import numpy as np

## Funções para converter os Labels (y) para o Modelo XGBosst

In [22]:
def converte_clasficacao_XGBoost_para_classe(classificacao_vidro_XGBoost):
    
    if classificacao_vidro_XGBoost == 0:
        classificacao_vidro = 1
    elif classificacao_vidro_XGBoost == 1:
        classificacao_vidro = 2
    elif classificacao_vidro_XGBoost == 2:
        classificacao_vidro = 3
    elif classificacao_vidro_XGBoost == 3:
        classificacao_vidro = 5
    elif classificacao_vidro_XGBoost == 4:
        classificacao_vidro = 6
    elif classificacao_vidro_XGBoost == 5:
        classificacao_vidro = 7

    return classificacao_vidro 

In [23]:
def converte_clasficacao_classe_para_XGBoost(classificacao_vidro):
    
    if classificacao_vidro == 1:
        classificacao_vidro = 0
    elif classificacao_vidro == 2:
        classificacao_vidro = 1
    elif classificacao_vidro == 3:
        classificacao_vidro = 2
    elif classificacao_vidro == 5:
        classificacao_vidro = 3
    elif classificacao_vidro == 6:
        classificacao_vidro = 4
    elif classificacao_vidro == 7:
        classificacao_vidro = 5

    return classificacao_vidro

In [24]:
labels = [1, 2, 3, 5, 6, 7]
print(labels)

[1, 2, 3, 5, 6, 7]


In [25]:
labels_XGBoost = [0, 1, 2, 3, 4, 5]
labels_XGBoost

[0, 1, 2, 3, 4, 5]

In [26]:
for classe in labels:
    print('classe Vidro: ' + str(classe) + ' - Classe XGBoost: '  +  \
           str(converte_clasficacao_classe_para_XGBoost(classe)))

classe Vidro: 1 - Classe XGBoost: 0
classe Vidro: 2 - Classe XGBoost: 1
classe Vidro: 3 - Classe XGBoost: 2
classe Vidro: 5 - Classe XGBoost: 3
classe Vidro: 6 - Classe XGBoost: 4
classe Vidro: 7 - Classe XGBoost: 5


## Preparar os dados para o algoritmo XGBoost - Converter/transformar os dados 

In [27]:
y = list()

for classe in list(y_train.values):

    y_train_XGBoost = converte_clasficacao_classe_para_XGBoost(classe)
    #print(str(classe) + ' - ' + str(y_train_XGBoost))
    y.append(y_train_XGBoost)
    
print(y[0:5])

[5, 0, 1, 3, 5]


In [28]:
set(y)

{0, 1, 2, 3, 4, 5}

## Treinar o modelo XGboost

In [29]:
%%time
learning_rate = 1.1   
xgb_classifier = xgb.XGBClassifier(eta = learning_rate , n_estimators=200)
xgb_classifier.fit(x_train.values, y)
xgb_classifier

CPU times: total: 375 ms
Wall time: 123 ms


## Comparando os 2 modelos - Métricas

In [30]:
y = list()

for classe in list(y_train.values):

    y_train_XGBoost = converte_clasficacao_classe_para_XGBoost(classe)
    #print(str(classe) + ' - ' + str(y_train_XGBoost))
    y.append(y_train_XGBoost)
    
print(y[0:5])

[5, 0, 1, 3, 5]


In [31]:
from sklearn.metrics import confusion_matrix, accuracy_score

y_pred_XGBoost = xgb_classifier.predict(x_test)

y_test_modelo = list()
for classe in list(y_test.values):
    y_train_XGBoost = converte_clasficacao_classe_para_XGBoost(classe)
    #print(str(classe) + ' - ' + str(y_train_XGBoost))
    y_test_modelo.append(y_train_XGBoost)

#y_pred_XGBoost = y_test_modelo

acc_XGBoost = accuracy_score(y_test_modelo, y_pred_XGBoost)
print('Acurácia do modelo XGBoost:', acc_XGBoost)
cm = confusion_matrix(y_test_modelo, y_pred_XGBoost)
print('Matriz de Confusão - XGBoost:\n',cm)

Acurácia do modelo XGBoost: 0.7441860465116279
Matriz de Confusão - XGBoost:
 [[12  4  0  0  0  0]
 [ 3 12  0  0  1  0]
 [ 1  1  2  0  0  0]
 [ 0  0  0  2  0  1]
 [ 0  0  0  0  1  0]
 [ 0  0  0  0  0  3]]


## Dados do Modelo

In [32]:
modelo = xgb_classifier

In [33]:
modelo.get_params()

{'objective': 'multi:softprob',
 'base_score': None,
 'booster': None,
 'callbacks': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': None,
 'device': None,
 'early_stopping_rounds': None,
 'enable_categorical': False,
 'eval_metric': None,
 'feature_types': None,
 'gamma': None,
 'grow_policy': None,
 'importance_type': None,
 'interaction_constraints': None,
 'learning_rate': None,
 'max_bin': None,
 'max_cat_threshold': None,
 'max_cat_to_onehot': None,
 'max_delta_step': None,
 'max_depth': None,
 'max_leaves': None,
 'min_child_weight': None,
 'missing': nan,
 'monotone_constraints': None,
 'multi_strategy': None,
 'n_estimators': 200,
 'n_jobs': None,
 'num_parallel_tree': None,
 'random_state': None,
 'reg_alpha': None,
 'reg_lambda': None,
 'sampling_method': None,
 'scale_pos_weight': None,
 'subsample': None,
 'tree_method': None,
 'validate_parameters': None,
 'verbosity': None,
 'eta': 1.1}

In [34]:
modelo.get_metadata_routing()

{'fit': {'sample_weight': None, 'base_margin': None, 'eval_set': None, 'eval_metric': None, 'early_stopping_rounds': None, 'verbose': None, 'xgb_model': None, 'sample_weight_eval_set': None, 'base_margin_eval_set': None, 'feature_weights': None, 'callbacks': None}, 'predict': {'output_margin': None, 'validate_features': None, 'base_margin': None, 'iteration_range': None}, 'predict_proba': {'validate_features': None, 'base_margin': None, 'iteration_range': None}, 'score': {'sample_weight': None}}

In [35]:
dir(modelo)

['_Booster',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__sklearn_clone__',
 '__sklearn_is_fitted__',
 '__slotnames__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_build_request_for_signature',
 '_can_use_inplace_predict',
 '_check_feature_names',
 '_check_n_features',
 '_configure_fit',
 '_create_dmatrix',
 '_estimator_type',
 '_get_default_requests',
 '_get_iteration_range',
 '_get_metadata_request',
 '_get_param_names',
 '_get_tags',
 '_get_type',
 '_load_model_attributes',
 '_more_tags',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_set_evaluation_result',
 '_validate_data',
 '_validate_params',
 'apply',
 'base_score',
 'best_iteration',
 'best_scor

In [36]:
modelo.classes_

array([0, 1, 2, 3, 4, 5])

In [37]:
modelo.n_features_in_

9

# Os Atributos mais Importantes

In [38]:
modelo.feature_importances_

array([0.02652423, 0.0147111 , 0.02266181, 0.03746227, 0.00541837,
       0.02232471, 0.02158999, 0.80825394, 0.04105362], dtype=float32)

In [39]:
atributos = list(X.columns.values)
print(atributos)

['RI', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe']


In [40]:
modelo.feature_importances_

array([0.02652423, 0.0147111 , 0.02266181, 0.03746227, 0.00541837,
       0.02232471, 0.02158999, 0.80825394, 0.04105362], dtype=float32)

In [41]:
dfi = pd.DataFrame( {'atributos' : atributos, 'importancia':  modelo.feature_importances_ })
dfi

Unnamed: 0,atributos,importancia
0,RI,0.026524
1,Na,0.014711
2,Mg,0.022662
3,Al,0.037462
4,Si,0.005418
5,K,0.022325
6,Ca,0.02159
7,Ba,0.808254
8,Fe,0.041054


In [42]:
dfo = dfi.sort_values(by='importancia', ascending=False).reset_index(drop=True)
dfo

Unnamed: 0,atributos,importancia
0,Ba,0.808254
1,Fe,0.041054
2,Al,0.037462
3,RI,0.026524
4,Mg,0.022662
5,K,0.022325
6,Ca,0.02159
7,Na,0.014711
8,Si,0.005418


In [43]:
atributos_importantes = dfo[:6]
atributos_importantes 

Unnamed: 0,atributos,importancia
0,Ba,0.808254
1,Fe,0.041054
2,Al,0.037462
3,RI,0.026524
4,Mg,0.022662
5,K,0.022325


In [44]:
atributos_mais_importantes = list(atributos_importantes.atributos)
atributos_mais_importantes 

['Ba', 'Fe', 'Al', 'RI', 'Mg', 'K']

# Testando modelo

In [45]:
X.tail()

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe
209,1.51623,14.14,0.0,2.88,72.61,0.08,9.18,1.06,0.0
210,1.51685,14.92,0.0,1.99,73.06,0.0,8.4,1.59,0.0
211,1.52065,14.36,0.0,2.02,73.42,0.0,8.44,1.64,0.0
212,1.51651,14.38,0.0,1.94,73.61,0.0,8.48,1.57,0.0
213,1.51711,14.23,0.0,2.08,73.36,0.0,8.62,1.67,0.0


In [66]:
x1 = X[209:].values
x1

array([[ 1.51623, 14.14   ,  0.     ,  2.88   , 72.61   ,  0.08   ,
         9.18   ,  1.06   ,  0.     ],
       [ 1.51685, 14.92   ,  0.     ,  1.99   , 73.06   ,  0.     ,
         8.4    ,  1.59   ,  0.     ],
       [ 1.52065, 14.36   ,  0.     ,  2.02   , 73.42   ,  0.     ,
         8.44   ,  1.64   ,  0.     ],
       [ 1.51651, 14.38   ,  0.     ,  1.94   , 73.61   ,  0.     ,
         8.48   ,  1.57   ,  0.     ],
       [ 1.51711, 14.23   ,  0.     ,  2.08   , 73.36   ,  0.     ,
         8.62   ,  1.67   ,  0.     ]])

In [67]:
x1_XGBoost = modelo.predict(x1)
x1_XGBoost

array([5, 5, 5, 5, 5], dtype=int64)

In [68]:
list(map(converte_clasficacao_XGBoost_para_classe, x1_XGBoost)) 

[7, 7, 7, 7, 7]

In [69]:
Y[209:] #Verificando resultados da "predição" do modelo

Unnamed: 0,Type_of_glass
209,7
210,7
211,7
212,7
213,7


# Deploy da Parte1 
## Salvar o modelo treinado

In [70]:
import pickle as pkl

filehandler = open("ModeloXGBoostVidro.pkl" ,"wb")
pkl.dump(modelo,filehandler)
filehandler.close()

!dir -lah *.pkl

 O volume na unidade C nÆo tem nome.
 O N£mero de S‚rie do Volume ‚ 1C6C-FE41

 Pasta de C:\Users\Arthur\Desktop\MeuRepositorio\Data\Projeto Glass Identidication


 Pasta de C:\Users\Arthur\Desktop\MeuRepositorio\Data\Projeto Glass Identidication

08/11/2023  12:55           886.872 ModeloXGBoostVidro.pkl
               1 arquivo(s)        886.872 bytes
               0 pasta(s)   17.495.638.016 bytes dispon¡veis


# Carregar o modelo treinado

In [71]:
file = "ModeloXGBoostVidro.pkl"
with open(file, 'rb') as f:
    modelo_lido = pkl.load(f)
    
modelo_lido