<a href="https://colab.research.google.com/github/AlexeiAltamira/Portfolio/blob/main/Estimativa_de_n%C3%ADveis_de_obesidade.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Instalação das bibliotécas

In [38]:
# Importar principais bibliotecas para auxiliar na leitura, manipulação, tratamento e análise dos dados
import pandas as pd

In [39]:
from sklearn.preprocessing import LabelEncoder # Classe para transformar rótulos categóricos em números inteiros.
from sklearn.model_selection import train_test_split # Função para dividir um conjunto de dados em conjuntos de treinamento e teste.
from sklearn.linear_model import LogisticRegression # Classe para realizar regressão logística.
from sklearn.ensemble import RandomForestClassifier  # Classe para construir um modelo de classificação usando um conjunto de árvores de decisão aleatórias.
from sklearn.ensemble import GradientBoostingClassifier # Classe para construir um modelo de classificação usando a técnica de boosting com árvores de decisão.
from sklearn.tree import DecisionTreeClassifier # Classe para construir um modelo de classificação baseado em uma árvore de decisão.
from sklearn.svm import SVC # Classe para construir um modelo de classificação ou regressão usando máquinas de vetores de suporte.
from sklearn.neighbors import KNeighborsClassifier # Classe para construir um modelo de classificação baseado no algoritmo k-vizinhos mais próximos (KNN).
from sklearn.neural_network import MLPClassifier # Classe para construir um modelo de classificação usando uma rede neural perceptron multicamada (MLP).
from sklearn.naive_bayes import MultinomialNB # Classe para construir um modelo de classificação usando o algoritmo Naive Bayes multinomial.
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix # Função para calcular a precisão de um modelo de classificação.

In [40]:
pip install ucimlrepo



# Carregar arquivo

In [41]:
# Origem dos dados: https://archive.ics.uci.edu/dataset/544/estimation+of+obesity+levels+based+on+eating+habits+and+physical+condition

# Baixar o repositório com os dados a serem analisados
from ucimlrepo import fetch_ucirepo

#  Buscar conjunto de dados
estimation_of_obesity_levels_based_on_eating_habits_and_physical_condition = fetch_ucirepo(id=544)

# Dados (pandas dataframes)
features = estimation_of_obesity_levels_based_on_eating_habits_and_physical_condition.data.features
targets = estimation_of_obesity_levels_based_on_eating_habits_and_physical_condition.data.targets


# Análise dos dados

In [42]:
#Transformar os dados divididos da base em um único dataframe
junto = pd.concat([features, targets], axis=1)

In [43]:
# Verificar se o arquivo a ser analisado esta correto. Podendo ser visualizado as 3 linhas iniciais.
junto.head(3)

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,Female,21.0,1.62,64.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,0.0,1.0,no,Public_Transportation,Normal_Weight
1,Female,21.0,1.52,56.0,yes,no,3.0,3.0,Sometimes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight
2,Male,23.0,1.8,77.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,2.0,1.0,Frequently,Public_Transportation,Normal_Weight


In [44]:
# Verificar e conta se há dados nulos em todas as colunas.
junto.isna().sum()

Gender                            0
Age                               0
Height                            0
Weight                            0
family_history_with_overweight    0
FAVC                              0
FCVC                              0
NCP                               0
CAEC                              0
SMOKE                             0
CH2O                              0
SCC                               0
FAF                               0
TUE                               0
CALC                              0
MTRANS                            0
NObeyesdad                        0
dtype: int64

In [45]:
# Verificar se há alguma divergência, principalmente no Dtype dos dados.
junto.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2111 entries, 0 to 2110
Data columns (total 17 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Gender                          2111 non-null   object 
 1   Age                             2111 non-null   float64
 2   Height                          2111 non-null   float64
 3   Weight                          2111 non-null   float64
 4   family_history_with_overweight  2111 non-null   object 
 5   FAVC                            2111 non-null   object 
 6   FCVC                            2111 non-null   float64
 7   NCP                             2111 non-null   float64
 8   CAEC                            2111 non-null   object 
 9   SMOKE                           2111 non-null   object 
 10  CH2O                            2111 non-null   float64
 11  SCC                             2111 non-null   object 
 12  FAF                             21

In [46]:
# Verificar visualmente se há algum outliers
junto.describe()

Unnamed: 0,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE
count,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0
mean,24.3126,1.701677,86.586058,2.419043,2.685628,2.008011,1.010298,0.657866
std,6.345968,0.093305,26.191172,0.533927,0.778039,0.612953,0.850592,0.608927
min,14.0,1.45,39.0,1.0,1.0,1.0,0.0,0.0
25%,19.947192,1.63,65.473343,2.0,2.658738,1.584812,0.124505,0.0
50%,22.77789,1.700499,83.0,2.385502,3.0,2.0,1.0,0.62535
75%,26.0,1.768464,107.430682,3.0,3.0,2.47742,1.666678,1.0
max,61.0,1.98,173.0,3.0,4.0,3.0,3.0,2.0


In [47]:
# Verificar se há linhas com valores duplicados
junto.duplicated().sum()

24

# Pré-processamento

In [48]:
# Deixo na coluna 'Age' somente valores inteiros
junto['Age'] = junto['Age'].astype(int)

In [49]:
# Mostrar as linhas com valores duplicados
junto[junto.duplicated()]

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
98,Female,21,1.52,42.0,no,no,3.0,1.0,Frequently,no,1.0,no,0.0,0.0,Sometimes,Public_Transportation,Insufficient_Weight
106,Female,25,1.57,55.0,no,yes,2.0,1.0,Sometimes,no,2.0,no,2.0,0.0,Sometimes,Public_Transportation,Normal_Weight
174,Male,21,1.62,70.0,no,yes,2.0,1.0,no,no,3.0,no,1.0,0.0,Sometimes,Public_Transportation,Overweight_Level_I
179,Male,21,1.62,70.0,no,yes,2.0,1.0,no,no,3.0,no,1.0,0.0,Sometimes,Public_Transportation,Overweight_Level_I
184,Male,21,1.62,70.0,no,yes,2.0,1.0,no,no,3.0,no,1.0,0.0,Sometimes,Public_Transportation,Overweight_Level_I
209,Female,22,1.69,65.0,yes,yes,2.0,3.0,Sometimes,no,2.0,no,1.0,1.0,Sometimes,Public_Transportation,Normal_Weight
309,Female,16,1.66,58.0,no,no,2.0,1.0,Sometimes,no,1.0,no,0.0,1.0,no,Walking,Normal_Weight
460,Female,18,1.62,55.0,yes,yes,2.0,3.0,Frequently,no,1.0,no,1.0,1.0,no,Public_Transportation,Normal_Weight
467,Male,22,1.74,75.0,yes,yes,3.0,3.0,Frequently,no,1.0,no,1.0,0.0,no,Automobile,Normal_Weight
496,Male,18,1.72,53.0,yes,yes,2.0,3.0,Sometimes,no,2.0,no,0.0,2.0,Sometimes,Public_Transportation,Insufficient_Weight


In [50]:
# Exemplo de linhas com dados duplicados
junto[(junto['Age'] == 21) & (junto['Height'] == 1.52) & (junto['Weight'] == 42.0)]

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
97,Female,21,1.52,42.0,no,no,3.0,1.0,Frequently,no,1.0,no,0.0,0.0,Sometimes,Public_Transportation,Insufficient_Weight
98,Female,21,1.52,42.0,no,no,3.0,1.0,Frequently,no,1.0,no,0.0,0.0,Sometimes,Public_Transportation,Insufficient_Weight
523,Female,21,1.52,42.0,no,yes,3.0,1.0,Frequently,no,1.0,no,0.0,0.0,Sometimes,Public_Transportation,Insufficient_Weight
527,Female,21,1.52,42.0,no,yes,3.0,1.0,Frequently,no,1.0,no,0.0,0.0,Sometimes,Public_Transportation,Insufficient_Weight
659,Female,21,1.52,42.0,no,yes,3.0,1.0,Frequently,no,1.0,no,0.0,0.0,Sometimes,Public_Transportation,Insufficient_Weight
663,Female,21,1.52,42.0,no,yes,3.0,1.0,Frequently,no,1.0,no,0.0,0.0,Sometimes,Public_Transportation,Insufficient_Weight


In [51]:
# Elimina as linhas com dados duplicados
junto = junto.drop_duplicates()

In [52]:
# Usa o Label Encoder para transformar rótulos categóricos em números.
le = LabelEncoder()
junto['Gender'] = le.fit_transform(junto['Gender'])
junto['family_history_with_overweight'] = le.fit_transform(junto['family_history_with_overweight'])
junto['FAVC'] = le.fit_transform(junto['FAVC'])
junto['SMOKE'] = le.fit_transform(junto['SMOKE'])
junto['SCC'] = le.fit_transform(junto['SCC'])
junto['NObeyesdad'] = le.fit_transform(junto['NObeyesdad'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  junto['Gender'] = le.fit_transform(junto['Gender'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  junto['family_history_with_overweight'] = le.fit_transform(junto['family_history_with_overweight'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  junto['FAVC'] = le.fit_transform(junto['FAVC'])
A val

In [53]:
# Transforma os dados categóricos progressivos em valores
junto['CAEC'] = junto['CAEC'].replace({'no': 0, 'Sometimes': 1, 'Frequently': 2,'Always': 3})
junto['CALC'] = junto['CALC'].replace({'no': 0, 'Sometimes': 1, 'Frequently': 2,'Always': 3})
junto['MTRANS'] = junto['MTRANS'].replace({'Bike': 0, 'Walking': 1, 'Public_Transportation': 2, 'Motorbike': 3, 'Automobile': 4})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  junto['CAEC'] = junto['CAEC'].replace({'no': 0, 'Sometimes': 1, 'Frequently': 2,'Always': 3})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  junto['CALC'] = junto['CALC'].replace({'no': 0, 'Sometimes': 1, 'Frequently': 2,'Always': 3})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  junto['MTRANS'] =

# Modelagem e Desenvolvimento

In [54]:
# Realiza separação do dados de treino e teste
X = junto.drop('NObeyesdad', axis=1)
y = junto['NObeyesdad']

In [55]:
# Realiza o treinamento e teste dos dados
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [56]:
# Aprendizado de máquina utilizado para realizar predições
xgb = GradientBoostingClassifier()


In [57]:
# Após testes a tecnica de Gradient Boosting Classifier se mostrou o melhor em prever a estimativa
xgb.fit(X_train, y_train)
pred = xgb.predict(X_test)
score = accuracy_score(pred, y_test)
print('Gradient Boosting Classifier', ": {}%".format(round(score*100, 2)))
report = classification_report(pred, y_test)
print(report)

Gradient Boosting Classifier : 95.45%
              precision    recall  f1-score   support

           0       0.92      0.95      0.93        57
           1       0.87      0.87      0.87        61
           2       0.99      0.99      0.99        70
           3       0.98      1.00      0.99        63
           4       1.00      1.00      1.00        60
           5       0.93      0.91      0.92        56
           6       1.00      0.96      0.98        51

    accuracy                           0.95       418
   macro avg       0.95      0.95      0.95       418
weighted avg       0.95      0.95      0.95       418



In [58]:
# Cria função 'machine' que facilita na repetição de parte do código
def machine(nome, modelo):
  modelo.fit(X_train, y_train)
  pred = modelo.predict(X_test)
  score = accuracy_score(pred, y_test)
  print(f'{nome} {round(score*100, 2)}')
  report = classification_report(pred, y_test)
  print(report)

log = LogisticRegression()
dtr = DecisionTreeClassifier()
rfc = RandomForestClassifier()
svm = SVC(C=0.9)
knn = KNeighborsClassifier(n_neighbors=1)
red = MLPClassifier(hidden_layer_sizes=(100, 50, 20),activation='logistic')
mnb = MultinomialNB()
gbm = GradientBoostingClassifier()

In [59]:
# Celulas a baixo colocadas em ordem de maior porcentagem de acerto na estimetiva

In [60]:
# Novamente o Gradient Boosting Classifier
machine('Gradient Boosting Machines (GBM)', gbm)

Gradient Boosting Machines (GBM) 95.93
              precision    recall  f1-score   support

           0       0.92      0.95      0.93        57
           1       0.87      0.88      0.88        60
           2       1.00      0.99      0.99        71
           3       0.98      1.00      0.99        63
           4       1.00      1.00      1.00        60
           5       0.95      0.91      0.93        57
           6       1.00      0.98      0.99        50

    accuracy                           0.96       418
   macro avg       0.96      0.96      0.96       418
weighted avg       0.96      0.96      0.96       418



In [61]:
# Floresta Aleatória
machine('Random Forests',rfc)

Random Forests 95.45
              precision    recall  f1-score   support

           0       0.95      0.98      0.97        57
           1       0.93      0.86      0.90        66
           2       0.94      0.99      0.96        67
           3       1.00      1.00      1.00        64
           4       1.00      1.00      1.00        60
           5       0.89      0.92      0.91        53
           6       0.96      0.92      0.94        51

    accuracy                           0.95       418
   macro avg       0.95      0.95      0.95       418
weighted avg       0.95      0.95      0.95       418



In [62]:
# Árvores de Decisão
machine('Árvores de Decisão', dtr)

Árvores de Decisão 91.63
              precision    recall  f1-score   support

           0       0.93      0.87      0.90        63
           1       0.69      0.84      0.76        50
           2       0.96      0.94      0.95        71
           3       0.98      1.00      0.99        63
           4       1.00      0.98      0.99        61
           5       0.89      0.82      0.85        60
           6       0.96      0.94      0.95        50

    accuracy                           0.92       418
   macro avg       0.92      0.91      0.91       418
weighted avg       0.92      0.92      0.92       418



In [63]:
# K — Nearest Neighbors (K-vizinhos mais próximos)
machine('KNN (K-Nearest Neighbors)', knn)

KNN (K-Nearest Neighbors) 89.71
              precision    recall  f1-score   support

           0       0.98      0.84      0.91        69
           1       0.52      0.91      0.67        35
           2       0.93      0.98      0.96        66
           3       1.00      1.00      1.00        64
           4       1.00      0.98      0.99        61
           5       0.89      0.77      0.82        64
           6       0.96      0.80      0.87        59

    accuracy                           0.90       418
   macro avg       0.90      0.90      0.89       418
weighted avg       0.92      0.90      0.90       418



In [64]:
# Rede neural com Perceptron Multicamadas
machine('Rede neural',red)

Rede neural 84.45
              precision    recall  f1-score   support

           0       0.98      0.82      0.89        71
           1       0.72      0.86      0.79        51
           2       0.86      0.91      0.88        66
           3       1.00      0.98      0.99        65
           4       1.00      1.00      1.00        60
           5       0.64      0.70      0.67        50
           6       0.65      0.58      0.62        55

    accuracy                           0.84       418
   macro avg       0.84      0.84      0.83       418
weighted avg       0.85      0.84      0.85       418





In [65]:
# Regressão Logística
machine('Regressão Logística',log)

Regressão Logística 66.51
              precision    recall  f1-score   support

           0       0.81      0.81      0.81        59
           1       0.46      0.55      0.50        51
           2       0.59      0.56      0.57        73
           3       0.81      0.93      0.87        56
           4       1.00      0.90      0.94        67
           5       0.42      0.47      0.44        49
           6       0.53      0.41      0.46        63

    accuracy                           0.67       418
   macro avg       0.66      0.66      0.66       418
weighted avg       0.67      0.67      0.67       418



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [66]:
# Naive Bayes (tabela de probabilidades)
machine('Multinomial Naive Bayes', mnb)

Multinomial Naive Bayes 60.05
              precision    recall  f1-score   support

           0       0.86      0.80      0.83        64
           1       0.44      0.69      0.54        39
           2       0.40      0.34      0.37        83
           3       0.70      0.73      0.71        62
           4       1.00      0.88      0.94        68
           5       0.31      0.53      0.39        32
           6       0.47      0.33      0.39        70

    accuracy                           0.60       418
   macro avg       0.60      0.61      0.59       418
weighted avg       0.62      0.60      0.60       418



In [67]:
# Máquinas de vetores de suporte
machine('SVM (Support Vector Machines)', svm)

SVM (Support Vector Machines) 51.67
              precision    recall  f1-score   support

           0       0.92      0.66      0.77        82
           1       0.30      0.39      0.34        46
           2       0.37      0.46      0.41        57
           3       0.64      0.64      0.64        64
           4       0.45      0.53      0.49        51
           5       0.33      0.55      0.41        33
           6       0.65      0.38      0.48        85

    accuracy                           0.52       418
   macro avg       0.52      0.51      0.50       418
weighted avg       0.57      0.52      0.53       418

