# Pipeline usando Scikit-Learn

Usar as mesma mudanças feitas da atividades de modo manual, no **01_0E**, mas usando o Scikit-Learn.

In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer


df = pd.read_csv("https://raw.githubusercontent.com/She-Codes-Now/Intro-to-Data-Science-with-R/master/food_coded.csv")

df.info()
df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125 entries, 0 to 124
Data columns (total 61 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   GPA                           123 non-null    object 
 1   Gender                        125 non-null    int64  
 2   breakfast                     125 non-null    int64  
 3   calories_chicken              125 non-null    int64  
 4   calories_day                  106 non-null    float64
 5   calories_scone                124 non-null    float64
 6   coffee                        125 non-null    int64  
 7   comfort_food                  124 non-null    object 
 8   comfort_food_reasons          123 non-null    object 
 9   comfort_food_reasons_coded    106 non-null    float64
 10  cook                          122 non-null    float64
 11  comfort_food_reasons_coded.1  125 non-null    int64  
 12  cuisine                       108 non-null    float64
 13  diet_

Unnamed: 0,GPA,Gender,breakfast,calories_chicken,calories_day,calories_scone,coffee,comfort_food,comfort_food_reasons,comfort_food_reasons_coded,...,soup,sports,thai_food,tortilla_calories,turkey_calories,type_sports,veggies_day,vitamins,waffle_calories,weight
0,2.4,2,1,430,,315.0,1,none,we dont have comfort,9.0,...,1.0,1.0,1,1165.0,345,car racing,5,1,1315,187
1,3.654,1,1,610,3.0,420.0,2,"chocolate, chips, ice cream","Stress, bored, anger",1.0,...,1.0,1.0,2,725.0,690,Basketball,4,2,900,155
2,3.3,1,1,720,4.0,420.0,2,"frozen yogurt, pizza, fast food","stress, sadness",1.0,...,1.0,2.0,5,1165.0,500,none,5,1,900,I'm not answering this.
3,3.2,1,1,430,3.0,420.0,2,"Pizza, Mac and cheese, ice cream",Boredom,2.0,...,1.0,2.0,5,725.0,690,,3,1,1315,"Not sure, 240"
4,3.5,1,1,720,2.0,420.0,2,"Ice cream, chocolate, chips","Stress, boredom, cravings",1.0,...,1.0,1.0,4,940.0,500,Softball,4,2,760,190
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
120,3.5,1,1,610,4.0,420.0,2,"wine. mac and cheese, pizza, ice cream",boredom and sadness,,...,1.0,1.0,5,940.0,500,Softball,5,1,1315,156
121,3,1,1,265,2.0,315.0,2,Pizza / Wings / Cheesecake,Loneliness / Homesick / Sadness,,...,1.0,,4,940.0,500,basketball,5,2,1315,180
122,3.882,1,1,720,,420.0,1,"rice, potato, seaweed soup",sadness,,...,1.0,2.0,5,580.0,690,none,4,2,1315,120
123,3,2,1,720,4.0,420.0,1,"Mac n Cheese, Lasagna, Pizza","happiness, they are some of my favorite foods",,...,2.0,2.0,1,940.0,500,,3,1,1315,135


## Detectar os dados faltantes
Aqui foi verificada a quantidade de dados faltantes em cada coluna, afim de servir de comparação após o us da biblioteca.

In [2]:
missing_values = df.isnull().sum()
print(missing_values[missing_values > 0])

GPA                            2
calories_day                  19
calories_scone                 1
comfort_food                   1
comfort_food_reasons           2
comfort_food_reasons_coded    19
cook                           3
cuisine                       17
diet_current                   1
drink                          2
eating_changes                 3
employment                     9
exercise                      13
father_education               1
father_profession              3
fav_cuisine                    2
fav_food                       2
food_childhood                 1
healthy_meal                   1
ideal_diet                     1
income                         1
life_rewarding                 1
marital_status                 1
meals_dinner_friend            3
mother_education               3
mother_profession              2
on_off_campus                  1
persian_food                   1
self_perception_weight         1
soup                           1
sports    

## Tratar os dados faltantes

Usar o **SimpleImputer** para preencher os valores faltantes e inconsistências, com a estratégia *mean* permite preencher valores ausentes mantendo a distribuição original dos dados 

In [3]:
# Imputar, trocando valores faltantes pela média
imputer = SimpleImputer(strategy='mean')
df['calories_day'] = imputer.fit_transform(df[['calories_day']])
# Retirar as incosistências
df['calories_day'] = np.where(df['calories_day'] < 0, df['calories_day'].mean(), df['calories_day'])


Mas nesse ritmo pode-se demorar muito, então vamos para o pipeline com todas as colunas, aplicando tanto para as colunas numéricas e categóricas.

In [4]:
numeric_features = [
    'calories_day', 'calories_scone', 'comfort_food_reasons_coded', 'cook', 
    'cuisine', 'drink', 'employment', 'exercise', 'father_education', 
    'fav_food', 'income', 'life_rewarding', 'marital_status', 'mother_education', 
    'on_off_campus', 'persian_food', 'self_perception_weight', 'soup', 
    'sports', 'tortilla_calories', 'veggies_day', 'waffle_calories'
]

categorical_features = [
    'Gender', 'breakfast', 'calories_chicken', 'coffee', 'comfort_food', 
    'comfort_food_reasons', 'diet_current', 'diet_current_coded', 'eating_changes', 
    'eating_changes_coded', 'eating_changes_coded1', 'eating_out', 'ethnic_food', 
    'father_profession', 'fav_cuisine', 'fav_cuisine_coded', 'food_childhood', 
    'fries', 'fruit_day', 'grade_level', 'greek_food', 'healthy_feeling', 
    'healthy_meal', 'ideal_diet', 'ideal_diet_coded', 'indian_food', 'italian_food', 
    'meals_dinner_friend', 'mother_profession', 'nutritional_check', 'parents_cook', 
    'pay_meal_out', 'soup', 'sports', 'thai_food', 'type_sports', 'veggies_day', 
    'vitamins', 'weight', "GPA"
]

Nesse caso, usamos o **SimpleImputer** para imputar os dados que estão faltando, para as colunas numéricas trocar os dados faltantes pela média (*mean*) e para os dados categóricos pela moda (*most_frequent*).

Em uma parte da célula, foi necessário um tratamento diferente para as colunas *weight* e o *GPA* além de outras mudanças como a tranformação de arrendondamento das casas decimais. A troca pelas métricas escolhidas para os valors faltantes foi na mesma ideia que a limpeza manual feita apenas com o pandas e numpy.


In [5]:
# Imputar valores faltantes nas colunas numéricas
imputer_num = SimpleImputer(strategy='mean')
df[numeric_features] = imputer_num.fit_transform(df[numeric_features].apply(pd.to_numeric, errors='coerce'))

# Imputar valores faltantes nas colunas categóricas
imputer_cat = SimpleImputer(strategy='most_frequent')
df[categorical_features] = imputer_cat.fit_transform(df[categorical_features])

# Tratar a coluna 'weight'
df['weight'] = df['weight'].str.extract(r'(\d+)').astype(float)
media_weight = df['weight'].mean()
media_weight_rounded = round(media_weight, 0)
df['weight'] = df['weight'].fillna(media_weight_rounded)

# Tratar a coluna 'GPA'
df['GPA'] = df['GPA'].astype(str).str.extract(r'(\d+)').astype(float)
media_gpa = df['GPA'].mean()
df['GPA'] = df['GPA'].fillna(media_gpa)

# Arredondar colunas específicas para 1 casa decimal
df['cuisine'] = df['cuisine'].round(1)
df['GPA'] = df['GPA'].round(1)
df['calories_day'] = df['calories_day'].round(1)
df['drink'] = df['drink'].round(1)
df['comfort_food_reasons_coded'] = df['comfort_food_reasons_coded'].round(1)
df['sports'] = df['sports'].round(1)

# Converter a coluna 'cook' para int
df['cook'] = df['cook'].astype(int)


Verificação da contagem de faltantes após a limpeza.

In [6]:
# Verificando a quantidade de valores faltantes após o preprocessamento
missing_values = df.isnull().sum()
print(missing_values[missing_values > 0])

Series([], dtype: int64)


# Dataset Final

In [7]:
# Salvar os dados transformados
df.to_csv(r'C:\\Users\\elyss\\OneDrive\\Área de Trabalho\\Mineracao_de_Dados\\Dados\\food_coded_limpeza_scikit.csv', index=False)
df

Unnamed: 0,GPA,Gender,breakfast,calories_chicken,calories_day,calories_scone,coffee,comfort_food,comfort_food_reasons,comfort_food_reasons_coded,...,soup,sports,thai_food,tortilla_calories,turkey_calories,type_sports,veggies_day,vitamins,waffle_calories,weight
0,2.0,2,1,430,3.0,315.0,1,none,we dont have comfort,9.0,...,1.0,1.0,1,1165.0,345,car racing,5.0,1,1315.0,187.0
1,3.0,1,1,610,3.0,420.0,2,"chocolate, chips, ice cream","Stress, bored, anger",1.0,...,1.0,1.0,2,725.0,690,Basketball,4.0,2,900.0,155.0
2,3.0,1,1,720,4.0,420.0,2,"frozen yogurt, pizza, fast food","stress, sadness",1.0,...,1.0,2.0,5,1165.0,500,none,5.0,1,900.0,159.0
3,3.0,1,1,430,3.0,420.0,2,"Pizza, Mac and cheese, ice cream",Boredom,2.0,...,1.0,2.0,5,725.0,690,none,3.0,1,1315.0,240.0
4,3.0,1,1,720,2.0,420.0,2,"Ice cream, chocolate, chips","Stress, boredom, cravings",1.0,...,1.0,1.0,4,940.0,500,Softball,4.0,2,760.0,190.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
120,3.0,1,1,610,4.0,420.0,2,"wine. mac and cheese, pizza, ice cream",boredom and sadness,2.7,...,1.0,1.0,5,940.0,500,Softball,5.0,1,1315.0,156.0
121,3.0,1,1,265,2.0,315.0,2,Pizza / Wings / Cheesecake,Loneliness / Homesick / Sadness,2.7,...,1.0,1.390244,4,940.0,500,basketball,5.0,2,1315.0,180.0
122,3.0,1,1,720,3.0,420.0,1,"rice, potato, seaweed soup",sadness,2.7,...,1.0,2.0,5,580.0,690,none,4.0,2,1315.0,120.0
123,3.0,2,1,720,4.0,420.0,1,"Mac n Cheese, Lasagna, Pizza","happiness, they are some of my favorite foods",2.7,...,2.0,2.0,1,940.0,500,none,3.0,1,1315.0,135.0
