In [None]:
# Importando as bibliotecas para os modelos
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# One Hot Encoding

## Tarefa #1: Recebendo os dados

In [None]:
# Load the used Fiat 500 listings (comma-separated CSV) and preview the first rows

carros = pd.read_csv('Used_fiat_500_in_Italy_dataset.csv', sep = ',')
carros.head()

Unnamed: 0,model,engine_power,transmission,age_in_days,km,previous_owners,lat,lon,price
0,pop,69,manual,4474,56779,2,45.071079,7.46403,4490
1,lounge,69,manual,2708,160000,1,45.069679,7.70492,4500
2,lounge,69,automatic,3470,170000,2,45.514599,9.28434,4500
3,sport,69,manual,3288,132000,2,41.903221,12.49565,4700
4,sport,69,manual,3712,124490,2,45.532661,9.03892,4790


## Tarefa #2: Corrigindo os dados

In [None]:
# Summarize dtypes and non-null counts to spot the text (object) columns that need encoding
carros.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 380 entries, 0 to 379
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   model            380 non-null    object 
 1   engine_power     380 non-null    int64  
 2   transmission     380 non-null    object 
 3   age_in_days      380 non-null    int64  
 4   km               380 non-null    int64  
 5   previous_owners  380 non-null    int64  
 6   lat              380 non-null    float64
 7   lon              380 non-null    float64
 8   price            380 non-null    int64  
dtypes: float64(2), int64(5), object(2)
memory usage: 26.8+ KB


In [None]:
# Show the (rows, columns) dimensions of the dataset
carros.shape

(380, 9)

Vamos explorar as colunas que são do tipo `object` para aplicarmos o *One Hot Encoding* ou o *Label Encoding*:

In [None]:
# List the distinct values found in the 'model' column
carros['model'].unique()

array(['pop', 'lounge', 'sport', 'star'], dtype=object)

In [None]:
# List the distinct values found in the 'transmission' column
carros['transmission'].unique()

array(['manual', 'automatic'], dtype=object)

As colunas *model* e *transmission* contêm textos, e precisamos convertê-las em valores numéricos.

Vamos agora transformar a coluna de transmissão, que possui apenas dois valores possíveis. Para isso, vamos usar o comando `replace`: se o carro for manual, o valor será substituído por 0; se for automático, por 1:

In [None]:
# Encode the binary 'transmission' column as integers: manual -> 0, automatic -> 1.
# The result is assigned back instead of calling replace(..., inplace=True) on the
# selected column: that chained in-place form emits a FutureWarning on recent pandas
# and no longer updates `carros` once Copy-on-Write is enabled.
# The explicit astype pins an integer dtype regardless of how replace handles
# object columns, and the cell stays safe to re-run on already-encoded data.
carros['transmission'] = (
    carros['transmission']
    .replace({'manual': 0, 'automatic': 1})
    .astype('int64')
)

In [None]:
# Check which distinct values 'transmission' holds after the replacement
carros['transmission'].unique()

array([0, 1])

Vamos aplicar o *One Hot Encoding* na coluna *model* para transformar os textos em colunas:

In [None]:
# One-hot encode 'model': one indicator column per distinct value, named 'modelo_<value>'.
# dtype=int forces 0/1 integer indicators; pandas >= 2.0 returns booleans by default.
modelos = pd.get_dummies(carros["model"], prefix="modelo", dtype=int)

In [None]:
# Preview the generated indicator columns
modelos.head()

Unnamed: 0,modelo_lounge,modelo_pop,modelo_sport,modelo_star
0,0,1,0,0
1,1,0,0,0
2,1,0,0,0
3,0,0,1,0
4,0,0,1,0


Criamos, dessa forma, 4 novas colunas binárias que indicam o modelo do veículo. Vamos agora criar um novo `DataFrame` unindo os `DataFrames` `carros` e `modelos`:

In [None]:
# Place the indicator columns next to the original columns, aligned on the row index
carros_corrigidos = pd.concat(objs=[carros, modelos], axis='columns')

In [None]:
# Display the first rows of the new DataFrame
carros_corrigidos.head()

Unnamed: 0,model,engine_power,transmission,age_in_days,km,previous_owners,lat,lon,price,modelo_lounge,modelo_pop,modelo_sport,modelo_star
0,pop,69,0,4474,56779,2,45.071079,7.46403,4490,0,1,0,0
1,lounge,69,0,2708,160000,1,45.069679,7.70492,4500,1,0,0,0
2,lounge,69,1,3470,170000,2,45.514599,9.28434,4500,1,0,0,0
3,sport,69,0,3288,132000,2,41.903221,12.49565,4700,0,0,1,0
4,sport,69,0,3712,124490,2,45.532661,9.03892,4790,0,0,1,0


Pensando em um modelo de *Machine Learning*, a coluna *model* pode ser excluída: ela contém texto e sua informação já está representada nas colunas binárias `modelo_*`, portanto não seria usada para treinar o modelo.

In [None]:
# Remove the original text column 'model' from the combined frame.
# Re-assigning (instead of inplace=True) together with errors='ignore' keeps the
# cell re-runnable: a second execution no longer raises KeyError for the
# already-removed column.
carros_corrigidos = carros_corrigidos.drop(columns=['model'], errors='ignore')

In [None]:
# Preview the frame after dropping the 'model' column
carros_corrigidos.head()

Unnamed: 0,engine_power,transmission,age_in_days,km,previous_owners,lat,lon,price,modelo_lounge,modelo_pop,modelo_sport,modelo_star
0,69,0,4474,56779,2,45.071079,7.46403,4490,0,1,0,0
1,69,0,2708,160000,1,45.069679,7.70492,4500,1,0,0,0
2,69,1,3470,170000,2,45.514599,9.28434,4500,1,0,0,0
3,69,0,3288,132000,2,41.903221,12.49565,4700,0,0,1,0
4,69,0,3712,124490,2,45.532661,9.03892,4790,0,0,1,0


# Label Encoding

## Tarefa #1: Recebendo os dados

In [None]:
# Load the Titanic passenger data (semicolon-separated CSV) and preview the first rows
# NOTE(review): the preview shows Age/Fare values such as 345.0 and 78292.0, which look
# like they lost their decimal separator — confirm against the CSV before modelling.
titanic = pd.read_csv('titanic.csv', sep = ';')
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,Dead,Third Class,"Kelly, Mr. James",male,345.0,0,0,330911,78292.0,,Q
1,893,Alive,Third Class,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,Dead,Second Class,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,96875.0,,Q
3,895,Dead,Third Class,"Wirz, Mr. Albert",male,27.0,0,0,315154,86625.0,,S
4,896,Alive,Third Class,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,122875.0,,S


## Tarefa #2: Corrigindo os dados

In [None]:
# Summarize dtypes and non-null counts to find the text (object) columns and missing values
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Survived     418 non-null    object 
 2   Pclass       418 non-null    int64  
 3   Name         418 non-null    object 
 4   Sex          418 non-null    object 
 5   Age          332 non-null    float64
 6   SibSp        418 non-null    int64  
 7   Parch        418 non-null    int64  
 8   Ticket       418 non-null    object 
 9   Fare         417 non-null    float64
 10  Cabin        91 non-null     object 
 11  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(6)
memory usage: 39.3+ KB


Vamos explorar as colunas que são do tipo `object` para aplicarmos *Label Encoding*:

In [None]:
# List the distinct values found in the 'Survived' column
titanic['Survived'].unique()

array(['Dead', 'Alive'], dtype=object)

In [None]:
# List the distinct values found in the 'Pclass' column
titanic['Pclass'].unique()

array([3, 2, 1])

Vamos agora aplicar o Label Encoding na coluna Pclass:

In [None]:
# Label-encode 'Pclass' as ordinal integers: First Class -> 1, Second Class -> 2,
# Third Class -> 3.
# The result is assigned back instead of calling replace(..., inplace=True) on the
# selected column: that chained in-place form emits a FutureWarning on recent pandas
# and no longer updates `titanic` once Copy-on-Write is enabled.
# The explicit astype pins an integer dtype, and the cell stays safe to re-run when
# the column already holds the numeric codes.
titanic['Pclass'] = (
    titanic['Pclass']
    .replace({'First Class': 1, 'Second Class': 2, 'Third Class': 3})
    .astype('int64')
)

In [None]:
# Check which distinct values 'Pclass' holds after the encoding step
titanic['Pclass'].unique()

array([3, 2, 1])

In [None]:
# Preview the first rows of the Titanic frame after encoding 'Pclass'
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,Dead,3,"Kelly, Mr. James",male,345.0,0,0,330911,78292.0,,Q
1,893,Alive,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,Dead,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,96875.0,,Q
3,895,Dead,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,86625.0,,S
4,896,Alive,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,122875.0,,S
