# Variáveis Categóricas e Separação em Teste e Treino

## Imports

In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

## Carregando Base de Dados

In [2]:
!git clone https://github.com/Crissky/MLUD.git

fatal: destination path 'MLUD' already exists and is not an empty directory.


In [3]:
dataset = pd.read_csv('MLUD/Aula 02/admission.csv', delimiter=';')

## Visualizando os Dados

In [4]:
dataset.head()

Unnamed: 0,Name,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Approval
0,Lucas,337,118,4,4.5,4.5,9.65,1,1
1,Ana,324,107,4,4.0,4.5,8.87,1,1
2,Jose,316,104,3,3.0,3.5,8.0,1,1
3,Carlos,322,110,3,3.5,2.5,8.67,1,1
4,Zileide,314,103,2,2.0,3.0,8.21,0,0


In [5]:
X = dataset.iloc[:,:-1].values        #Pegando somente as variáveis independentes

print(X)

[['Lucas' 337 118 4 4.5 4.5 9.65 1]
 ['Ana' 324 107 4 4.0 4.5 8.87 1]
 ['Jose' 316 104 3 3.0 3.5 8.0 1]
 ['Carlos' 322 110 3 3.5 2.5 8.67 1]
 ['Zileide' 314 103 2 2.0 3.0 8.21 0]
 ['Joana' 330 115 5 4.5 3.0 9.34 1]
 ['Davi' 321 109 3 3.0 4.0 8.2 1]
 ['Daniel' 308 101 2 3.0 4.0 7.9 0]
 ['Marcelo' 302 102 1 2.0 1.5 8.0 0]]


In [6]:
y = dataset.iloc[:,-1].values        #Pegando somente as variáveis dependentes

print(y)

[1 1 1 1 0 1 1 0 0]


## Transformando os Nomes em Números

In [7]:
labelencoder_X = LabelEncoder()
X[:,0] = labelencoder_X.fit_transform(X[:,0])

print(X)

[[6 337 118 4 4.5 4.5 9.65 1]
 [0 324 107 4 4.0 4.5 8.87 1]
 [5 316 104 3 3.0 3.5 8.0 1]
 [1 322 110 3 3.5 2.5 8.67 1]
 [8 314 103 2 2.0 3.0 8.21 0]
 [4 330 115 5 4.5 3.0 9.34 1]
 [3 321 109 3 3.0 4.0 8.2 1]
 [2 308 101 2 3.0 4.0 7.9 0]
 [7 302 102 1 2.0 1.5 8.0 0]]


### Transformando os Labels Numéricos para One-Hot Encoding

One-Hot Encoding: uma lista de zeros e uns para que os Labels (Numéricos) não influenciem o treinamento.

In [8]:
D = pd.get_dummies(X[:,0])        # Criando o One-Hot Encoding com o Pandas

D

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,0,0,0,0,0,0,1,0,0
1,1,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,0,0
3,0,1,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,1
5,0,0,0,0,1,0,0,0,0
6,0,0,0,1,0,0,0,0,0
7,0,0,1,0,0,0,0,0,0
8,0,0,0,0,0,0,0,1,0


In [9]:
X = X[:, 1:]                                # Retirando os Labels Numéricos
X = np.insert(X, 0, D.values, axis=1)       # Inserindo o One-Hot Encoding em X

X

array([[0, 1, 0, 0, 0, 0, 0, 0, 0, 337, 118, 4, 4.5, 4.5, 9.65, 1],
       [0, 0, 0, 1, 0, 0, 0, 0, 0, 324, 107, 4, 4.0, 4.5, 8.87, 1],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 316, 104, 3, 3.0, 3.5, 8.0, 1],
       [0, 0, 0, 0, 0, 0, 1, 0, 0, 322, 110, 3, 3.5, 2.5, 8.67, 1],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 314, 103, 2, 2.0, 3.0, 8.21, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 330, 115, 5, 4.5, 3.0, 9.34, 1],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 321, 109, 3, 3.0, 4.0, 8.2, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 308, 101, 2, 3.0, 4.0, 7.9, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 302, 102, 1, 2.0, 1.5, 8.0, 0]],
      dtype=object)

## Dividindo a Base de Dados em Treino e Teste

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [11]:
print(X_train)

[[0 0 0 1 0 0 0 0 0 324 107 4 4.0 4.5 8.87 1]
 [1 0 0 0 0 0 0 0 0 321 109 3 3.0 4.0 8.2 1]
 [0 0 0 0 0 0 0 1 0 316 104 3 3.0 3.5 8.0 1]
 [0 1 0 0 0 0 0 0 0 337 118 4 4.5 4.5 9.65 1]
 [0 0 0 0 0 0 1 0 0 322 110 3 3.5 2.5 8.67 1]
 [0 0 1 0 0 0 0 0 0 330 115 5 4.5 3.0 9.34 1]
 [0 0 0 0 1 0 0 0 0 302 102 1 2.0 1.5 8.0 0]]


In [12]:
print(y_train)

[1 1 1 1 1 1 0]
