# LabelEncoder

In [1]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np

In [2]:
# Fit a fresh encoder on a list of integer labels (7 appears twice).
le = LabelEncoder()
le.fit([1,7,7,6])

LabelEncoder()

In [3]:
# Classes held by the fitted encoder (the output below shows them sorted and de-duplicated).
le.classes_

array([1, 6, 7])

In [4]:
# Encode labels as integer codes; per the outputs, each code is the label's position in classes_.
le.transform([1,1,7,6])

array([0, 0, 2, 1], dtype=int64)

In [5]:
# Map integer codes back to the original labels.
le.inverse_transform([0,0,1,2])

array([1, 1, 6, 7])

### Usando texto

In [6]:
# Start over with a new encoder, this time fitted on string labels ('Paris' is repeated).
le = LabelEncoder()
le.fit(['Tokio', 'Paris', 'Berlin', 'Paris', 'Amsterdam'])

LabelEncoder()

In [7]:
# Convert the classes array to a plain list for a more compact display.
list(le.classes_)

['Amsterdam', 'Berlin', 'Paris', 'Tokio']

In [8]:
# Encode city names using the encoder fitted on text labels.
le.transform(['Tokio', 'Tokio', 'Amsterdam', 'Paris', 'Berlin'])

array([3, 3, 0, 2, 1], dtype=int64)

### Ejemplo 

In [9]:
# Load the 'LabelEncoder' sheet of the example workbook (path is relative to the notebook's
# working directory) and preview the first rows.
df = pd.read_excel('./../../dataset/xlsx/spreadsheet1.xlsx', sheet_name='LabelEncoder')
df.head()

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


In [10]:
# LabelEncoder expects a 1-D array-like of labels. Wrapping the Series in a list
# produced a 2-D input of shape (1, n_rows) and raised "bad input shape", so the
# column is passed directly.
le.fit(df['Country'])

LabelEncoder()

In [None]:
# Toy dataset: one row per (patient, observation) pair with a categorical score.
raw_data = dict(
    patient=[1, 1, 1, 2, 2],
    obs=[1, 2, 3, 1, 2],
    treatment=[0, 1, 0, 1, 0],
    score=['strong', 'weak', 'normal', 'weak', 'strong'],
)
# Column order is taken from the dict's insertion order.
df = pd.DataFrame(raw_data, columns=list(raw_data))

In [None]:
# Re-fit the existing encoder on the 'score' column (passed as a 1-D Series).
le.fit(df['score'])

In [None]:
# Inspect the classes the encoder holds after fitting on 'score'.
list(le.classes_)

In [None]:
# Store the integer codes in a separate column instead of overwriting 'score':
# the original labels stay available, and re-running this cell no longer fails
# on already-encoded values that the encoder never saw during fit.
df['score_encoded'] = le.transform(df['score'])

In [None]:
# Display the frame after encoding.
df

### OneHotEncoder

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Fit a one-hot encoder on two features per sample: a gender string and an integer.
# handle_unknown='ignore' is set so categories unseen at fit time do not raise at transform time.
encoder = OneHotEncoder(handle_unknown='ignore')
X = [['Male', 1], ['Female', 3], ['Female', 2]]
encoder.fit(X)

In [None]:
# Categories learned by the encoder during fit.
encoder.categories_

In [None]:
# Encode two new samples and densify the result for display.
# The value 4 was not present in the fit data; handle_unknown='ignore' applies to it.
encoder.transform([['Female', 1], ['Male', 4]]).toarray()

In [None]:
# Decode one-hot rows back into the original category values.
# NOTE(review): the second row has no active column for the first feature — presumably decoded as None; verify.
encoder.inverse_transform([[0,1,1,0,0],[0,0,0,1,0]])

In [None]:
# `get_feature_names()` was removed from newer scikit-learn releases in favour of
# `get_feature_names_out()`; call whichever one the installed version provides.
(encoder.get_feature_names_out() if hasattr(encoder, 'get_feature_names_out')
 else encoder.get_feature_names())

In [None]:
# Label-encode the city names first; the integer codes feed the one-hot encoder below.
le = LabelEncoder()
integer_encoded = le.fit_transform(['Barcelona', 'Paris', 'Monaco', 'Barcelona'])
integer_encoded

In [None]:
# sparse=False requests a dense array from the encoder instead of a sparse matrix.
# NOTE(review): newer scikit-learn releases reportedly rename this argument to
# `sparse_output` — confirm against the installed version.
encoder = OneHotEncoder(sparse=False)

In [None]:
# Turn the codes into a single-column 2-D array (one row per label), letting
# NumPy infer the row count.
integer_encoded = integer_encoded.reshape(-1, 1)
integer_encoded

In [None]:
# One-hot encode the integer codes.
# NOTE(review): this rebinds `encoder` from the OneHotEncoder instance to the array
# returned by fit_transform, so the cell cannot be re-run without re-creating the
# encoder first.
encoder = encoder.fit_transform(integer_encoded)

In [None]:
# At this point `encoder` holds the one-hot encoded array, not the encoder object.
encoder

#### La lista de etiquetas es: ['Barcelona', 'Paris', 'Monaco', 'Barcelona']
#### Las categorias son: 'Barcelona', 'Monaco', 'Paris'
array(

       [
       
           [1., 0., 0.], ==> 'Barcelona' = 1 , 0, 0 => para la primera entrada (Barcelona)

           [0., 0., 1.], ==> 'Paris' = 0 , 0, 1 => para la segunda entrada (Paris)

           [0., 1., 0.], ==> 'Monaco' = 0 , 1, 0 => para la tercera entrada (Monaco)

           [1., 0., 0.] ==> 'Barcelona' = 1 , 0, 0 => para la cuarta entrada (Barcelona)
       ]
   ) 

### Imputer

- #### missing_values: Placeholder for the missing values; all occurrences of `missing_values` will be imputed (default=`np.nan`)
- #### strategy: (default="mean") 
    - "mean"
    - "median"
    - "most_frequent"
    - "constant" (uses `fill_value`)
- #### axis: The axis along which to impute (parameter of the legacy `Imputer` only; `SimpleImputer` has no `axis` and always imputes along columns)
    - 0: columns
    - 1: rows
- #### copy: If True a copy of X will be created; if False, imputation will be done in-place
    

In [4]:
from sklearn.impute import SimpleImputer
from numpy import nan

In [5]:
SimpleImputer?

[1;31mInit signature:[0m
[0mSimpleImputer[0m[1;33m([0m[1;33m
[0m    [0mmissing_values[0m[1;33m=[0m[0mnan[0m[1;33m,[0m[1;33m
[0m    [0mstrategy[0m[1;33m=[0m[1;34m'mean'[0m[1;33m,[0m[1;33m
[0m    [0mfill_value[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mverbose[0m[1;33m=[0m[1;36m0[0m[1;33m,[0m[1;33m
[0m    [0mcopy[0m[1;33m=[0m[1;32mTrue[0m[1;33m,[0m[1;33m
[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m     
Imputation transformer for completing missing values.

Read more in the :ref:`User Guide <impute>`.

Parameters
----------
missing_values : number, string, np.nan (default) or None
    The placeholder for the missing values. All occurrences of
    `missing_values` will be imputed.

strategy : string, optional (default="mean")
    The imputation strategy.

    - If "mean", then replace missing values using the mean along
      each column. Can only be used with numeric data.
    - If "median", then replac

In [None]:
# Small numeric matrix with exactly one missing value per column, used to demo imputation.
X = np.array(
    [[1, np.nan, 500],
     [3, 6, np.nan],
     [14, 0, 303],
     [np.nan, 0, 211]]
)
X

In [None]:
# Replace each NaN with the mean of its column. `SimpleImputer` is the class this
# notebook imports from sklearn.impute; the bare name `Imputer` is never imported
# and raised NameError.
imputer = SimpleImputer(strategy='mean')
X2 = imputer.fit_transform(X)

In [None]:
# Display the imputed matrix.
X2

### Pipelines

**_Sequentially apply a list of transforms and a final estimator. Intermediate steps of the pipeline must implement fit and transform methods, and the final estimator only needs to implement fit._**

#### Loan prediction problem

In [None]:
# Load the loan-prediction train and test CSVs (paths are relative to the notebook's working directory).
train = pd.read_csv('./../../practice/dataset/loan_prediction/train.csv')
test = pd.read_csv('./../../practice/dataset/loan_prediction/test.csv')

In [None]:
# Preview the first two rows of the training data.
train.head(2)

In [None]:
# Drop the Loan_ID identifier column, then list the remaining column dtypes.
# errors='ignore' keeps the cell idempotent: re-running it after the column is
# already gone no longer raises KeyError.
train = train.drop(columns='Loan_ID', errors='ignore')
train.dtypes

- **The data contains both categorical and numerical variables, so at a minimum it is necessary to apply one-hot encoding
and some sort of scaler.** 

- **Before building the pipeline, the training data is split into train and test sets so that the performance of the model can be validated**

In [None]:
from sklearn.model_selection import train_test_split

# Separate the feature columns from the target column.
X = train.drop('Loan_Status', axis=1)
y = train['Loan_Status']

# Hold out 20% of the rows for validation. A fixed random_state makes the split,
# and every result computed from it, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

- **The first step in building a pipeline is to define each transformer type.**  
- **The convention is to create transformers for different variable types**

**The numeric transformer applies a SimpleImputer to fill missing values, followed by a StandardScaler**

**The categorical transformer also uses a SimpleImputer, followed by a OneHotEncoder that turns categorical values into binary indicator columns.**

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [None]:
# Numeric columns: fill missing values with the column mean, then standardise.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill missing values with the literal 'missing', then one-hot
# encode; categories unseen during fit are ignored rather than raising.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

**Use the ColumnTransformer to apply the transformations to the correct columns in the dataframe**

In [None]:
from sklearn.compose import ColumnTransformer

# Pick the column names for each transformer by dtype. The target column
# 'Loan_Status' is excluded from the categorical features.
numeric_features = train.select_dtypes(include=['int64', 'float64']).columns
categorical_features = (
    train.select_dtypes(include=['object'])
    .drop(columns=['Loan_Status'])
    .columns
)

# Route each group of columns through its own transformer.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

**Fitting the classifier: the next step is to create a pipeline that combines the preprocessor created above with a classifier**

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Chain the preprocessor and the classifier so that both are fitted and applied
# together. The forest is seeded so repeated runs train the same model.
rf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

**Call the fit method on the raw data: the preprocessing steps are applied first, followed by training the classifier**

In [None]:
# Fit the whole pipeline (preprocessor + classifier) on the raw training split.
rf.fit(X_train, y_train)

**To predict new data it is as simple as calling the predict method and the preprocessing steps will be applied followed by the prediction**

In [None]:
# Predict on the held-out split; the pipeline applies the fitted preprocessing first.
y_pred = rf.predict(X_test)

### Example using make_pipeline

In [None]:
from sklearn.pipeline import make_pipeline

In [None]:
# Build an impute-then-scale pipeline; make_pipeline names the steps automatically.
# `SimpleImputer` is the class this notebook imports from sklearn.impute; the bare
# name `Imputer` is never imported and raised NameError.
pipeline = make_pipeline(
    SimpleImputer(strategy='mean'),
    StandardScaler()
)

In [None]:
# Example matrix with one missing value in each of the first two columns.
X = np.array(
    [
        [np.nan, 0, 98],
        [3, 7, 99],
        [3, 5, 210],
        [4, np.nan, 202],
        [8, 8, 101],
    ]
)
X

In [None]:
# Fit every step of the pipeline on X.
pipeline.fit(X)

In [None]:
# Fit the pipeline again and return the transformed X in one call.
pipeline.fit_transform(X)

### Detección de outliers

In [None]:
import seaborn as sns

In [None]:
# Use the whitegrid theme, load seaborn's 'tips' example dataset and draw a
# box plot of the total_bill column.
sns.set(style='whitegrid')
tips = sns.load_dataset('tips')
ax = sns.boxplot(x=tips['total_bill'])

#### Obtener los percentiles 75 y 25 (cuartiles 3 y 1)

In [None]:
# 75th and 25th percentiles of total_bill, unpacked in the same order as requested.
p75, p25 = np.percentile(tips.total_bill, [75, 25])

**Cálculo del rango intercuartílico**

In [None]:
# Interquartile range: the spread between the 75th and 25th percentiles.
iqr = p75 - p25

**Cálculo de los límites superior e inferior, datos fuera de estos límites serán datos extremos (outliers)**

In [None]:
# Lower and upper outlier limits, placed 1.5 * IQR beyond the quartiles.
# NOTE(review): `min` and `max` shadow the Python builtins for the rest of the
# kernel session; the two cells below rely on these exact names.
min = p25 - (iqr * 1.5)
max = p75 + (iqr * 1.5)

**Valores por encima del rango máximo**

In [None]:
# Boolean-mask the column to keep only values above the upper limit.
tips.total_bill[tips.total_bill > max]

**Valores por debajo del rango mínimo**

In [None]:
# Boolean-mask the column to keep only values below the lower limit.
tips.total_bill[tips.total_bill < min]