In [1]:
# Data handling
# -----------------------------------------------------------------------
import numpy as np
import pandas as pd


# Plotting
# ------------------------------------------------------------------------------
import matplotlib.pyplot as plt
import seaborn as sns


# Standardization of numeric variables and encoding of categorical variables
# ------------------------------------------------------------------------------
from sklearn.preprocessing import StandardScaler

# Handling of imbalanced data
# ------------------------------------------------------------------------------
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.combine import SMOTETomek

# Splitting the data into train and test sets
# ------------------------------------------------------------------------------
from sklearn.model_selection import train_test_split

#  Warning management
# ------------------------------------------------------------------------------
# NOTE(review): the filter below silences every warning for the rest of the
# session, including deprecation notices from pandas / scikit-learn.
import warnings
warnings.filterwarnings("ignore")


In [2]:
# Load the weather dataset; the first CSV column becomes the row index.
df2 = pd.read_csv(filepath_or_buffer="./Datos/tiempo.csv", index_col=0)
df2

Unnamed: 0,index,Date/Time,Temp_C,Dew Point Temp_C,Rel Hum_%,Wind Speed_km/h,Visibility_km,Press_kPa,Weather
0,16,1/1/2012 16:00,2.6,-0.2,82,13,12.9,99.93,Mostly Cloudy
1,26,1/2/2012 2:00,3.9,-0.9,71,32,25.0,101.14,Mostly Cloudy
2,27,1/2/2012 3:00,3.7,-1.5,69,33,25.0,101.14,Mostly Cloudy
3,28,1/2/2012 4:00,2.9,-2.3,69,32,25.0,101.14,Mostly Cloudy
4,29,1/2/2012 5:00,2.6,-2.3,70,32,25.0,101.14,Mostly Cloudy
...,...,...,...,...,...,...,...,...,...
4170,8755,12/30/2012 19:00,-13.4,-16.5,77,26,25.0,101.47,Mainly Clear
4171,8757,12/30/2012 21:00,-13.8,-16.5,80,20,25.0,101.50,Mainly Clear
4172,8758,12/30/2012 22:00,-13.7,-16.3,81,19,25.0,101.54,Mainly Clear
4173,8759,12/30/2012 23:00,-12.1,-15.1,78,28,25.0,101.52,Mostly Cloudy


 - Estandarizar las variables numéricas de vuestro set de datos

 - Codificar las variables categóricas. Recordad que tendréis que tener en cuenta si vuestras variables tienen orden o no

 - Chequear si vuestros datos están balanceados. En caso de que no lo estén utilizad algunas de las herramientas aprendidas en la lección para balancearlos.

 - Guardad el dataframe con los cambios que habéis aplicado para utilizarlo en la siguiente lección

In [3]:
# Work on an independent (deep) copy of the frame that was just loaded.

df_copia = df2.copy(deep=True)

In [4]:
# Create the scaler used to standardize the numeric columns
# (centering and scaling are both enabled, which are the defaults).

scaler = StandardScaler(with_mean=True, with_std=True)

In [5]:
# Standardization only applies to numeric predictor variables,
# so keep just the numeric-dtype columns.

numericas = df_copia.select_dtypes(include=[np.number])
numericas.head(n=5)

Unnamed: 0,index,Temp_C,Dew Point Temp_C,Rel Hum_%,Wind Speed_km/h,Visibility_km,Press_kPa
0,16,2.6,-0.2,82,13,12.9,99.93
1,26,3.9,-0.9,71,32,25.0,101.14
2,27,3.7,-1.5,69,33,25.0,101.14
3,28,2.9,-2.3,69,32,25.0,101.14
4,29,2.6,-2.3,70,32,25.0,101.14


In [6]:
# Exclude "index" from the columns to standardize (it looks like a leftover row
# identifier rather than a measurement — TODO confirm).
# Rebinding instead of mutating in place, together with errors="ignore", keeps
# this cell re-runnable: the in-place version raised KeyError on a second run.
numericas = numericas.drop(columns=["index"], errors="ignore")

Estandarizamos las variables numéricas de nuestro set de datos

In [7]:
# Fit the scaler on the numeric columns.

scaler.fit(numericas)

# Transform the data into standardized values.

X_escaladas = scaler.transform(numericas)

# Wrap the returned array in a DataFrame. The original row index is carried over
# explicitly: without it the new frame gets a fresh 0..n-1 RangeIndex, and any later
# column-wise concat (which aligns on index labels) would misalign rows or introduce
# NaN whenever the source index is not exactly 0..n-1.

numericas_estandar = pd.DataFrame(
    X_escaladas,
    columns=numericas.columns,
    index=numericas.index,
)
numericas_estandar.head(2)

Unnamed: 0,Temp_C,Dew Point Temp_C,Rel Hum_%,Wind Speed_km/h,Visibility_km,Press_kPa
0,-0.755162,-0.366931,1.320679,-0.201398,-1.722126,-1.782197
1,-0.645784,-0.43015,0.616199,2.334455,-0.673565,0.024563


In [8]:
# Drop the unscaled versions of the numeric columns before the standardized ones
# are attached. The column list is taken from `numericas` (exactly the columns that
# were scaled) instead of being hard-coded, so the two cannot drift apart.
# Rebinding with errors="ignore" keeps the cell safe to re-run.

df_copia = df_copia.drop(columns=numericas.columns, errors="ignore")
df_copia.head()

Unnamed: 0,index,Date/Time,Weather
0,16,1/1/2012 16:00,Mostly Cloudy
1,26,1/2/2012 2:00,Mostly Cloudy
2,27,1/2/2012 3:00,Mostly Cloudy
3,28,1/2/2012 4:00,Mostly Cloudy
4,29,1/2/2012 5:00,Mostly Cloudy


In [9]:
# Display the standardized numeric columns.
numericas_estandar

Unnamed: 0,Temp_C,Dew Point Temp_C,Rel Hum_%,Wind Speed_km/h,Visibility_km,Press_kPa
0,-0.755162,-0.366931,1.320679,-0.201398,-1.722126,-1.782197
1,-0.645784,-0.430150,0.616199,2.334455,-0.673565,0.024563
2,-0.662611,-0.484338,0.488112,2.467921,-0.673565,0.024563
3,-0.729921,-0.556589,0.488112,2.334455,-0.673565,0.024563
4,-0.755162,-0.556589,0.552155,2.334455,-0.673565,0.024563
...,...,...,...,...,...,...
4170,-2.101351,-1.839042,1.000461,1.533659,-0.673565,0.517316
4171,-2.135006,-1.839042,1.192591,0.732864,-0.673565,0.562112
4172,-2.126592,-1.820979,1.256635,0.599398,-0.673565,0.621840
4173,-1.991973,-1.712603,1.064504,1.800591,-0.673565,0.591976


In [10]:
# Attach the standardized columns next to the remaining columns of the working copy.

df_copia = pd.concat(objs=[df_copia, numericas_estandar], axis="columns")
# Display the result to check that everything lines up.
df_copia

Unnamed: 0,index,Date/Time,Weather,Temp_C,Dew Point Temp_C,Rel Hum_%,Wind Speed_km/h,Visibility_km,Press_kPa
0,16,1/1/2012 16:00,Mostly Cloudy,-0.755162,-0.366931,1.320679,-0.201398,-1.722126,-1.782197
1,26,1/2/2012 2:00,Mostly Cloudy,-0.645784,-0.430150,0.616199,2.334455,-0.673565,0.024563
2,27,1/2/2012 3:00,Mostly Cloudy,-0.662611,-0.484338,0.488112,2.467921,-0.673565,0.024563
3,28,1/2/2012 4:00,Mostly Cloudy,-0.729921,-0.556589,0.488112,2.334455,-0.673565,0.024563
4,29,1/2/2012 5:00,Mostly Cloudy,-0.755162,-0.556589,0.552155,2.334455,-0.673565,0.024563
...,...,...,...,...,...,...,...,...,...
4170,8755,12/30/2012 19:00,Mainly Clear,-2.101351,-1.839042,1.000461,1.533659,-0.673565,0.517316
4171,8757,12/30/2012 21:00,Mainly Clear,-2.135006,-1.839042,1.192591,0.732864,-0.673565,0.562112
4172,8758,12/30/2012 22:00,Mainly Clear,-2.126592,-1.820979,1.256635,0.599398,-0.673565,0.621840
4173,8759,12/30/2012 23:00,Mostly Cloudy,-1.991973,-1.712603,1.064504,1.800591,-0.673565,0.591976


Codificamos las variables categóricas. Recordad que tendréis que tener en cuenta si vuestras variables tienen orden o no

In [11]:
# Quick look at the first two rows before encoding.
df_copia.head(n=2)

Unnamed: 0,index,Date/Time,Weather,Temp_C,Dew Point Temp_C,Rel Hum_%,Wind Speed_km/h,Visibility_km,Press_kPa
0,16,1/1/2012 16:00,Mostly Cloudy,-0.755162,-0.366931,1.320679,-0.201398,-1.722126,-1.782197
1,26,1/2/2012 2:00,Mostly Cloudy,-0.645784,-0.43015,0.616199,2.334455,-0.673565,0.024563


In [12]:
# Column dtypes and non-null counts: object-dtype columns are the categorical candidates.
df_copia.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4175 entries, 0 to 4174
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   index             4175 non-null   int64  
 1   Date/Time         4175 non-null   object 
 2   Weather           4175 non-null   object 
 3   Temp_C            4175 non-null   float64
 4   Dew Point Temp_C  4175 non-null   float64
 5   Rel Hum_%         4175 non-null   float64
 6   Wind Speed_km/h   4175 non-null   float64
 7   Visibility_km     4175 non-null   float64
 8   Press_kPa         4175 non-null   float64
dtypes: float64(6), int64(1), object(2)
memory usage: 326.2+ KB


Variables categóricas

 - Date/Time (Tiene Orden)

 - Weather (No tiene Orden)

 Empezamos con "Weather" que no tiene orden

In [20]:
# Unordered (nominal) categorical columns to one-hot encode.
lista_columnas = ["Weather"]

# Build one block of 0/1 dummy columns per categorical column, then join all the
# blocks in a single concat. The previous loop concatenated each block onto an
# accumulator that was never updated, so with more than one column in
# `lista_columnas` only the last column's dummies would have survived.
dummies_por_columna = [
    pd.get_dummies(df_copia[columna], prefix=columna, prefix_sep="_", dtype=int)
    for columna in lista_columnas
]

df_encoded = pd.concat(dummies_por_columna, axis=1)


In [14]:
# Preview the dataframe that holds the encoded variables.
df_encoded.head(n=5)

Unnamed: 0,Weather_Mainly Clear,Weather_Mostly Cloudy
0,0,1
1,0,1
2,0,1
3,0,1
4,0,1


In [26]:
# Unstandardized variant: the raw numeric columns next to the encoded columns.
df_sin_esta = pd.concat(objs=[numericas, df_encoded], axis="columns")
df_sin_esta.head(n=5)

Unnamed: 0,Temp_C,Dew Point Temp_C,Rel Hum_%,Wind Speed_km/h,Visibility_km,Press_kPa,Weather_Mainly Clear,Weather_Mostly Cloudy
0,2.6,-0.2,82,13,12.9,99.93,0,1
1,3.9,-0.9,71,32,25.0,101.14,0,1
2,3.7,-1.5,69,33,25.0,101.14,0,1
3,2.9,-2.3,69,32,25.0,101.14,0,1
4,2.6,-2.3,70,32,25.0,101.14,0,1


In [27]:
# Persist the unstandardized variant (the row index is written as the first column).
df_sin_esta.to_csv(path_or_buf="./Datos/df_sin_esta.csv")

In [15]:
# Join the encoded columns onto the working dataframe so all data lives in one frame.

df_final = pd.concat(objs=[df_copia, df_encoded], axis="columns")
df_final.head(n=5)

Unnamed: 0,index,Date/Time,Weather,Temp_C,Dew Point Temp_C,Rel Hum_%,Wind Speed_km/h,Visibility_km,Press_kPa,Weather_Mainly Clear,Weather_Mostly Cloudy
0,16,1/1/2012 16:00,Mostly Cloudy,-0.755162,-0.366931,1.320679,-0.201398,-1.722126,-1.782197,0,1
1,26,1/2/2012 2:00,Mostly Cloudy,-0.645784,-0.43015,0.616199,2.334455,-0.673565,0.024563,0,1
2,27,1/2/2012 3:00,Mostly Cloudy,-0.662611,-0.484338,0.488112,2.467921,-0.673565,0.024563,0,1
3,28,1/2/2012 4:00,Mostly Cloudy,-0.729921,-0.556589,0.488112,2.334455,-0.673565,0.024563,0,1
4,29,1/2/2012 5:00,Mostly Cloudy,-0.755162,-0.556589,0.552155,2.334455,-0.673565,0.024563,0,1


In [16]:
# Remove the original categorical columns now that their encoded versions exist.
# Rebinding with errors="ignore" (instead of an in-place drop) keeps the cell
# re-runnable; the in-place version raised KeyError on a second execution.

df_final = df_final.drop(columns=lista_columnas, errors="ignore")
df_final.head(2)

Unnamed: 0,index,Date/Time,Temp_C,Dew Point Temp_C,Rel Hum_%,Wind Speed_km/h,Visibility_km,Press_kPa,Weather_Mainly Clear,Weather_Mostly Cloudy
0,16,1/1/2012 16:00,-0.755162,-0.366931,1.320679,-0.201398,-1.722126,-1.782197,0,1
1,26,1/2/2012 2:00,-0.645784,-0.43015,0.616199,2.334455,-0.673565,0.024563,0,1


Continuamos codificando la variable categórica con orden  
- Date/Time (Tiene Orden)

In [17]:
# (rows, columns) of the frame after encoding.
df_final.shape

(4175, 10)

In [18]:
# TODO(review): not sure this step is needed. Date/Time is an ordered categorical,
# but encoding it would discard its information, unless every date in the dataframe
# were encoded (e.g. with applymap or similar, iterating over the 4175 rows).

# Define the mapping dictionary.
# NOTE(review): the commented lines below look like a template from another dataset
# (they reference a "sex" column, which this dataframe does not show) — adapt or remove.

#map_sex = {"male": 0, "female": 1}
#df_final["sex"] = df_final["sex"].map(map_sex)
#df_final.head(2)

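Guardado de los datos codificados: la versión sin estandarizar (`df_sin_esta`) se construyó y guardó más arriba; a continuación eliminamos `index` y `Date/Time` de la versión estandarizada (`df_final`) y la guardamos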

In [22]:
# Drop the row identifier and the raw timestamp before saving.
# Rebinding with errors="ignore" keeps the cell re-runnable; the in-place drop
# raised KeyError when executed a second time.
df_final = df_final.drop(columns=["index", "Date/Time"], errors="ignore")

In [24]:
# Persist the standardized and encoded dataframe (the row index is written as the first column).
df_final.to_csv(path_or_buf="./Datos/df_esta.csv")