# Aclaración

En este notebook se procesan los datos de nuestro conjunto y se genera la división de datos que luego usaremos en el desarrollo de nuestro modelo.

# Importación de librerías a utilizar

In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
# Read the processed dataset CSV from disk and keep it in memory as a DataFrame.
dataset_route = "dataset/SeoulBikeData_procesed.csv"
dataset = pd.read_csv(filepath_or_buffer=dataset_route)

# Preview the first ten rows (last expression of the cell, so it is displayed).
dataset.head(n=10)

Unnamed: 0.1,Unnamed: 0,Day,Month,Year,Rented Bike Count,Hour,Temperature(C),Humidity(%),Wind speed (m/s),Visibility (10m),Dew point temperature(C),Solar Radiation (MJ/m2),Rainfall(mm),Snowfall (cm),Autumn,Spring,Summer,Holiday,Functioning Day
0,0,1.0,12.0,2017.0,254,0,-5.2,37,2.2,2000,-17.6,0.0,0.0,0.0,0,0,0,0,0
1,1,1.0,12.0,2017.0,204,1,-5.5,38,0.8,2000,-17.6,0.0,0.0,0.0,0,0,0,0,0
2,2,1.0,12.0,2017.0,173,2,-6.0,39,1.0,2000,-17.7,0.0,0.0,0.0,0,0,0,0,0
3,3,1.0,12.0,2017.0,107,3,-6.2,40,0.9,2000,-17.6,0.0,0.0,0.0,0,0,0,0,0
4,4,1.0,12.0,2017.0,78,4,-6.0,36,2.3,2000,-18.6,0.0,0.0,0.0,0,0,0,0,0
5,5,1.0,12.0,2017.0,100,5,-6.4,37,1.5,2000,-18.7,0.0,0.0,0.0,0,0,0,0,0
6,6,1.0,12.0,2017.0,181,6,-6.6,35,1.3,2000,-19.5,0.0,0.0,0.0,0,0,0,0,0
7,7,1.0,12.0,2017.0,460,7,-7.4,38,0.9,2000,-19.3,0.0,0.0,0.0,0,0,0,0,0
8,8,1.0,12.0,2017.0,930,8,-7.6,37,1.1,2000,-19.8,0.01,0.0,0.0,0,0,0,0,0
9,9,1.0,12.0,2017.0,490,9,-6.5,27,0.5,1928,-22.4,0.23,0.0,0.0,0,0,0,0,0


In [3]:
# Drop the first column of the DataFrame ('Unnamed: 0') in place.
# NOTE(review): presumably a row index that was saved along with the CSV — confirm.
dataset.drop(labels='Unnamed: 0', axis='columns', inplace=True)

In [4]:
# Preview the first ten rows again to check the DataFrame after the column removal
dataset.head(10)

Unnamed: 0,Day,Month,Year,Rented Bike Count,Hour,Temperature(C),Humidity(%),Wind speed (m/s),Visibility (10m),Dew point temperature(C),Solar Radiation (MJ/m2),Rainfall(mm),Snowfall (cm),Autumn,Spring,Summer,Holiday,Functioning Day
0,1.0,12.0,2017.0,254,0,-5.2,37,2.2,2000,-17.6,0.0,0.0,0.0,0,0,0,0,0
1,1.0,12.0,2017.0,204,1,-5.5,38,0.8,2000,-17.6,0.0,0.0,0.0,0,0,0,0,0
2,1.0,12.0,2017.0,173,2,-6.0,39,1.0,2000,-17.7,0.0,0.0,0.0,0,0,0,0,0
3,1.0,12.0,2017.0,107,3,-6.2,40,0.9,2000,-17.6,0.0,0.0,0.0,0,0,0,0,0
4,1.0,12.0,2017.0,78,4,-6.0,36,2.3,2000,-18.6,0.0,0.0,0.0,0,0,0,0,0
5,1.0,12.0,2017.0,100,5,-6.4,37,1.5,2000,-18.7,0.0,0.0,0.0,0,0,0,0,0
6,1.0,12.0,2017.0,181,6,-6.6,35,1.3,2000,-19.5,0.0,0.0,0.0,0,0,0,0,0
7,1.0,12.0,2017.0,460,7,-7.4,38,0.9,2000,-19.3,0.0,0.0,0.0,0,0,0,0,0
8,1.0,12.0,2017.0,930,8,-7.6,37,1.1,2000,-19.8,0.01,0.0,0.0,0,0,0,0,0
9,1.0,12.0,2017.0,490,9,-6.5,27,0.5,1928,-22.4,0.23,0.0,0.0,0,0,0,0,0


# Normalización del dataset completo

In [5]:
# Goal of this notebook: process the DataFrame, standardize it with scikit-learn's
# StandardScaler and store the results in a local path for the future model.
# NOTE(review): the original comment also mentions generating training and testing
# subsets, but no split is performed in the cells visible here — confirm.
#
# Two standardized variants are produced: in the first one the columns holding the
# recorded date of each example are left as they are; in the second one those date
# columns are standardized as well.

# 'Functioning Day' is the target; every remaining column is kept as a predictor.
y = dataset['Functioning Day']
X = dataset.drop('Functioning Day', axis=1)

In [19]:
# NOTE(review): the original comment here cited a bit more than 7000 training
# examples and about 1750 test examples, but this cell performs no train/test
# split; the scaler below is fit on every row of `dataset`.
scaler = StandardScaler()


# Variant 1: standardize everything except the date columns and the target ------

columns_to_exclude = ['Day', 'Month', 'Year', 'Functioning Day']
# Boolean mask over the column index keeps the original column order.
columns_to_normalize = dataset.columns[~dataset.columns.isin(columns_to_exclude)].tolist()

dataset_norm_without_date = dataset.copy()
scaled_values = scaler.fit_transform(dataset[columns_to_normalize])
dataset_norm_without_date[columns_to_normalize] = scaled_values


# Variant 2: standardize every column except the target ------

columns_to_exclude = ['Functioning Day']
columns_to_normalize = dataset.columns[~dataset.columns.isin(columns_to_exclude)].tolist()

dataset_norm = dataset.copy()
# fit_transform refits the scaler from scratch, so reusing the instance is fine.
scaled_values = scaler.fit_transform(dataset[columns_to_normalize])
dataset_norm[columns_to_normalize] = scaled_values

In [23]:
# Store both standardized DataFrames as .csv files
import os

# os.makedirs(..., exist_ok=True) replaces the "check, then os.mkdir" pattern: it
# creates any missing parent directory and does nothing when the directory is
# already there, so the cell can be re-run safely.
subsets_folder = 'normalized_datasets/'
os.makedirs(subsets_folder, exist_ok=True)

# Variant in which every predictor (date columns included) was standardized
norm_subsets_folder = 'normalized_datasets/normalized_all/'
dataset_norm_route = 'normalized_datasets/normalized_all/dataset_norm.csv'
os.makedirs(norm_subsets_folder, exist_ok=True)
# NOTE(review): to_csv writes the row index as an extra unnamed first column by
# default; kept as-is because downstream readers may rely on it — confirm.
dataset_norm.to_csv(dataset_norm_route)



# Variant in which the date columns were left unscaled
not_norm_subsets_folder = 'normalized_datasets/normalized_without_date/'
dataset_norm_without_date_route = 'normalized_datasets/normalized_without_date/dataset_norm_without_date.csv'

os.makedirs(not_norm_subsets_folder, exist_ok=True)
dataset_norm_without_date.to_csv(dataset_norm_without_date_route)

# Normalización del dataset limitando las variables predictoras

In [29]:
# Build the reduced-feature variants. Per the original author's note, the columns
# to discard were chosen from the correlation coefficient of each variable with
# respect to the dependent variable.

scaler = StandardScaler()

# NOTE(review): this drop mutates `dataset` in place, so running the cell a second
# time raises KeyError because the columns are already gone.
columns_to_exclude = ['Day', 'Rented Bike Count', 'Hour', 'Wind speed (m/s)', 'Rainfall(mm)', 'Snowfall (cm)', 'Spring', 'Summer']
dataset.drop(columns_to_exclude, axis=1, inplace=True)

# Variant 1: standardize everything except the remaining date columns and the target ------

columns_to_exclude = ['Month', 'Year' ,'Functioning Day']
# Boolean mask over the column index keeps the original column order.
columns_to_normalize = dataset.columns[~dataset.columns.isin(columns_to_exclude)].tolist()

dataset_norm_without_date = dataset.copy()
scaled_values = scaler.fit_transform(dataset[columns_to_normalize])
dataset_norm_without_date[columns_to_normalize] = scaled_values


# Variant 2: standardize every remaining column except the target ------

columns_to_exclude = ['Functioning Day']
columns_to_normalize = dataset.columns[~dataset.columns.isin(columns_to_exclude)].tolist()

dataset_norm = dataset.copy()
# fit_transform refits the scaler from scratch, so reusing the instance is fine.
scaled_values = scaler.fit_transform(dataset[columns_to_normalize])
dataset_norm[columns_to_normalize] = scaled_values

In [32]:
# Display the reduced-feature DataFrame in which every predictor was standardized
dataset_norm

Unnamed: 0,Month,Year,Temperature(C),Humidity(%),Visibility (10m),Dew point temperature(C),Solar Radiation (MJ/m2),Autumn,Holiday,Functioning Day
0,1.587648,-3.282407,-1.513957,-1.042483,0.925871,-1.659605,-0.655132,-0.576296,-0.227757,0
1,1.587648,-3.282407,-1.539074,-0.993370,0.925871,-1.659605,-0.655132,-0.576296,-0.227757,0
2,1.587648,-3.282407,-1.580936,-0.944257,0.925871,-1.667262,-0.655132,-0.576296,-0.227757,0
3,1.587648,-3.282407,-1.597680,-0.895144,0.925871,-1.659605,-0.655132,-0.576296,-0.227757,0
4,1.587648,-3.282407,-1.580936,-1.091596,0.925871,-1.736177,-0.655132,-0.576296,-0.227757,0
...,...,...,...,...,...,...,...,...,...,...
8755,1.297612,0.304655,-0.726961,-1.189822,0.751605,-1.100630,-0.655132,1.735220,-0.227757,0
8756,1.297612,0.304655,-0.793939,-1.042483,0.925871,-1.070001,-0.655132,1.735220,-0.227757,0
8757,1.297612,0.304655,-0.860918,-0.944257,0.873263,-1.070001,-0.655132,1.735220,-0.227757,0
8758,1.297612,0.304655,-0.902779,-0.846031,0.694064,-1.062344,-0.655132,1.735220,-0.227757,0


In [33]:
# Store the reduced-feature standardized DataFrames as .csv files.
#
# The target folders sit two levels below 'normalized_datasets/', and the
# intermediate 'normalized_datasets/cares_corr/' folder is not created by any
# earlier step. os.mkdir does not create parent directories, so it failed with
# FileNotFoundError on a clean run; os.makedirs(..., exist_ok=True) creates the
# whole path and does nothing when it already exists.

# Variant in which every remaining predictor (date columns included) was standardized
norm_subsets_folder = 'normalized_datasets/cares_corr/normalized_all/'
dataset_norm_route = 'normalized_datasets/cares_corr/normalized_all/dataset_norm.csv'
os.makedirs(norm_subsets_folder, exist_ok=True)
dataset_norm.to_csv(dataset_norm_route)



# Variant in which the date columns were left unscaled
not_norm_subsets_folder = 'normalized_datasets/cares_corr/normalized_without_date/'
dataset_norm_without_date_route = 'normalized_datasets/cares_corr/normalized_without_date/dataset_norm_without_date.csv'

os.makedirs(not_norm_subsets_folder, exist_ok=True)
dataset_norm_without_date.to_csv(dataset_norm_without_date_route)