# Limpieza

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sidetable # si os da un error este comando es por no tenerlo instalado, 
                 # vemos cómo instalarlo en el siguiente apartado

# establecer el tamaño de nuestras figuras
plt.rcParams["figure.figsize"] = (10,8)

In [2]:
df = pd.read_csv('data/bikes.csv', index_col = 0)

In [3]:
df.head()

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,01-01-2018,spring,0,1,0,6,0,2,14.110847,18.18125,80.5833,10.749882,331,654,985
1,2,02-01-2018,spring,0,1,0,0,0,2,14.902598,17.68695,69.6087,16.652113,131,670,801
2,3,03-01-2018,spring,0,1,0,1,1,1,8.050924,9.47025,43.7273,16.636703,120,1229,1349
3,4,04-01-2018,spring,0,1,0,2,1,1,8.2,10.6061,59.0435,10.739832,108,1454,1562
4,5,05-01-2018,spring,0,1,0,3,1,1,9.305237,11.4635,43.6957,12.5223,82,1518,1600


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 730 entries, 0 to 729
Data columns (total 16 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   instant     730 non-null    int64  
 1   dteday      730 non-null    object 
 2   season      730 non-null    object 
 3   yr          730 non-null    int64  
 4   mnth        730 non-null    int64  
 5   holiday     730 non-null    int64  
 6   weekday     730 non-null    int64  
 7   workingday  730 non-null    int64  
 8   weathersit  730 non-null    int64  
 9   temp        730 non-null    float64
 10  atemp       730 non-null    float64
 11  hum         730 non-null    float64
 12  windspeed   730 non-null    float64
 13  casual      730 non-null    int64  
 14  registered  730 non-null    int64  
 15  cnt         730 non-null    int64  
dtypes: float64(4), int64(10), object(2)
memory usage: 97.0+ KB


In [5]:
df.isnull().sum()

instant       0
dteday        0
season        0
yr            0
mnth          0
holiday       0
weekday       0
workingday    0
weathersit    0
temp          0
atemp         0
hum           0
windspeed     0
casual        0
registered    0
cnt           0
dtype: int64

In [6]:
df.duplicated().sum()

0

### Renombramos las columnas

Nuestro df contiene las siguientes columnas:
- registro = (instant), es el índice
- fecha = (dteday)
- estacion = (season) Hay que cambiarla, algunas estaciones no coinciden con la fecha.
- año = (year) También hay que cambiarla, está en 0  y 1 (2018,2019)
- mes = (month) 
- festivo = (holiday) 0: laborales, 1:festivos, creemos que tomará festivos.
- dia_semana = (weekday) 
- no_laboral = (workingday) Cuenta como 0 el laboral y el 1 el no laboral
- clima = ('weathersit')
- 'temperatura' =  (temp)
- 'sens_termica'=  ('atemp')
- humedad = (hum)
- viento = (windspeed)
- ocasionales = (casual)
- registrados = (registred)
- total = (cnt)


In [7]:
df.columns

Index(['instant', 'dteday', 'season', 'yr', 'mnth', 'holiday', 'weekday',
       'workingday', 'weathersit', 'temp', 'atemp', 'hum', 'windspeed',
       'casual', 'registered', 'cnt'],
      dtype='object')

In [8]:
dic_columnas = {'instant': 'registro', 'dteday': 'fecha', 'season': 'estacion', 'yr': 'año', 'mnth':'mes', 'holiday':'festivo', 'weekday':'dia_semana',
       'workingday':'no_laboral', 'weathersit':'clima', 'temp':'temperatura', 'atemp':'sens_termica', 'hum':'humedad', 'windspeed':'viento', 
       'casual':'ocasionales', 'registered':'registrados', 'cnt':'total'}

In [9]:
df.rename(columns = dic_columnas, inplace = True)

In [10]:
df.head(2)

Unnamed: 0,registro,fecha,estacion,año,mes,festivo,dia_semana,no_laboral,clima,temperatura,sens_termica,humedad,viento,ocasionales,registrados,total
0,1,01-01-2018,spring,0,1,0,6,0,2,14.110847,18.18125,80.5833,10.749882,331,654,985
1,2,02-01-2018,spring,0,1,0,0,0,2,14.902598,17.68695,69.6087,16.652113,131,670,801


In [11]:
df['festivo'].value_counts()

0    709
1     21
Name: festivo, dtype: int64

In [12]:
festivos = df[df['festivo'] == 1]
festivos.head(21)

Unnamed: 0,registro,fecha,estacion,año,mes,festivo,dia_semana,no_laboral,clima,temperatura,sens_termica,humedad,viento,ocasionales,registrados,total
16,17,17-01-2018,spring,0,1,1,1,0,2,7.209153,8.83855,53.75,12.999139,117,883,1000
51,52,21-02-2018,spring,0,2,1,1,0,2,12.436653,14.20375,60.5,20.625682,195,912,1107
104,105,15-04-2018,summer,0,4,1,5,0,1,18.313347,22.09565,67.125,15.167125,642,2484,3126
149,150,30-05-2018,summer,0,5,1,1,0,1,30.066653,33.5546,68.5,8.792075,1549,2549,4098
184,185,04-07-2018,autumn,0,7,1,1,0,2,29.793347,33.27085,63.7917,5.459106,3065,2978,6043
247,248,05-09-2018,autumn,0,9,1,1,0,2,27.606653,31.2823,79.0417,14.250632,1236,2115,3351
282,283,10-10-2018,winter,0,10,1,1,0,1,23.404153,27.14625,73.375,2.834381,1514,3603,5117
314,315,11-11-2018,winter,0,11,1,5,0,1,13.290847,15.34085,44.625,21.083225,440,2928,3368
327,328,24-11-2018,winter,0,11,1,4,0,1,15.306653,18.62355,54.9167,11.209368,560,935,1495
359,360,26-12-2018,spring,0,12,1,1,0,1,13.191299,15.77675,50.6957,16.044155,430,887,1317


#### Cambiamos la columna año, mes y día_semana

In [13]:
df['dia_semana'].unique()

array([6, 0, 1, 2, 3, 4, 5])

Cambiamos el formato a la columna fecha

In [14]:
df['fecha_nueva'] = pd.to_datetime(df["fecha"], format="%d-%m-%Y")

In [15]:
#!pip install holidays
import holidays

In [16]:
df['festividad'] = pd.Series(df.fecha_nueva).apply(lambda x: holidays.CountryHoliday('US').get(x)).values

In [17]:
df['festividad_1'] = df['festividad'].astype('bool').astype('int')

In [18]:
df['festividad_1'].value_counts()

0    709
1     21
Name: festividad_1, dtype: int64

In [19]:
df.head(2)

Unnamed: 0,registro,fecha,estacion,año,mes,festivo,dia_semana,no_laboral,clima,temperatura,sens_termica,humedad,viento,ocasionales,registrados,total,fecha_nueva,festividad,festividad_1
0,1,01-01-2018,spring,0,1,0,6,0,2,14.110847,18.18125,80.5833,10.749882,331,654,985,2018-01-01,New Year's Day,1
1,2,02-01-2018,spring,0,1,0,0,0,2,14.902598,17.68695,69.6087,16.652113,131,670,801,2018-01-02,,0


Cambiamos las estaciones

Primero renombramos:

In [21]:
from datetime import date, datetime

In [28]:
#df['año_bueno'] = df['fecha_nueva'].apply(datetime.date.today().year)

df['año_bueno'] = pd.DatetimeIndex(df['fecha_nueva']).year 

In [29]:
df['año_bueno'].value_counts()

2018    365
2019    365
Name: año_bueno, dtype: int64

In [20]:
year = df['año'].unique()

for x in year:
    seasons = [('winter', (date(x,  1,  1),  date(x,  3, 20))),
            ('spring', (date(x,  3, 21),  date(x,  6, 20))),
            ('summer', (date(x,  6, 21),  date(x,  9, 22))),
            ('autumn', (date(x,  9, 23),  date(x, 12, 20))),
            ('winter', (date(x, 12, 21),  date(x, 12, 31)))]

def get_season(now):
    if isinstance(now, datetime):
        now = now.date()
    now = now.replace(year=year)
    return next(season for season, (start, end) in seasons
                if start <= now <= end)

print(get_season(date.today()))

ValueError: year 0 is out of range