In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import datetime

In [2]:
df = pd.read_csv('datos/bikes.csv', index_col=0)

In [3]:
df.head(2)

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,01-01-2018,spring,0,1,0,6,0,2,14.110847,18.18125,80.5833,10.749882,331,654,985
1,2,02-01-2018,spring,0,1,0,0,0,2,14.902598,17.68695,69.6087,16.652113,131,670,801


- instant: record index
- dteday : date
- season : season (spring, summer, autumn, winter)
- yr : year (0: 2018, 1:2019)
- mnth : month ( 1 to 12)
- holiday : weather day is a holiday or not (extracted from http://dchr.dc.gov/page/holiday-schedule)
- weekday : day of the week
- workingday : if day is neither weekend nor holiday is 1, otherwise is 0.
+ weathersit : 
    - 1: Clear, Few clouds, Partly cloudy, Partly cloudy
    - 2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist
    - 3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds
    - 4: Heavy Rain + Ice Pallets + Thunderstorm + Mist, Snow + Fog
- temp : temperature in Celsius
- atemp: feeling temperature in Celsius
- hum: humidity
- windspeed: wind speed
- casual: count of casual users
- registered: count of registered users
- cnt: count of total rental bikes including both casual and registered

In [4]:
df.shape

(730, 16)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 730 entries, 0 to 729
Data columns (total 16 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   instant     730 non-null    int64  
 1   dteday      730 non-null    object 
 2   season      730 non-null    object 
 3   yr          730 non-null    int64  
 4   mnth        730 non-null    int64  
 5   holiday     730 non-null    int64  
 6   weekday     730 non-null    int64  
 7   workingday  730 non-null    int64  
 8   weathersit  730 non-null    int64  
 9   temp        730 non-null    float64
 10  atemp       730 non-null    float64
 11  hum         730 non-null    float64
 12  windspeed   730 non-null    float64
 13  casual      730 non-null    int64  
 14  registered  730 non-null    int64  
 15  cnt         730 non-null    int64  
dtypes: float64(4), int64(10), object(2)
memory usage: 97.0+ KB


In [6]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
instant,730.0,365.5,210.877136,1.0,183.25,365.5,547.75,730.0
yr,730.0,0.5,0.500343,0.0,0.0,0.5,1.0,1.0
mnth,730.0,6.526027,3.450215,1.0,4.0,7.0,10.0,12.0
holiday,730.0,0.028767,0.167266,0.0,0.0,0.0,0.0,1.0
weekday,730.0,2.99726,2.006161,0.0,1.0,3.0,5.0,6.0
workingday,730.0,0.683562,0.465405,0.0,0.0,1.0,1.0,1.0
weathersit,730.0,1.394521,0.544807,1.0,1.0,1.0,2.0,3.0
temp,730.0,20.319259,7.506729,2.424346,13.811885,20.465826,26.880615,35.328347
atemp,730.0,23.726322,8.150308,3.95348,16.889713,24.368225,30.445775,42.0448
hum,730.0,62.765175,14.237589,0.0,52.0,62.625,72.989575,97.25


In [7]:
df.duplicated().sum()

0

In [8]:
df_cat = df.select_dtypes(include='object')
df_cat.head()

Unnamed: 0,dteday,season
0,01-01-2018,spring
1,02-01-2018,spring
2,03-01-2018,spring
3,04-01-2018,spring
4,05-01-2018,spring


In [9]:
df_num = df.select_dtypes(include=np.number)
df_num.head()

Unnamed: 0,instant,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,0,1,0,6,0,2,14.110847,18.18125,80.5833,10.749882,331,654,985
1,2,0,1,0,0,0,2,14.902598,17.68695,69.6087,16.652113,131,670,801
2,3,0,1,0,1,1,1,8.050924,9.47025,43.7273,16.636703,120,1229,1349
3,4,0,1,0,2,1,1,8.2,10.6061,59.0435,10.739832,108,1454,1562
4,5,0,1,0,3,1,1,9.305237,11.4635,43.6957,12.5223,82,1518,1600


In [10]:
# creamos un diccionario para renombrar las columnas

dicc_columnas = {'instant': 'indice',
'dteday' : 'fecha',
'season' : 'estacion',
'yr' : 'año',
'mnth' : 'mes',
'holiday' : 'festivo', 
'weekday' : 'dia_semana',
'workingday' : 'dia_laboral',
'weathersit' : 'tiempo',
'temp' : 'temperatura',
'atemp': 'sensacion_term',
'hum': 'humedad',
'windspeed': 'velocidad_viento',
'casual': 'cliente_casual',
'registered': 'cliente_registrado',
'cnt': 'total_clientes'}

In [11]:
# renombramos las columnas segun el diccionario 

df.rename(columns = dicc_columnas, inplace=True)

In [32]:
# ponemos la nueva columna 'index' como indice

df.set_index('indice', inplace=True)

In [31]:
df.head(2)

Unnamed: 0,indice,fecha,estacion,año,mes,festivo,dia_semana,dia_laboral,tiempo,temperatura,sensacion_term,humedad,velocidad_viento,cliente_casual,cliente_registrado,total_clientes
0,1,2018-01-01,,2018,January,0,6,0,2,14.110847,18.18125,80.5833,10.749882,331,654,985
1,2,2018-02-01,,2018,February,0,0,0,2,14.902598,17.68695,69.6087,16.652113,131,670,801


In [14]:
# inspeccionamos los tipos de datos de las columnas
df.dtypes

indice                  int64
fecha                  object
estacion               object
año                     int64
mes                     int64
festivo                 int64
dia_semana              int64
dia_laboral             int64
tiempo                  int64
temperatura           float64
sensacion_term        float64
humedad               float64
velocidad_viento      float64
cliente_casual          int64
cliente_registrado      int64
total_clientes          int64
dtype: object

In [30]:
# convertimos la columna fecha a tipo datetime

df['fecha'] = df['fecha'].apply(pd.to_datetime)

In [16]:
# funcion para sacar el año con un apply

def sacar_año(x):
    return x.strftime('%Y')

In [17]:
# funcion para sacar el mes con un apply

def sacar_mes(x):
    return x.strftime('%B')

In [25]:
# sobreescribimos la columna del año sacando el año directamente de la fecha

df['año'] = df['fecha'].apply(sacar_año)

In [26]:
# sobreescribimos la columna de meses sacando el mes directamente de la fecha

df['mes'] = df['fecha'].apply(sacar_mes)

In [28]:
def estacion(x):
    año = x.year()
    if x <= f'':
        return 'invierno'
    elif x <= '%Y-06-20':
        return 'primavera'
    elif x <= '%Y-09-20':
        return 'verano'
    elif x <= '%Y-12-20':
        return 'otoño'

In [None]:
estaciones = {'invierno' : (start='21/3/', end='1/08/2018')}

In [29]:
df['estacion'] = df['fecha'].apply(estacion)

TypeError: '<=' not supported between instances of 'Timestamp' and 'str'

In [27]:
df.dtypes

indice                         int64
fecha                 datetime64[ns]
estacion                      object
año                           object
mes                           object
festivo                        int64
dia_semana                     int64
dia_laboral                    int64
tiempo                         int64
temperatura                  float64
sensacion_term               float64
humedad                      float64
velocidad_viento             float64
cliente_casual                 int64
cliente_registrado             int64
total_clientes                 int64
dtype: object

In [22]:
df.head()

Unnamed: 0,indice,fecha,estacion,año,mes,festivo,dia_semana,dia_laboral,tiempo,temperatura,sensacion_term,humedad,velocidad_viento,cliente_casual,cliente_registrado,total_clientes
0,1,01-01-2018,,0,1,0,6,0,2,14.110847,18.18125,80.5833,10.749882,331,654,985
1,2,02-01-2018,,0,1,0,0,0,2,14.902598,17.68695,69.6087,16.652113,131,670,801
2,3,03-01-2018,,0,1,0,1,1,1,8.050924,9.47025,43.7273,16.636703,120,1229,1349
3,4,04-01-2018,,0,1,0,2,1,1,8.2,10.6061,59.0435,10.739832,108,1454,1562
4,5,05-01-2018,,0,1,0,3,1,1,9.305237,11.4635,43.6957,12.5223,82,1518,1600
