In [62]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import math

%matplotlib inline

In [63]:
# Location of the test-trips CSV and the subset of its columns used below.
pathTrips = "../Data/trip_test.csv"
cols = ['id', 'start_date', 'start_station_id', 'subscription_type']
dates = ["start_date"]  # columns parsed as datetimes while loading

# Read only the selected columns, parsing start_date into timestamps.
viajes = pd.read_csv(
    pathTrips,
    usecols=cols,
    parse_dates=dates,
    infer_datetime_format=True,
)

In [64]:
# Remove outliers: keep only trips whose duration is strictly below the
# 99th-percentile duration. Nothing happens when the frame has no
# 'duration' column.
if 'duration' in viajes.columns:
    duration_cutoff = viajes['duration'].quantile(0.99)
    viajes = viajes.loc[viajes['duration'] < duration_cutoff, :]

In [65]:
# Load the stations table, keeping only each station's id and city, and
# rename 'id' to 'station_id'.
cols = ['id', 'city']
estaciones = pd.read_csv('../Data/station.csv', usecols=cols)
estaciones = estaciones.rename(columns={'id': 'station_id'})

# Each city is associated with one zip code.
zip_codes = {
    'Mountain View': 94041,
    'Redwood City': 94063,
    'San Francisco': 94107,
    'Palo Alto': 94301,
    'San Jose': 95113,
}

# Add a zip_code column for every station by looking its city up in the
# mapping above; a city missing from the mapping raises KeyError.
estaciones['zip_code'] = estaciones['city'].apply(zip_codes.__getitem__)

In [66]:
# Attach a zip code to every trip by matching its start station against the
# stations table. The merge uses pandas' default inner join, so trips whose
# start station has no match are not kept. The duplicated join key
# 'station_id' is dropped afterwards.
station_zip = estaciones[['station_id', 'zip_code']]
viajes = viajes.merge(
    station_zip,
    left_on='start_station_id',
    right_on='station_id',
)
viajes = viajes.drop(['station_id'], axis=1)

In [67]:
# Read the weather observations, parsing the "date" column as datetimes.
weather = pd.read_csv(
    '../Data/weather.csv',
    parse_dates=["date"],
    infer_datetime_format=True,
)

In [68]:
# Normalise the event labels: fold the lowercase 'rain' spelling into 'Rain'
# and label rows with no recorded event as 'normal'.
is_lowercase_rain = weather['events'] == 'rain'
has_no_event = weather['events'].isnull()

weather.loc[is_lowercase_rain, 'events'] = 'Rain'
weather.loc[has_no_event, 'events'] = 'normal'

In [69]:
# Convert precipitation to numbers. errors='coerce' turns entries that cannot
# be parsed as numbers into NaN, which are then filled with 0.
# The result is assigned back to the column explicitly: calling
# fillna(inplace=True) on a column accessed from the frame is not guaranteed
# to write through to the frame.
weather['precipitation_inches'] = pd.to_numeric(
    weather['precipitation_inches'], errors='coerce'
).fillna(0)

# Reduce the timestamps to calendar dates and make zip_code an int64 so both
# columns can be used as join keys (see the merge on ['date', 'zip_code']).
weather['date'] = weather['date'].dt.date
weather['zip_code'] = weather['zip_code'].astype('int64')

In [70]:
# Calendar date of each trip's start timestamp, stored in a "date" column.
viajes["date"] = viajes["start_date"].dt.date

In [71]:
# Pair every trip with the weather row that shares its date and zip code.
# how='inner' is the pandas default, spelled out here: only rows whose
# (date, zip_code) pair is present in both frames are kept.
data = pd.merge(weather, viajes, how='inner', on=['date', 'zip_code'])

In [72]:
# Season bucket derived from the month of the start timestamp:
# Dec-Feb -> 1, Mar-May -> 2, Jun-Aug -> 3, Sep-Nov -> 4.
data['season'] = (data['start_date'].dt.month % 12 + 3) // 3

# Hour of the day at which the bike was rented.
data['hour'] = data['start_date'].dt.hour

# Weekend flag. dayofweek counts Monday as 0, so Saturday and Sunday are 5
# and 6. (The previous expression compared the day of the MONTH with 5, which
# flagged days 6-31 of every month rather than weekends.)
# NOTE(review): if a training set is prepared in another notebook, confirm it
# derives week_end the same way so both feature sets agree.
data['week_end'] = data['start_date'].dt.dayofweek >= 5

In [73]:
# Remove the raw start timestamp and the date join key from the frame.
data = data.drop(labels=["start_date", "date"], axis='columns')

In [74]:
# Cast the discrete-valued columns to pandas' categorical dtype in one call.
categorical_dtypes = {
    name: 'category'
    for name in ('events', 'subscription_type', "zip_code", "season", 'start_station_id')
}
data = data.astype(categorical_dtypes)

In [75]:
# Use the trip id as the row index.
data = data.set_index(keys='id')

In [76]:
# One-hot encode every categorical column. The remaining columns keep their
# original order and are followed by the indicator columns of each categorical
# column (named "<column>_<value>"), in the order those columns appeared.
categorical_cols = [
    name for name in data.columns if str(data[name].dtypes) == 'category'
]
indicator_frames = [
    pd.get_dummies(data[name], prefix=name) for name in categorical_cols
]
data = pd.concat(
    [data.drop(categorical_cols, axis=1)] + indicator_frames,
    axis=1,
)

In [77]:
# Write the prepared frame (index included) to disk as CSV.
output_path = "../Data/test.csv"
data.to_csv(output_path)