In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import math

from sklearn import preprocessing
from sklearn.decomposition import TruncatedSVD

%matplotlib inline

In [25]:
# Load the trips file; start_date and end_date are parsed into datetimes.
viajes = pd.read_csv(
    "../Data/trip_test.csv",
    parse_dates=["start_date", "end_date"],
    infer_datetime_format=True,
)

In [26]:
# Discard the station names, the bike id and the trip's own zip_code column
# (a station-based zip_code is merged in further down).
viajes = viajes.drop(
    columns=['end_station_name', 'start_station_name', 'bike_id', 'zip_code']
)

In [27]:
# Remove outliers: keep only trips strictly below the 99th percentile of
# duration. The guard lets the cell run on files without a 'duration' column.
if 'duration' in viajes.columns:
    viajes = viajes[viajes['duration'] < viajes['duration'].quantile(0.99)]

In [28]:
# Load the stations table and rename its key column to 'station_id'
estaciones = pd.read_csv('../Data/station.csv').rename(columns={'id': 'station_id'})

# Each city is tied to one zip code
zip_codes = {
    'Mountain View': 94041,
    'Redwood City': 94063,
    'San Francisco': 94107,
    'Palo Alto': 94301,
    'San Jose': 95113,
}

# New column holding each station's zip code, looked up from its city.
# A city missing from zip_codes raises KeyError instead of producing NaN.
estaciones['zip_code'] = estaciones['city'].map(zip_codes.__getitem__)

In [29]:
# Build a dictionary with the Euclidean distance between every ordered pair
# of stations, keyed by (station_id_1, station_id_2).
# The distance is computed directly on the lat/long columns, so it is in the
# same units as those columns and is not a ground distance.
ids = estaciones.station_id
stm = estaciones[["station_id", "lat", "long"]]

# One (lat, long) tuple per station id, extracted once instead of running a
# DataFrame lookup inside the double loop. If an id appears more than once
# the first row wins.
coordenadas = {}
for sid, lat, lon in zip(stm["station_id"], stm["lat"], stm["long"]):
    coordenadas.setdefault(sid, (lat, lon))

# Fix: the distance compares station 1 against station 2 coordinate by
# coordinate (lat1 - lat2, long1 - long2). The previous formula subtracted
# each station's own latitude from its own longitude.
distancias = {
    (id1, id2): math.hypot(lat1 - lat2, lon1 - lon2)
    for id1, (lat1, lon1) in coordenadas.items()
    for id2, (lat2, lon2) in coordenadas.items()
}

In [30]:
# Add a column with the distance between each trip's start and end station,
# looked up in the precomputed `distancias` dictionary.
viajes["distancia"] = [
    distancias[(origen, destino)]
    for origen, destino in zip(viajes["start_station_id"], viajes["end_station_id"])
]

In [31]:
# Attach the zip code of the start station to every trip (inner join), then
# drop the helper 'station_id' key brought in from the stations table.
viajes_merged = (
    viajes
    .merge(
        estaciones[['station_id', 'zip_code']],
        left_on='start_station_id',
        right_on='station_id',
    )
    .drop(columns=['station_id'])
)

In [32]:
# Load the weather table; the 'date' column is parsed into datetimes.
weather = pd.read_csv(
    '../Data/weather.csv',
    parse_dates=["date"],
    infer_datetime_format=True,
)

In [33]:
# Normalise the 'events' labels: 'Rain' and 'Rain-Thunderstorm' collapse to
# 'rain', and rows with no recorded event are labelled 'normal'.
es_lluvia = weather['events'].isin(['Rain', 'Rain-Thunderstorm'])
weather.loc[es_lluvia, 'events'] = 'rain'

sin_evento = weather['events'].isna()
weather.loc[sin_evento, 'events'] = 'normal'

In [34]:
# Convert precipitation to numbers. errors='coerce' (previously misspelled as
# 'coerse', which pandas rejects) turns entries that cannot be parsed into
# NaN, and those are then treated as 0 inches.
# NOTE(review): presumably the non-numeric entries are trace markers; confirm
# that mapping them to 0 is the intended treatment.
weather['precipitation_inches'] = pd.to_numeric(
    weather['precipitation_inches'], errors='coerce'
).fillna(0)

# Keep only the calendar date (datetime.date objects) so it can be used as a
# join key against the trips' 'date' column.
weather['date'] = weather['date'].dt.date

# Cast zip_code to int64 so it compares equal to the integer zip codes
# assigned to the stations.
weather['zip_code'] = weather['zip_code'].astype('int64')

In [35]:
# Calendar date (datetime.date) of each trip's start, used as a join key.
viajes_merged["date"] = viajes_merged["start_date"].dt.date

In [36]:
# Inner join: each trip gets the weather row for its date and zip code.
# Trips with no matching weather row are dropped.
data = pd.merge(weather, viajes_merged, on=['date', 'zip_code'])

In [37]:
# Calendar features derived from the trip start timestamp.
data['year'] = data['start_date'].dt.year
data['month'] = data['start_date'].dt.month
data['day'] = data['start_date'].dt.day
# dayofweek runs Monday=0 .. Sunday=6, so the weekend is Saturday (5) and
# Sunday (6). Fix: the previous `> 5` test flagged only Sundays.
data['week_end'] = data['start_date'].dt.dayofweek >= 5

# Time-of-day features for the start of the trip.
data['start_hour'] = data['start_date'].dt.hour
data['start_minutes'] = data['start_date'].dt.minute

# Time-of-day features for the end of the trip.
data['end_hour'] = data['end_date'].dt.hour
data['end_minutes'] = data['end_date'].dt.minute

In [38]:
# The raw timestamps and the zip code join key are no longer needed.
data = data.drop(columns=["start_date", "end_date", "zip_code"])

In [39]:
# Mark the label-like columns as categorical, index the frame by trip id and
# drop the 'date' join key.
data = (
    data
    .astype({
        nombre: 'category'
        for nombre in ['events', 'subscription_type', 'end_station_id', 'start_station_id']
    })
    .set_index('id')
    .drop(columns=['date'])
)

## One-hot Encoding
Esto convierte cada feature categórica en varias columnas binarias. Para random forest no lo aplicamos: la celda siguiente se deja sin ejecutar.

In [275]:
# One-hot encode every categorical column. Each one is replaced by its dummy
# columns (prefixed with the original name, first level dropped), and the
# dummies are appended after the remaining columns.
columnas_cat = [
    nombre for nombre in data.columns if str(data[nombre].dtypes) == 'category'
]
dummies = [
    pd.get_dummies(data[nombre], prefix=nombre, drop_first=True)
    for nombre in columnas_cat
]
data = pd.concat([data.drop(columnas_cat, axis=1)] + dummies, axis=1)

In [40]:
# Show the dtype of every column to confirm the prepared frame's schema.
data.dtypes

max_temperature_f                  float64
mean_temperature_f                 float64
min_temperature_f                  float64
max_dew_point_f                    float64
mean_dew_point_f                   float64
min_dew_point_f                    float64
max_humidity                       float64
mean_humidity                      float64
min_humidity                       float64
max_sea_level_pressure_inches      float64
mean_sea_level_pressure_inches     float64
min_sea_level_pressure_inches      float64
max_visibility_miles               float64
mean_visibility_miles              float64
min_visibility_miles               float64
max_wind_Speed_mph                 float64
mean_wind_speed_mph                float64
max_gust_speed_mph                 float64
precipitation_inches               float64
cloud_cover                        float64
events                            category
wind_dir_degrees                   float64
start_station_id                  category
end_station

In [23]:
# Number of rows for each cloud_cover value, most frequent first.
data['cloud_cover'].value_counts()

5.0    92150
4.0    81598
1.0    68901
6.0    67516
3.0    61704
2.0    56084
0.0    55806
7.0    44340
8.0    16351
Name: cloud_cover, dtype: int64

In [41]:
# Write the prepared frame to disk; the index is written as the first column.
data.to_csv("../Data/test_rf.csv", index=True)

## Random Forest
Prueba de Random forest en el set de training con info categórica (sin one-hot y sin scaling!).