In [57]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import math

from sklearn import preprocessing
from sklearn.decomposition import TruncatedSVD

%matplotlib inline

In [58]:
# Load the stations data frame, exposing its `id` column as `station_id`
estaciones = pd.read_csv('../data/station.csv').rename(columns={'id': 'station_id'})

# Each city is linked to one zip code
zip_codes = {
    'Mountain View': 94041,
    'Redwood City': 94063,
    'San Francisco': 94107,
    'Palo Alto': 94301,
    'San Jose': 95113,
}

# New column holding the zip code of each station's city
# (a city that is missing from zip_codes raises KeyError)
estaciones['zip_code'] = estaciones['city'].apply(lambda ciudad: zip_codes[ciudad])

In [59]:
def outliers(viajes):
    """Drop the trips whose duration reaches the 99th percentile.

    Parameters
    ----------
    viajes : pandas.DataFrame
        Trips frame. When it has no ``duration`` column it is returned as is.

    Returns
    -------
    pandas.DataFrame
        Rows of ``viajes`` whose ``duration`` is strictly below the 0.99
        quantile of that column, or ``viajes`` itself when the column is absent.
    """
    # Nothing to filter when the frame carries no duration column.
    if 'duration' not in viajes.columns:
        return viajes

    umbral = viajes.duration.quantile(0.99)
    es_normal = viajes.duration < umbral
    return viajes.loc[es_normal, :]

In [60]:
def add_weather(viajes, soloLluvia = True):
    """Attach the daily weather of the start station's zip code to each trip.

    Parameters
    ----------
    viajes : pandas.DataFrame
        Trips frame; it must contain ``start_station_id`` and a datetime
        ``start_date`` column.
    soloLluvia : bool, default True
        When True, the ``Rain`` and ``Rain-Thunderstorm`` events are merged
        into ``'rain'`` and missing events are labelled ``'normal'``. When
        False the ``events`` column is left as read from the file.

    Returns
    -------
    pandas.DataFrame
        Weather columns joined with the trip columns on ``(date, zip_code)``,
        with ``zip_code`` dropped. Both merges use the default inner join, so
        trips without a matching station or weather row are not returned.

    Notes
    -----
    Reads ``../data/weather.csv`` on every call and relies on the module-level
    ``estaciones`` frame for the station -> zip code mapping.
    """
    # Temporary merge with the stations to obtain the zip_code of each trip.
    viajes_merged = pd.merge(viajes,
                             estaciones[['station_id', 'zip_code']],
                             left_on='start_station_id',
                             right_on='station_id').drop(['station_id'], axis=1)
    weather = pd.read_csv('../data/weather.csv',
                          parse_dates=["date"],
                          infer_datetime_format=True)

    # Events. Check whether results improve with or without merging both rain types.
    if soloLluvia:
        weather.loc[weather.events.isin(['Rain', 'Rain-Thunderstorm']), 'events'] = 'rain'
        weather.loc[weather.events.isnull(), 'events'] = 'normal'

    # Values that cannot be parsed as numbers become NaN ('coerce') and are
    # then counted as no precipitation. The result is assigned back to the
    # column instead of being filled in place through attribute access.
    weather['precipitation_inches'] = pd.to_numeric(weather['precipitation_inches'],
                                                    errors='coerce').fillna(0)

    # Both sides are joined on the calendar day, so the time part is dropped.
    weather['date'] = weather['date'].dt.date
    weather['zip_code'] = weather['zip_code'].astype('int64')
    viajes_merged['date'] = viajes_merged['start_date'].dt.date
    return weather.merge(viajes_merged, on=['date', 'zip_code']).drop("zip_code", axis=1)

In [67]:
def separarFecha(data):
    """Split ``start_date`` into separate calendar/time feature columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a datetime ``start_date`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``start_date`` and with the columns ``year``,
        ``month``, ``day``, ``week_end``, ``start_hour`` and ``start_minutes``
        appended, in that order.
    """
    inicio = data.start_date.dt
    # drop() returns a new frame, so the caller's frame keeps its columns.
    salida = data.drop("start_date", axis=1)
    salida['year'] = inicio.year
    salida['month'] = inicio.month
    salida['day'] = inicio.day
    # dayofweek counts Monday=0 ... Sunday=6, so the weekend is Saturday (5)
    # and Sunday (6).
    salida['week_end'] = inicio.dayofweek >= 5
    salida['start_hour'] = inicio.hour
    salida['start_minutes'] = inicio.minute
    return salida

In [62]:
def categorias(data):
    """Index the frame by ``id`` and one-hot encode its categorical columns.

    ``events``, ``subscription_type`` and ``start_station_id`` are cast to
    category; every category-typed column is then replaced by its dummy
    columns (prefixed with the column name, first level dropped).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing at least ``id``, ``date`` and the three columns
        listed above. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame indexed by ``id``, without ``date``: the non-categorical
        columns in their original order followed by the dummy columns.
        The dummy columns depend on the values present in ``data``.
    """
    columnas_categoricas = ['events', 'subscription_type', 'start_station_id']
    # astype with a mapping returns a new frame, so the caller's columns keep
    # their original dtypes.
    data = data.astype({col: 'category' for col in columnas_categoricas})
    data = data.set_index('id').drop(['date'], axis=1)

    # One-hot encoding: consider sklearn's alternative.
    # Encode every category-typed column (not only the three cast above) in a
    # single call instead of rebuilding the frame once per column.
    a_codificar = [col for col in data.columns if str(data[col].dtype) == 'category']
    return pd.get_dummies(data, columns=a_codificar, prefix=a_codificar, drop_first=True)

In [63]:
def preproc(archViajes):
    """Run the full preprocessing pipeline on a trips CSV and save the result.

    The file is read, the columns not used as features are dropped, and the
    frame goes through ``outliers``, ``add_weather``, ``separarFecha`` and
    ``categorias`` in that order.

    Parameters
    ----------
    archViajes : str
        Path of the trips CSV. The output is written to the same directory,
        with every ``"trip"`` in the file name replaced by ``"data"``
        (e.g. ``../data/trip_train.csv`` -> ``../data/data_train.csv``).

    Returns
    -------
    None
        The processed frame is only written to disk.
    """
    import os  # local import: only needed to split the output path

    data = pd.read_csv(archViajes,
                       parse_dates=["start_date", "end_date"],
                       infer_datetime_format=True)
    descartadas = ['end_station_name', 'end_station_id', 'end_date',
                   'start_station_name', 'bike_id', 'zip_code']
    data = data.drop(descartadas, axis=1)

    data = outliers(data)
    data = add_weather(data)
    data = separarFecha(data)
    data = categorias(data)

    # Rename only the file name: replacing on the whole path would also
    # rewrite any directory whose name contains "trip".
    carpeta, nombre = os.path.split(archViajes)
    data.to_csv(os.path.join(carpeta, nombre.replace("trip", "data")))

In [64]:
# Paths of the trip CSV files to preprocess
# (presumably the training and test splits, judging by the file names).
arch_train = "../data/trip_train.csv"
arch_test = "../data/trip_test.csv"

In [68]:
# Preprocess both trip files. preproc writes each result to the input path
# with "trip" replaced by "data" (../data/data_train.csv, ../data/data_test.csv).
# NOTE(review): each file is one-hot encoded on its own, so the dummy columns
# of the two outputs may differ when a category value appears in only one of
# the files -- verify before feeding both to the same model.
preproc(arch_train)
preproc(arch_test)