# Predecir demanda

## Parámetros

In [1]:
from pathlib import Path

# Directory the input CSVs are read from and the generated CSVs are written to.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
BASE_DIR = Path('/Users/efraflores/Desktop/EF/Diplo/data/Bren')
# File names, relative to BASE_DIR, of the training and validation inputs.
TRAIN_NAME, VALID_NAME = 'train.csv', 'valid.csv'

## Funciones

In [None]:
import pandas as pd

def read_chunks(path_dir, file_name, chunk_size=1e5):
    """Read a CSV in chunks and aggregate it by ``id_salida`` and ISO week.

    Each row counts as one ``num_viajes``. Rows are grouped by ``id_salida``
    and by a ``year_week`` label ("YYYY_WW") derived from ``hora_salida``;
    ``num_viajes``, ``num_pasajeros`` and ``monto_viaje`` are summed per group.
    Aggregating chunk by chunk keeps only the partial sums in memory.

    Parameters
    ----------
    path_dir : pathlib.Path
        Directory that contains the file.
    file_name : str
        Name of the CSV file inside ``path_dir``. It must provide the columns
        ``id_salida``, ``hora_salida``, ``num_pasajeros`` and ``monto_viaje``.
    chunk_size : int or float, default 1e5
        Number of rows per chunk; converted to ``int`` before use.

    Returns
    -------
    pandas.DataFrame
        One row per (``id_salida``, ``year_week``), sorted by those keys, with
        the summed ``monto_viaje``, ``num_pasajeros`` and ``num_viajes``.
        An input without data rows yields an empty frame with those columns.

    Raises
    ------
    KeyError
        If a required column is missing.
    ValueError
        If ``hora_salida`` cannot be parsed as datetimes.
    """
    key_cols = ['id_salida', 'year_week']
    value_cols = ['num_viajes', 'num_pasajeros', 'monto_viaje']

    # The default chunk size is the float 1e5; read_csv wants an integer.
    chunks = pd.read_csv(path_dir.joinpath(file_name), chunksize=int(chunk_size))

    partials = []
    for chunk in chunks:
        if chunk.empty:
            continue
        # Parsing errors propagate on purpose: returning the raw chunk (as the
        # previous bare ``except`` did) would hand callers un-aggregated data.
        iso = pd.to_datetime(chunk['hora_salida']).dt.isocalendar()
        chunk = chunk.assign(
            num_viajes=1,
            # Pair the ISO week with the ISO year (not the calendar year) so
            # days such as 2018-12-31 land in "2019_01" instead of "2018_01".
            year_week=iso['year'].astype(str) + '_' + iso['week'].astype(str).str.zfill(2),
        )
        grouped = chunk.pivot_table(index=key_cols, values=value_cols, aggfunc='sum')
        partials.append(grouped.reset_index())

    if not partials:
        return pd.DataFrame(columns=key_cols + sorted(value_cols))

    # A single concat replaces the removed DataFrame.append and avoids
    # re-copying the accumulated frame on every chunk.
    data = pd.concat(partials, ignore_index=True)

    # The same key can appear in several chunks, so aggregate once more.
    final = data.pivot_table(index=key_cols, values=value_cols, aggfunc='sum')
    return final.reset_index().sort_values(key_cols)

## Importar

In [3]:
# Aggregate the training file per id_salida / year_week, then show its size
# and first rows.
df = read_chunks(path_dir=BASE_DIR, file_name=TRAIN_NAME)
print(df.shape)
df.head()

(4584, 5)


Unnamed: 0,id_salida,year_week,monto_viaje,num_pasajeros,num_viajes
0,2,2018_01,23.3,1,1
1,2,2018_02,54.45,2,2
2,2,2018_03,70.26,1,1
3,2,2018_05,85.07,2,2
4,2,2018_06,8.19,6,1


In [4]:
# Aggregate the validation file the same way, then show its size and first rows.
val = read_chunks(path_dir=BASE_DIR, file_name=VALID_NAME)
print(val.shape)
val.head()

(1888, 5)


Unnamed: 0,id_salida,year_week,monto_viaje,num_pasajeros,num_viajes
0,1,2018_01,15461.29,276,174
1,1,2018_02,10413.18,185,119
2,1,2018_03,10179.49,179,115
3,1,2018_04,10782.59,176,119
4,1,2018_05,9778.28,151,112


## Transformar

### Guardar agrupado

In [6]:
# Write both aggregated tables into BASE_DIR as "<name>_grouped.csv".
# Path.stem strips only the final extension, so a file name with extra dots
# keeps them and a name without an extension is kept whole (the previous
# split/join dropped inner dots and produced '' for extension-less names).
TRAIN_NAME_SUB = Path(TRAIN_NAME).stem
df.to_csv(BASE_DIR.joinpath(f"{TRAIN_NAME_SUB}_grouped.csv"), index=False)
VALID_NAME_SUB = Path(VALID_NAME).stem
val.to_csv(BASE_DIR.joinpath(f"{VALID_NAME_SUB}_grouped.csv"), index=False)

### Semana en columnas

In [10]:
# Reshape to one row per id_salida and one column per year_week holding the
# summed num_viajes; combinations without data are filled with 0.
# NOTE: this rebinds df, so the cell is meant to run once per kernel session.
df = df.pivot_table(
    values='num_viajes',
    index='id_salida',
    columns='year_week',
    aggfunc='sum',
    fill_value=0,
)
df.head()

year_week,2018_01,2018_02,2018_03,2018_04,2018_05,2018_06,2018_07,2018_08,2018_09,2018_10,...,2018_17,2018_18,2018_19,2018_20,2018_21,2018_22,2018_23,2018_24,2018_25,2018_26
id_salida,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,1.0,2.0,1.0,,2.0,1.0,1.0,2.0,2.0,6.0,...,,,,2.0,2.0,1.0,,4.0,,1.0
4,4274.0,4782.0,4550.0,4672.0,4936.0,4647.0,4742.0,4643.0,8426.0,9672.0,...,4448.0,4895.0,4428.0,4761.0,3787.0,4368.0,4102.0,3970.0,4450.0,3380.0
5,1.0,1.0,,,,1.0,2.0,,2.0,2.0,...,1.0,,,,,,1.0,1.0,,1.0
7,3772.0,3323.0,3281.0,3218.0,3321.0,3110.0,3741.0,3529.0,5871.0,7040.0,...,3130.0,3859.0,3236.0,3300.0,3237.0,3314.0,3331.0,3398.0,3073.0,2667.0
8,21.0,8.0,19.0,13.0,17.0,14.0,19.0,13.0,29.0,40.0,...,18.0,21.0,22.0,14.0,19.0,25.0,26.0,33.0,25.0,17.0


In [12]:
# Same reshape for the validation table: id_salida rows, year_week columns,
# summed num_viajes, 0 where a combination has no data.
# NOTE: this rebinds val, so the cell is meant to run once per kernel session.
val = val.pivot_table(
    values='num_viajes',
    index='id_salida',
    columns='year_week',
    aggfunc='sum',
    fill_value=0,
)
val.head()

year_week,2018_01,2018_02,2018_03,2018_04,2018_05,2018_06,2018_07,2018_08,2018_09,2018_10,...,2018_16,2018_17,2018_18,2018_19,2018_20,2018_21,2018_22,2018_23,2018_24,2018_25
id_salida,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,174,119,115,119,112,104,150,118,273,254,...,180,182,190,175,178,152,145,147,145,154
3,8,6,6,14,10,6,13,17,30,22,...,21,23,22,27,24,16,20,14,25,19
6,6,13,20,9,7,7,8,2,14,26,...,5,4,4,6,8,9,5,6,4,6
9,16,10,5,10,13,11,9,6,9,16,...,16,15,10,12,10,16,9,15,13,11
25,1793,2427,1776,1783,2028,2040,2165,1901,3680,5078,...,2486,2256,2471,2253,2528,2064,2176,2265,2449,2258


## Modelo

### Pre-procesamiento

In [14]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# Features are every weekly column except the last one; the target is the
# last weekly column.
X = df.iloc[:, :-1].copy()
y = df.iloc[:, -1].values

# Hold out 23% of the rows for testing; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.77,
    random_state=22,
)

# Scaler instance that is plugged into the modelling pipeline.
mm_x = MinMaxScaler()

### Arquitectura

In [18]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression

linear_reg = LinearRegression()

# Scale the weekly features to [0, 1] and fit an ordinary linear regression.
model_reg = Pipeline(steps=[('scaler', mm_x),
                            ('model', linear_reg)])
model_reg.fit(X_train, y_train)

# Pipeline.score delegates to the regressor's score (R^2 for LinearRegression).
print(f"Score: {model_reg.score(X_test, y_test):.2%}")
print(f"Training score: {model_reg.score(X_train, y_train):.2%}")
print("\nThese are the most influential variables:")

# Column 0 holds the feature name, column 1 its coefficient; sort descending
# by coefficient. Keyword arguments are required by sort_values in pandas >= 2.0.
coef = (
    pd.DataFrame(list(zip(X.columns, model_reg[1].coef_)))
    .sort_values(by=1, axis=0, ascending=False)
    .reset_index(drop=True)
)
# Show the five largest and five smallest coefficients; pd.concat replaces
# DataFrame.append, which was removed in pandas 2.0.
pd.concat([coef.head(), coef.tail()])

Score: 99.70%
Training score: 100.00%

These are the most influential variables:


Unnamed: 0,0,1
0,2018_05,49387.38823
1,2018_19,37752.958562
2,2018_25,34693.596771
3,2018_17,31310.853777
4,2018_11,27252.025754
20,2018_18,-29501.964114
21,2018_20,-29732.272387
22,2018_15,-42360.629071
23,2018_03,-51004.818882
24,2018_09,-55167.974307


## Predicción

In [30]:
import numpy as np
# Predict from the training feature columns only, so the feature set and order
# match what the pipeline was fitted on and the cell can be re-run after
# '2018_26' has already been added to val.
week_26_pred = model_reg.predict(val[X.columns])
# Floor predictions at 0 (1e10 is just a very large upper bound) and round to
# whole numbers.
val['2018_26'] = np.clip(week_26_pred, 0, 1e10).round()
val.head()

year_week,2018_01,2018_02,2018_03,2018_04,2018_05,2018_06,2018_07,2018_08,2018_09,2018_10,...,2018_17,2018_18,2018_19,2018_20,2018_21,2018_22,2018_23,2018_24,2018_25,2021_26
id_salida,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,174,119,115,119,112,104,150,118,273,254,...,182,190,175,178,152,145,147,145,154,91.0
3,8,6,6,14,10,6,13,17,30,22,...,23,22,27,24,16,20,14,25,19,19.0
6,6,13,20,9,7,7,8,2,14,26,...,4,4,6,8,9,5,6,4,6,0.0
9,16,10,5,10,13,11,9,6,9,16,...,15,10,12,10,16,9,15,13,11,19.0
25,1793,2427,1776,1783,2028,2040,2165,1901,3680,5078,...,2256,2471,2253,2528,2064,2176,2265,2449,2258,2000.0


In [39]:
# Export only the predicted '2018_26' column; the frame's index is written as
# the first CSV column because to_csv keeps the index by default.
final_path = BASE_DIR.joinpath(f"Bren_{VALID_NAME_SUB}_final.csv")
val[['2018_26']].to_csv(final_path)