In [1]:
import os
import time

import matplotlib.pyplot as plt
import pandas as pd
import plotly
import plotly.express as px
import regex as re
import seaborn as sns

sns.set_theme(style="whitegrid")




In [2]:
# Build one DataFrame from every monthly-aggregate file in DATA_DIR.
# Year, day type and zone type are not read from the file contents here;
# they are derived from each file name.
DATA_DIR = "data/Q-MonthlyAggregate/"
COLUMNS = ['year', 'month', 'weekend', 'searchby', 'origin', 'destiny', 'mean_time_s', 'std_time_s']

# os.listdir returns entries in arbitrary order and can include hidden entries or
# sub-directories; filter and sort so the resulting row order is reproducible.
lista_datos = sorted(
    name for name in os.listdir(DATA_DIR)
    if not name.startswith(".") and os.path.isfile(os.path.join(DATA_DIR, name))
)

frames = []
for f in lista_datos:
    year = int(re.findall(r"\d{4}", f)[0])  # first 4-digit run in the file name

    # Fail loudly on an unrecognised name: a missing flag would become NaN after
    # the concat and then be silently cast to True by astype(bool).
    if "Weekdays" in f:
        weekend = 0  # weekday=0   weekend=1
    elif "Weekends" in f:
        weekend = 1
    else:
        raise ValueError(f"Cannot tell weekday/weekend from file name: {f!r}")

    if "barrios" in f:
        searchby = 0  # zip_code=1  neighborhood=0
    elif "codigos_postales" in f:
        searchby = 1
    else:
        raise ValueError(f"Cannot tell neighborhood/zip code from file name: {f!r}")

    data = (
        pd.read_csv(os.path.join(DATA_DIR, f))
        .drop(columns=["geometric_mean_travel_time", "geometric_standard_deviation_travel_time"])
        .rename(columns={'mean_travel_time'              : 'mean_time_s',
                         'standard_deviation_travel_time': 'std_time_s' ,
                         'sourceid'                      : 'origin'     ,
                         'dstid'                         : 'destiny'    })
        .assign(year=year, weekend=weekend, searchby=searchby)
    )
    frames.append(data)

# Concatenate once: growing the frame inside the loop re-copies all previous rows
# for every file. An empty directory still yields an empty frame with the schema.
if frames:
    df = pd.concat(frames, ignore_index=True)[COLUMNS]
else:
    df = pd.DataFrame(columns=COLUMNS)

# Single place where dtypes are fixed (a per-file cast whose result is discarded
# would have no effect).
df = df.astype({
    'year'       : int,
    'month'      : int,
    'weekend'    : bool,
    'searchby'   : bool,
    'origin'     : int,
    'destiny'    : int,
    'mean_time_s': int,
    'std_time_s' : int,
})

# Same quantities expressed in minutes.
df['mean_time_m'] = df['mean_time_s'] / 60
df['std_time_m' ] = df['std_time_s' ] / 60
df

Unnamed: 0,year,month,weekend,searchby,origin,destiny,mean_time_s,std_time_s,mean_time_m,std_time_m
0,2017,3,False,False,17,43,1107,446,18.450000,7.433333
1,2017,3,False,False,49,24,1127,519,18.783333,8.650000
2,2017,3,False,False,42,94,1028,677,17.133333,11.283333
3,2017,2,False,False,4,129,1298,468,21.633333,7.800000
4,2017,3,False,False,48,34,318,261,5.300000,4.350000
...,...,...,...,...,...,...,...,...,...,...
1694311,2020,3,True,True,240,2,1232,441,20.533333,7.350000
1694312,2020,2,True,True,2,14,673,237,11.216667,3.950000
1694313,2020,2,True,True,1,24,1248,310,20.800000,5.166667
1694314,2020,3,True,True,22,267,1164,199,19.400000,3.316667


In [3]:
# Print the index range, column dtypes and memory usage of the combined frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1694316 entries, 0 to 1694315
Data columns (total 10 columns):
 #   Column       Dtype  
---  ------       -----  
 0   year         int64  
 1   month        int64  
 2   weekend      bool   
 3   searchby     bool   
 4   origin       int64  
 5   destiny      int64  
 6   mean_time_s  int64  
 7   std_time_s   int64  
 8   mean_time_m  float64
 9   std_time_m   float64
dtypes: bool(2), float64(2), int64(6)
memory usage: 106.6 MB


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,year,month,origin,destiny,mean_time_s,std_time_s,mean_time_m,std_time_m
count,1694316.0,1694316.0,1694316.0,1694316.0,1694316.0,1694316.0,1694316.0,1694316.0
mean,2018.328,6.334392,77.44926,79.82954,873.5429,380.6274,14.55905,6.343789
std,0.9241161,3.55732,72.4465,73.66253,383.119,153.6555,6.385316,2.560925
min,2017.0,1.0,1.0,1.0,33.0,3.0,0.55,0.05
25%,2018.0,3.0,26.0,27.0,595.0,282.0,9.916667,4.7
50%,2018.0,6.0,55.0,57.0,864.0,357.0,14.4,5.95
75%,2019.0,9.0,101.0,104.0,1133.0,448.0,18.88333,7.466667
max,2020.0,12.0,299.0,299.0,3731.0,3346.0,62.18333,55.76667


In [None]:
# Save the combined frame as CSV.
# NOTE(review): with pandas' default index=True the row index is written as an
# extra unnamed first column — confirm that readers of this file expect it.
df.to_csv("data/All_joined.csv")

In [None]:
# Export to pickle so it can be read from streamlit
df.to_pickle("data/All_joined.pkl")

# Exploratory Data Analysis

In [None]:
# Three side-by-side bar charts of mean travel time (minutes) on a shared y-axis:
# by day type, by year, and by year split by day type.
# ci="sd" draws the standard deviation as the error bar.
f, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(16, 6), sharey=True)
f.suptitle("Average Travel Time between Origin and Destiny")

# Arguments common to the three panels.
bar_style = dict(data=df, y="mean_time_m", ci="sd", alpha=.6)

# Left panel: grouped by the weekend flag.
sns.barplot(x="weekend", palette="deep", ax=ax1, **bar_style)
ax1.set(xlabel="Weekend", ylabel="Mean Time (Minutes)")

# Middle panel: grouped by year (y label blanked; the axis is shared with the left panel).
sns.barplot(x="year", palette="dark", ax=ax2, **bar_style)
ax2.set(xlabel="Year", ylabel="")

# Right panel: grouped by year, one bar per weekend flag value.
sns.barplot(x="year", hue="weekend", palette="deep", ax=ax3, **bar_style)
ax3.set_xlabel("Year")

# Remove the top/right spines and also the bottom one.
sns.despine(bottom=True)


In [None]:
# Mean travel time (minutes) per calendar month over the whole frame;
# ci="sd" draws the standard deviation as the error bar.
g = sns.catplot(data=df, kind="bar", x="month", y="mean_time_m",
                ci="sd", palette="dark", alpha=.6,
                height=5, aspect=16/9)
# The FacetGrid styling methods return the grid, so they can be chained.
(g.despine(left=True)
  .set_axis_labels("Month", "Mean time [Minutes]")
  .set(title="Average Travel Time between Origin and Destiny"));

In [None]:
# Mean travel time (minutes) per calendar month, one bar per weekend flag value;
# ci="sd" draws the standard deviation as the error bar.
month_weekend_kwargs = dict(
    kind="bar", x="month", y="mean_time_m", hue="weekend",
    ci="sd", palette="dark", alpha=.6,
    height=5, aspect=16/9,
)
g = sns.catplot(data=df, **month_weekend_kwargs)
g.despine(left=True)
g.set_axis_labels("Month", "Mean time [Minutes]")
g.set(title="Average Travel Time between Origin and Destiny");

In [None]:
# Mean travel time (minutes) per calendar month, one bar per year;
# ci="sd" draws the standard deviation as the error bar.
g = sns.catplot(data=df, kind="bar", x="month", y="mean_time_m", hue="year",
                ci="sd", palette="dark", alpha=.6,
                height=5, aspect=16/9)
(g.despine(left=True)
  .set_axis_labels("Month", "Mean time [Minutes]")
  .set(title="Average Travel Time between Origin and Destiny"))
# Blank out the legend heading.
g.legend.set_title("");

In [None]:
# Mean travel time (minutes) per year, one bar per calendar month;
# ci="sd" draws the standard deviation as the error bar.
year_month_kwargs = dict(
    kind="bar", x="year", y="mean_time_m", hue="month",
    ci="sd", palette="dark", alpha=.6,
    height=5, aspect=16/9,
)
g = sns.catplot(data=df, **year_month_kwargs)
g.despine(left=True)
g.set_axis_labels("Year", "Mean time [Minutes]")
g.set(title="Average Travel Time between Origin and Destiny");

Como la visualización final se hará con Plotly y se mostrará en Streamlit, prototipo aquí los gráficos para luego pasarlos a Streamlit. Referencia: https://plotly.com/python/line-charts/

In [None]:
# Parameters for the charts:
# set by hand for now; the goal is to drive them from streamlit

ano       = 2019  # year
mes       = 1     # month
finde     = True  # weekend=True   weekday=False
buscarpor = True  # zip_code=True  neighborhood=False
origen    = 1     # origin zone id
destino   = 2     # destination zone id


In [None]:
# Keep the rows matching the selected day type, search mode and origin/destination pair.
# The year/month conditions (`ano`, `mes`) are currently disabled, so every
# year and month is kept.
mask = (
    df.weekend.eq(finde)
    & df.searchby.eq(buscarpor)
    & df.origin.eq(origen)
    & df.destiny.eq(destino)
)
newdf = df.loc[mask].reset_index(drop=True)

In [None]:
# Display the filtered rows.
newdf

In [None]:
# Average of mean_time_m (minutes) for each calendar month, over every row of df.
monthdf = df.groupby("month").agg({"mean_time_m": "mean"})

# `px` is plotly.express, imported in the notebook's first cell; without that
# import this cell raises NameError on a fresh kernel.
# NOTE(review): the groupby index is named "month", so the "index" key in
# `labels` presumably has no effect — confirm against the rendered axis title.
fig = px.bar(monthdf, y="mean_time_m",
             title='Average travel time from Origin to Destiny',
             labels={"index": "month"})
fig.show()

In [None]:
# Display the per-month averages.
monthdf
