In [1]:
import pandas as pd
from pathlib import Path

In [2]:
influx_path = Path("Outputs/influx.csv")
stations_path = Path("Outputs/stations.csv")
dates_path = Path('Outputs/dates.csv')

In [3]:
influx_df = pd.read_csv(influx_path)
stations_df = pd.read_csv(stations_path)
dates_df = pd.read_csv(dates_path)

In [4]:
influx_df = influx_df[['date', 'station', 'line', 'influx']]
influx_df.head()

Unnamed: 0,date,station,line,influx
0,2010-01-01,Zaragoza,Línea 1,20227
1,2010-01-01,Isabel la Católica,Línea 1,6487
2,2010-01-01,Moctezuma,Línea 1,10304
3,2010-01-01,Pino Suárez,Línea 1,8679
4,2010-01-01,Gómez Farías,Línea 1,19499


In [5]:
stations_df = stations_df[['station', 'district', 'ints', 'Lat', 'Long']]
stations_df.head()

Unnamed: 0,station,district,ints,Lat,Long
0,Zaragoza,Venustiano Carranza,1,19.41309,-99.082348
1,Isabel la Católica,Cuauhtémoc,1,19.427258,-99.137948
2,Moctezuma,Venustiano Carranza,1,19.427017,-99.109746
3,Pino Suárez,Cuauhtémoc,2,19.424618,-99.13296
4,Gómez Farías,Venustiano Carranza,1,19.416672,-99.090221


In [6]:
dates_df = dates_df[['date', 'day', 'month', 'year', 'weekday', 'day_off',
                     'season', 'summer_vacations']]
dates_df.head()

Unnamed: 0,date,day,month,year,weekday,day_off,season,summer_vacations
0,2010-01-01,1,Enero,2010,4,True,winter,False
1,2010-01-02,2,Enero,2010,5,False,winter,False
2,2010-01-03,3,Enero,2010,6,False,winter,False
3,2010-01-04,4,Enero,2010,0,False,winter,False
4,2010-01-05,5,Enero,2010,1,False,winter,False


In [7]:
influx_df = pd.merge(influx_df, stations_df, on="station", how="left")

In [8]:
influx_df = pd.merge(influx_df, dates_df, on="date", how="left")

In [9]:
influx_df.tail()

Unnamed: 0,date,station,line,influx,district,ints,Lat,Long,day,month,year,weekday,day_off,season,summer_vacations
973240,2023-08-31,Romero Rubio,Línea B,14861,Venustiano Carranza,1,19.440864,-99.094576,31,Agosto,2023,3,False,summer,False
973241,2023-08-31,Río de los Remedios,Línea B,15842,Gustavo A. Madero,1,19.490807,-99.046518,31,Agosto,2023,3,False,summer,False
973242,2023-08-31,San Lázaro,Línea B,24833,Cuauhtémoc,2,19.432183,-99.113614,31,Agosto,2023,3,False,summer,False
973243,2023-08-31,Tepito,Línea B,19832,Cuauhtémémoc,1,19.443016,-99.12417,31,Agosto,2023,3,False,summer,False
973244,2023-08-31,Villa de Aragón,Línea B,15716,Gustavo A. Madero,1,19.461826,-99.061329,31,Agosto,2023,3,False,summer,False


In [10]:
influx_df = influx_df[influx_df['station'].isin(
    ['Zócalo/Tenochtitlan', 'Pino Suárez', 'Allende', 'Bellas Artes', 'Hidalgo'])].copy()

In [11]:
influx_df = influx_df.drop(influx_df[influx_df['influx'] == 0].index)

In [12]:
influx_df.reset_index()
influx_df.tail()

Unnamed: 0,date,station,line,influx,district,ints,Lat,Long,day,month,year,weekday,day_off,season,summer_vacations
973098,2023-08-31,Hidalgo,Línea 2,31751,Cuauhtémoc,2,19.437547,-99.147162,31,Agosto,2023,3,False,summer,False
973102,2023-08-31,Pino Suárez,Línea 2,43165,Cuauhtémoc,2,19.424618,-99.13296,31,Agosto,2023,3,False,summer,False
973113,2023-08-31,Zócalo/Tenochtitlan,Línea 2,47789,Cuauhtémoc,1,19.432603,-99.132223,31,Agosto,2023,3,False,summer,False
973123,2023-08-31,Hidalgo,Línea 3,11598,Cuauhtémoc,2,19.437547,-99.147162,31,Agosto,2023,3,False,summer,False
973186,2023-08-31,Bellas Artes,Línea 8,19623,Cuauhtémoc,2,19.436582,-99.14161,31,Agosto,2023,3,False,summer,False


In [13]:
influx_df['weekday'] = pd.to_datetime(
    influx_df['weekday'], format='%w').dt.day_name()

In [14]:
day_map = {1: '1st', 2: '2nd', 3: '3rd'}
for i in range(4, 32):
    day_map[i] = f'{i}th'

In [15]:
influx_df['day'] = influx_df['day'].replace(day_map)

In [16]:
influx_df['year'] = influx_df['year'].apply(lambda x: f'year_{x}')

In [17]:
influx_df.head()

Unnamed: 0,date,station,line,influx,district,ints,Lat,Long,day,month,year,weekday,day_off,season,summer_vacations
3,2010-01-01,Pino Suárez,Línea 1,8679,Cuauhtémoc,2,19.424618,-99.13296,1st,Enero,year_2010,Monday,True,winter,False
42,2010-01-01,Hidalgo,Línea 3,15418,Cuauhtémoc,2,19.437547,-99.147162,1st,Enero,year_2010,Monday,True,winter,False
95,2010-01-01,Bellas Artes,Línea 2,16042,Cuauhtémoc,2,19.436582,-99.14161,1st,Enero,year_2010,Monday,True,winter,False
114,2010-01-01,Bellas Artes,Línea 8,13118,Cuauhtémoc,2,19.436582,-99.14161,1st,Enero,year_2010,Monday,True,winter,False
117,2010-01-01,Allende,Línea 2,6365,Cuauhtémoc,1,19.435916,-99.136854,1st,Enero,year_2010,Monday,True,winter,False


In [18]:
# location dumiies
district_dummies = pd.get_dummies(influx_df["district"])
line_dummies = pd.get_dummies(influx_df["line"])
station_dummies = pd.get_dummies(influx_df["station"])
# date dummies
month_dummies = pd.get_dummies(influx_df["month"])
year_dummies = pd.get_dummies(influx_df["year"])
day_dummies = pd.get_dummies(influx_df["day"])
weekday_dummies = pd.get_dummies(influx_df["weekday"])
season_dummies = pd.get_dummies(influx_df['season'])

In [19]:
# one hot encoding
dummies_influx_df = pd.concat([influx_df, line_dummies, district_dummies,
                              station_dummies, month_dummies, year_dummies,
                              day_dummies, weekday_dummies, season_dummies], axis=1)

In [20]:
dummies_influx_df.tail()

Unnamed: 0,date,station,line,influx,district,ints,Lat,Long,day,month,...,5th,6th,7th,8th,9th,Monday,autumn,spring,summer,winter
973098,2023-08-31,Hidalgo,Línea 2,31751,Cuauhtémoc,2,19.437547,-99.147162,31th,Agosto,...,0,0,0,0,0,1,0,0,1,0
973102,2023-08-31,Pino Suárez,Línea 2,43165,Cuauhtémoc,2,19.424618,-99.13296,31th,Agosto,...,0,0,0,0,0,1,0,0,1,0
973113,2023-08-31,Zócalo/Tenochtitlan,Línea 2,47789,Cuauhtémoc,1,19.432603,-99.132223,31th,Agosto,...,0,0,0,0,0,1,0,0,1,0
973123,2023-08-31,Hidalgo,Línea 3,11598,Cuauhtémoc,2,19.437547,-99.147162,31th,Agosto,...,0,0,0,0,0,1,0,0,1,0
973186,2023-08-31,Bellas Artes,Línea 8,19623,Cuauhtémoc,2,19.436582,-99.14161,31th,Agosto,...,0,0,0,0,0,1,0,0,1,0


In [21]:
dummies_influx_df['date'] = pd.to_datetime(dummies_influx_df['date'])

In [22]:
influx_2010_2020 = dummies_influx_df[(dummies_influx_df['date'].dt.year >= 2010) & (
    dummies_influx_df['date'].dt.year <= 2019)]

In [23]:
# droping columns
influx_2010_2020 = influx_2010_2020.drop(columns=['date', 'station', 'line', 'district', 'ints', 'Lat', 'Long',
                                                  'day', 'month', 'year', 'weekday', 'day_off', 'season', 'summer_vacations'])

In [24]:
influx_2010_2020.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 29121 entries, 3 to 712080
Data columns (total 73 columns):
 #   Column               Non-Null Count  Dtype
---  ------               --------------  -----
 0   influx               29121 non-null  int64
 1   Línea 1              29121 non-null  uint8
 2   Línea 2              29121 non-null  uint8
 3   Línea 3              29121 non-null  uint8
 4   Línea 8              29121 non-null  uint8
 5   Cuauhtémoc           29121 non-null  uint8
 6   Allende              29121 non-null  uint8
 7   Bellas Artes         29121 non-null  uint8
 8   Hidalgo              29121 non-null  uint8
 9   Pino Suárez          29121 non-null  uint8
 10  Zócalo/Tenochtitlan  29121 non-null  uint8
 11  Abril                29121 non-null  uint8
 12  Agosto               29121 non-null  uint8
 13  Diciembre            29121 non-null  uint8
 14  Enero                29121 non-null  uint8
 15  Febrero              29121 non-null  uint8
 16  Julio                

In [25]:
influx_2010_2020.describe()

Unnamed: 0,influx,Línea 1,Línea 2,Línea 3,Línea 8,Cuauhtémoc,Allende,Bellas Artes,Hidalgo,Pino Suárez,...,5th,6th,7th,8th,9th,Monday,autumn,spring,summer,winter
count,29121.0,29121.0,29121.0,29121.0,29121.0,29121.0,29121.0,29121.0,29121.0,29121.0,...,29121.0,29121.0,29121.0,29121.0,29121.0,29121.0,29121.0,29121.0,29121.0,29121.0
mean,32818.48714,0.125133,0.624601,0.125133,0.125133,1.0,0.125133,0.250266,0.250266,0.250266,...,0.032932,0.032932,0.032932,0.032966,0.032966,1.0,0.246867,0.252739,0.252739,0.247656
std,17894.593827,0.330875,0.484234,0.330875,0.330875,0.0,0.330875,0.433174,0.433174,0.433174,...,0.178461,0.178461,0.178461,0.17855,0.17855,0.0,0.431196,0.43459,0.43459,0.431659
min,2.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
25%,21542.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
50%,28320.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
75%,36929.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
max,157039.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [26]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_absolute_percentage_error
from sklearn.ensemble import RandomForestRegressor

In [27]:
mode_data_test2 = influx_2010_2020.copy()

In [28]:
# split data
y = mode_data_test2["influx"]
X = mode_data_test2.drop(columns="influx")
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y)

In [29]:
X_scaler = StandardScaler()
X_scaler = X_scaler.fit(X_train)
X_train_scaled = X_scaler.fit_transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [30]:
model_tree = DecisionTreeRegressor()
model_tree.fit(X_train_scaled, y_train)
predictions_lr_a = model_tree.predict(X_test_scaled)
lr_a_r2_score_test = r2_score(y_test, predictions_lr_a)
lr_a_mape_test = mean_absolute_percentage_error(y_test, predictions_lr_a)
print("R_sqr: {}, mean_absolut_percentage_error: {}".format(
    lr_a_r2_score_test, lr_a_mape_test))
predictions_model = model_tree.predict(X_test)
resultados = pd.DataFrame({'Real': y_test, 'Estimado': predictions_model})

R_sqr: 0.6761240950589228, mean_absolut_percentage_error: 1.7756532046421178




In [31]:
resultados.tail(15)

Unnamed: 0,Real,Estimado
526035,27987,37085.0
170140,34740,28902.0
103279,52519,30718.0
342290,40613,39025.0
248766,24250,38112.0
87885,16625,28902.0
536376,73955,30718.0
478080,14274,39025.0
118671,54100,31751.0
315047,70433,30718.0


In [32]:
model_forest = RandomForestRegressor()
model_forest.fit(X_train_scaled, y_train)
predictions_lr_a = model_forest.predict(X_test_scaled)
lr_a_r2_score_test = r2_score(y_test, predictions_lr_a)
lr_a_mape_test = mean_absolute_percentage_error(y_test, predictions_lr_a)
print("R_sqr: {}, mean_absolut_percentage_error: {}".format(
    lr_a_r2_score_test, lr_a_mape_test))
predictions_model = model_forest.predict(X_test)
resultados = pd.DataFrame({'Real': y_test, 'Estimado': predictions_model})

R_sqr: 0.7815043078378431, mean_absolut_percentage_error: 1.7545031985428408




In [33]:
resultados.tail(15)

Unnamed: 0,Real,Estimado
526035,27987,30591.64
170140,34740,35668.98
103279,52519,28288.4
342290,40613,37748.3
248766,24250,36878.6
87885,16625,35668.98
536376,73955,28288.4
478080,14274,37748.3
118671,54100,28758.27
315047,70433,28288.4


In [34]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np

In [35]:
model = keras.Sequential(
    [
        layers.Dense(64, activation="relu",
                     input_shape=(X_train_scaled.shape[1],)),
        layers.Dense(1),
    ]
)

In [36]:
model.compile(loss="mean_absolute_percentage_error", optimizer="adam")

In [37]:
history = model.fit(X_train_scaled, y_train, epochs=100,
                    batch_size=32, validation_split=0.2)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [38]:
test_scores = model.evaluate(X_test_scaled, y_test, verbose=2)
print("Test loss:", test_scores)

228/228 - 0s - loss: 78.6435 - 192ms/epoch - 841us/step
Test loss: 78.64353942871094


In [39]:
predictions_model = model.predict(X_test_scaled)



In [48]:
data = {"Predicted": np.array(
    predictions_model).flatten(), "True": np.array(y_test).flatten()}

In [49]:
df = pd.DataFrame(data)

In [50]:
df.tail(15)

Unnamed: 0,Predicted,True
7266,26585.484375,27987
7267,25445.212891,34740
7268,21088.978516,52519
7269,24201.298828,40613
7270,25656.447266,24250
7271,17266.599609,16625
7272,21522.878906,73955
7273,2367.384277,14274
7274,4015.830322,54100
7275,19795.222656,70433
