In [1]:
import pandas as pd
from pathlib import Path

In [2]:
influx_path = Path("Outputs/influx.csv")
stations_path = Path("Outputs/stations.csv")
dates_path = Path('Outputs/dates.csv')

In [3]:
influx_df = pd.read_csv(influx_path)
stations_df = pd.read_csv(stations_path)
dates_df = pd.read_csv(dates_path)

In [4]:
influx_df = influx_df[['date', 'station', 'line', 'influx']]
influx_df.head()

Unnamed: 0,date,station,line,influx
0,2010-01-01,Zaragoza,Línea 1,20227
1,2010-01-01,Isabel la Católica,Línea 1,6487
2,2010-01-01,Moctezuma,Línea 1,10304
3,2010-01-01,Pino Suárez,Línea 1,8679
4,2010-01-01,Gómez Farías,Línea 1,19499


In [5]:
stations_df = stations_df[['station', 'district', 'ints', 'Lat', 'Long']]
stations_df.head()

Unnamed: 0,station,district,ints,Lat,Long
0,Zaragoza,Venustiano Carranza,1,19.41309,-99.082348
1,Isabel la Católica,Cuauhtémoc,1,19.427258,-99.137948
2,Moctezuma,Venustiano Carranza,1,19.427017,-99.109746
3,Pino Suárez,Cuauhtémoc,2,19.424618,-99.13296
4,Gómez Farías,Venustiano Carranza,1,19.416672,-99.090221


In [6]:
dates_df = dates_df[['date', 'day', 'month', 'year', 'weekday', 'day_off',
                     'season', 'summer_vacations']]
dates_df.head()

Unnamed: 0,date,day,month,year,weekday,day_off,season,summer_vacations
0,2010-01-01,1,Enero,2010,4,True,4,False
1,2010-01-02,2,Enero,2010,5,False,4,False
2,2010-01-03,3,Enero,2010,6,False,4,False
3,2010-01-04,4,Enero,2010,0,False,4,False
4,2010-01-05,5,Enero,2010,1,False,4,False


In [7]:
influx_df = pd.merge(influx_df, stations_df, on="station", how="left")

In [8]:
influx_df = pd.merge(influx_df, dates_df, on="date", how="left")

In [9]:
influx_df.tail()

Unnamed: 0,date,station,line,influx,district,ints,Lat,Long,day,month,year,weekday,day_off,season,summer_vacations
973240,2023-08-31,Romero Rubio,Línea B,14861,Venustiano Carranza,1,19.440864,-99.094576,31,Agosto,2023,3,False,2,False
973241,2023-08-31,Río de los Remedios,Línea B,15842,Gustavo A. Madero,1,19.490807,-99.046518,31,Agosto,2023,3,False,2,False
973242,2023-08-31,San Lázaro,Línea B,24833,Cuauhtémoc,2,19.432183,-99.113614,31,Agosto,2023,3,False,2,False
973243,2023-08-31,Tepito,Línea B,19832,Cuauhtémémoc,1,19.443016,-99.12417,31,Agosto,2023,3,False,2,False
973244,2023-08-31,Villa de Aragón,Línea B,15716,Gustavo A. Madero,1,19.461826,-99.061329,31,Agosto,2023,3,False,2,False


In [10]:
# location dumiies
district_dummies = pd.get_dummies(influx_df["district"])
line_dummies = pd.get_dummies(influx_df["line"])
station_dummies = pd.get_dummies(influx_df["station"])
# date dummies
month_dummies = pd.get_dummies(influx_df["month"])
year_dummies = pd.get_dummies(influx_df["year"])
day_dummies = pd.get_dummies(influx_df["day"])

In [26]:
# one hot encoding
dummies_influx_df = pd.concat([influx_df, line_dummies, district_dummies,
                              station_dummies, month_dummies, year_dummies, day_dummies], axis=1)

In [27]:
dummies_influx_df.tail()

Unnamed: 0,date,station,line,influx,district,ints,Lat,Long,day,month,...,22,23,24,25,26,27,28,29,30,31
973240,2023-08-31,Romero Rubio,Línea B,14861,Venustiano Carranza,1,19.440864,-99.094576,31,Agosto,...,0,0,0,0,0,0,0,0,0,1
973241,2023-08-31,Río de los Remedios,Línea B,15842,Gustavo A. Madero,1,19.490807,-99.046518,31,Agosto,...,0,0,0,0,0,0,0,0,0,1
973242,2023-08-31,San Lázaro,Línea B,24833,Cuauhtémoc,2,19.432183,-99.113614,31,Agosto,...,0,0,0,0,0,0,0,0,0,1
973243,2023-08-31,Tepito,Línea B,19832,Cuauhtémémoc,1,19.443016,-99.12417,31,Agosto,...,0,0,0,0,0,0,0,0,0,1
973244,2023-08-31,Villa de Aragón,Línea B,15716,Gustavo A. Madero,1,19.461826,-99.061329,31,Agosto,...,0,0,0,0,0,0,0,0,0,1


In [32]:
cols = dummies_influx_df.columns

columns = cols.to_list
columns

<bound method IndexOpsMixin.tolist of Index([    'date',  'station',     'line',   'influx', 'district',     'ints',
            'Lat',     'Long',      'day',    'month',
       ...
               22,         23,         24,         25,         26,         27,
               28,         29,         30,         31],
      dtype='object', length=263)>

In [33]:
influx_df.columns

Index(['date', 'station', 'line', 'influx', 'district', 'ints', 'Lat', 'Long',
       'day', 'month', 'year', 'weekday', 'day_off', 'season',
       'summer_vacations'],
      dtype='object')

In [34]:
# droping columns
dummies_influx_df = dummies_influx_df.drop(columns=['date', 'station', 'line', 'district', 'ints', 'Lat', 'Long',
                                                    'day', 'month', 'year', 'weekday', 'day_off', 'season', 'summer_vacations'])

In [38]:
dummies_influx_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 973245 entries, 0 to 973244
Columns: 249 entries, influx to 31
dtypes: int64(1), uint8(248)
memory usage: 245.0 MB


In [37]:
dummies_influx_df.describe()

Unnamed: 0,influx,Línea 1,Línea 12,Línea 2,Línea 3,Línea 4,Línea 5,Línea 6,Línea 7,Línea 8,...,22,23,24,25,26,27,28,29,30,31
count,973245.0,973245.0,973245.0,973245.0,973245.0,973245.0,973245.0,973245.0,973245.0,973245.0,...,973245.0,973245.0,973245.0,973245.0,973245.0,973245.0,973245.0,973245.0,973245.0,973245.0
mean,20173.656194,0.102564,0.102564,0.123077,0.107692,0.051282,0.066667,0.05641,0.071795,0.097436,...,0.032859,0.032859,0.032859,0.032859,0.032859,0.032859,0.032859,0.030655,0.030054,0.019235
std,20016.382395,0.303389,0.303389,0.328526,0.309992,0.220573,0.249444,0.230712,0.258148,0.296551,...,0.178268,0.178268,0.178268,0.178268,0.178268,0.178268,0.178268,0.172382,0.170736,0.137349
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,7492.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,14739.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,26168.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,263056.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [56]:
old_names = list(dummies_influx_df.columns)

In [57]:
dummies_influx_df[[11, 12]].head()

Unnamed: 0,11,12
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0


In [58]:
new_names = ['influx',
             'Línea 1',
             'Línea 12',
             'Línea 2',
             'Línea 3',
             'Línea 4',
             'Línea 5',
             'Línea 6',
             'Línea 7',
             'Línea 8',
             'Línea 9',
             'Línea A',
             'Línea B',
             'Azcapotzalco',
             'Benito Juárez',
             'Coyoacán',
             'Cuajimalpa',
             'Cuauhtémoc',
             'Cuauhtémémoc',
             'Ecatepec',
             'Gustavo A. Madero',
             'Iztacalco',
             'Iztapalapa',
             'La Paz (Estado de México)',
             'Miguel Hidalgo',
             'Nezahualcóyotl',
             'Tláhuac',
             'Venustiano Carranza',
             'Álvaro Obregón',
             'Acatitla',
             'Aculco',
             'Agrícola Oriental',
             'Allende',
             'Apatlaco',
             'Aquiles Serdán',
             'Aragón',
             'Atlalilco',
             'Auditorio',
             'Autobuses del Norte',
             'Balbuena',
             'Balderas',
             'Barranca del Muerto',
             'Bellas Artes',
             'Bondojito',
             'Bosque de Aragón',
             'Boulevard Puerto Aéreo',
             'Buenavista',
             'Calle 11',
             'Camarones',
             'Canal de San Juan',
             'Canal del Norte',
             'Candelaria',
             'Centro Médico',
             'Cerro de la Estrella',
             'Chabacano',
             'Chapultepec',
             'Chilpancingo',
             'Ciudad Azteca',
             'Ciudad Deportiva',
             'Colegio Militar',
             'Constitución de 1917',
             'Constituyentes',
             'Consulado',
             'Copilco',
             'Coyoacán',
             'Coyuya',
             'Cuatro Caminos',
             'Cuauhtémoc',
             'Cuitláhuac',
             'Culhuacán',
             'Deportivo 18 de Marzo',
             'Deportivo Oceanía',
             'División del Norte',
             'Doctores',
             'Ecatepec',
             'Eduardo Molina',
             'Eje Central',
             'El Rosario',
             'Ermita',
             'Escuadrón 201',
             'Etiopía/Plaza de la Transparencia',
             'Eugenia',
             'Ferrería/Arena Ciudad de México',
             'Fray Servando',
             'Garibaldi/Lagunilla',
             'General Anaya',
             'Guelatao',
             'Guerrero',
             'Gómez Farías',
             'Hangares',
             'Hidalgo',
             'Hospital 20 de Noviembre',
             'Hospital General',
             'Impulsora',
             'Indios Verdes',
             'Instituto del Petróleo',
             'Insurgentes',
             'Insurgentes Sur',
             'Isabel la Católica',
             'Iztacalco',
             'Iztapalapa',
             'Jamaica',
             'Juanacatlán',
             'Juárez',
             'La Paz',
             'La Raza',
             'La Viga',
             'La Villa/Basílica',
             'Lagunilla',
             'Lindavista',
             'Lomas Estrella',
             'Los Reyes',
             'Lázaro Cárdenas',
             'Martín Carrera',
             'Merced',
             'Mexicaltzingo',
             'Miguel Ángel de Quevedo',
             'Misterios',
             'Mixcoac',
             'Mixiuhca',
             'Moctezuma',
             'Morelos',
             'Múzquiz',
             'Nativitas',
             'Nezahualcóyotl',
             'Niños Héroes',
             'Nopalera',
             'Normal',
             'Norte 45',
             'Obrera',
             'Observatorio',
             'Oceanía',
             'Olivos',
             'Olímpica',
             'Panteones',
             'Pantitlán',
             'Parque de los Venados',
             'Patriotismo',
             'Periférico Oriente',
             'Peñón Viejo',
             'Pino Suárez',
             'Plaza Aragón',
             'Polanco',
             'Politécnico',
             'Popotla',
             'Portales',
             'Potrero',
             'Puebla',
             'Refinería',
             'Revolución',
             'Ricardo Flores Magón',
             'Romero Rubio',
             'Río de los Remedios',
             'Salto del Agua',
             'San Andrés Tomatlán',
             'San Antonio',
             'San Antonio Abad',
             'San Cosme',
             'San Joaquín',
             'San Juan de Letrán',
             'San Lázaro',
             'San Pedro de los Pinos',
             'Santa Anita',
             'Santa Marta',
             'Sevilla',
             'Tacuba',
             'Tacubaya',
             'Talismán',
             'Tasqueña',
             'Tepalcates',
             'Tepito',
             'Terminal Aérea',
             'Tezonco',
             'Tezozómoc',
             'Tlaltenco',
             'Tlatelolco',
             'Tláhuac',
             'UAM-Azcapotzalco',
             'UAM-I',
             'Universidad',
             'Valle Gómez',
             'Vallejo',
             'Velódromo',
             'Viaducto',
             'Villa de Aragón',
             'Villa de Cortés',
             'Viveros/Derechos Humanos',
             'Xola',
             'Zapata',
             'Zapotitlán',
             'Zaragoza',
             'Zócalo/Tenochtitlan',
             'Abril',
             'Agosto',
             'Diciembre',
             'Enero',
             'Febrero',
             'Julio',
             'Junio',
             'Marzo',
             'Mayo',
             'Noviembre',
             'Octubre',
             'Septiembre',
             'year_2010',
             'year_2011',
             'year_2012',
             'year_2013',
             'year_2014',
             'year_2015',
             'year_2016',
             'year_2017',
             'year_2018',
             'year_2019',
             'year_2020',
             'year_2021',
             'year_2022',
             'year_2023',
             '1st',
             '2nd',
             '3rd',
             '4th',
             '5th',
             '6th',
             '7th',
             '8th',
             '9th',
             '10th',
             '11th',
             '12th',
             '13th',
             '14th',
             '15th',
             '16th',
             '17th',
             '18th',
             '19th',
             '20th',
             '21th',
             '22th',
             '23th',
             '24th',
             '25th',
             '26th',
             '27th',
             '28th',
             '29th',
             '30th',
             '31th']

In [65]:
dict_names = [{key: value} for key, value in zip(old_names, new_names)]
dict_final = {}
for _dict in dict_names:
    dict_final.update(_dict)

In [68]:
print(dict_final)

{'influx': 'influx', 'Línea 1': 'Línea 1', 'Línea 12': 'Línea 12', 'Línea 2': 'Línea 2', 'Línea 3': 'Línea 3', 'Línea 4': 'Línea 4', 'Línea 5': 'Línea 5', 'Línea 6': 'Línea 6', 'Línea 7': 'Línea 7', 'Línea 8': 'Línea 8', 'Línea 9': 'Línea 9', 'Línea A': 'Línea A', 'Línea B': 'Línea B', 'Azcapotzalco': 'Azcapotzalco', 'Benito Juárez': 'Benito Juárez', 'Coyoacán': 'Coyoacán', 'Cuajimalpa': 'Cuajimalpa', 'Cuauhtémoc': 'Cuauhtémoc', 'Cuauhtémémoc': 'Cuauhtémémoc', 'Ecatepec': 'Ecatepec', 'Gustavo A. Madero': 'Gustavo A. Madero', 'Iztacalco': 'Iztacalco', 'Iztapalapa': 'Iztapalapa', 'La Paz (Estado de México)': 'La Paz (Estado de México)', 'Miguel Hidalgo': 'Miguel Hidalgo', 'Nezahualcóyotl': 'Nezahualcóyotl', 'Tláhuac': 'Tláhuac', 'Venustiano Carranza': 'Venustiano Carranza', 'Álvaro Obregón': 'Álvaro Obregón', 'Acatitla': 'Acatitla', 'Aculco': 'Aculco', 'Agrícola Oriental': 'Agrícola Oriental', 'Allende': 'Allende', 'Apatlaco': 'Apatlaco', 'Aquiles Serdán': 'Aquiles Serdán', 'Aragón': 'Ar

In [70]:
dummies_influx_df = dummies_influx_df.rename(
    columns=dict_final)

In [71]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_absolute_percentage_error

In [81]:
from sklearn.ensemble import RandomForestRegressor

In [74]:
mode_data_test2 = dummies_influx_df.copy()

In [75]:
# split data
y = mode_data_test2["influx"]
X = mode_data_test2.drop(columns="influx")
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y)

In [76]:
X_scaler = StandardScaler()
X_scaler = X_scaler.fit(X_train)
X_train_scaled = X_scaler.fit_transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [80]:
model_tree = DecisionTreeRegressor()
model_tree.fit(X_train_scaled, y_train)
predictions_lr_a = model_tree.predict(X_test_scaled)
lr_a_r2_score_test = r2_score(y_test, predictions_lr_a)
lr_a_mape_test = mean_absolute_percentage_error(y_test, predictions_lr_a)
print("R_sqr: {}, mean_absolut_percentage_error: {}".format(
    lr_a_r2_score_test, lr_a_mape_test))
predictions_model = model_tree.predict(X_test)
resultados = pd.DataFrame({'Real': y_test, 'Estimado': predictions_model})
resultados.tail()

R_sqr:0.8084732871030842, mean_absolut_percentage_error1.83788315119781e+17




Unnamed: 0,Real,Estimado
908205,3961,17216.0
312691,8976,17216.0
34479,9556,17216.0
641020,6312,17216.0
644586,8258,17216.0


In [82]:
model_tree = RandomForestRegressor()
model_tree.fit(X_train_scaled, y_train)
predictions_lr_a = model_tree.predict(X_test_scaled)
lr_a_r2_score_test = r2_score(y_test, predictions_lr_a)
lr_a_mape_test = mean_absolute_percentage_error(y_test, predictions_lr_a)
print("R_sqr: {}, mean_absolut_percentage_error: {}".format(
    lr_a_r2_score_test, lr_a_mape_test))
predictions_model = model_tree.predict(X_test)
resultados = pd.DataFrame({'Real': y_test, 'Estimado': predictions_model})