In [1]:
# Importar el módulo pandas
import pandas as pd
import numpy as np

In [2]:
# Definir el path del archivo csv
path = "HistorialCanchasTenis.csv"

In [3]:
# Lee el archivo excel con pandas y guardar el resultado en un dataframe
df = pd.read_excel("Canchas 0.2.xlsx", sheet_name="Hoja1")

In [4]:
# Determine the number of datasets rows and columns
df.columns

Index(['reserva ', 'fecha', 'hora', 'servicio', 'mza-lote', 'casa',
       'Unnamed: 6'],
      dtype='object')

In [5]:
df.dtypes

reserva                int64
fecha         datetime64[ns]
hora                  object
servicio              object
mza-lote              object
casa                  object
Unnamed: 6            object
dtype: object

In [6]:
# Imprimir los 5 primeros líneas del archivo
df

Unnamed: 0,reserva,fecha,hora,servicio,mza-lote,casa,Unnamed: 6
0,8219516,2023-07-08,16:00:00,Tenis - Cancha 2,APROBADA,M-03,
1,8218607,2023-07-08,16:00:00,Tenis - Cancha 1,APROBADA,F-04,
2,8217992,2023-07-08,13:00:00,Tenis - Cancha 1,APROBADA,G-12,
3,8212103,2023-07-07,11:00:00,Tenis - Cancha 1,APROBADA,A-52,
4,8196830,2023-07-04,16:00:00,Tenis - Cancha 1,APROBADA,K-04,
...,...,...,...,...,...,...,...
349,1712511,2020-12-04,18:00 CON LUZ,Tenis - Cancha 1,APROBADA,PRUEBA,
350,1712222,2020-12-04,17:00:00,Tenis - Cancha 1,APROBADA,E-08,
351,1687608,2020-12-02,19:00 CON LUZ,Tenis - Cancha 2,APROBADA,H-07,
352,1687243,2020-12-02,18:00 CON LUZ,Tenis - Cancha 1,APROBADA,M-03,


<h4>Statistical Summary</h4>
df.describe()

In [7]:
df.describe(include = 'all')

  df.describe(include = 'all')


Unnamed: 0,reserva,fecha,hora,servicio,mza-lote,casa,Unnamed: 6
count,354.0,354,354,354,354,353,2
unique,,262,13,2,1,43,1
top,,2023-03-14 00:00:00,18:00 CON LUZ,Tenis - Cancha 1,APROBADA,G-07,18 a 24hs. (Únicamente LOS DOMINGOS)
freq,,14,63,309,354,57,2
first,,2020-11-26 00:00:00,,,,,
last,,2023-07-08 00:00:00,,,,,
mean,5279749.0,,,,,,
std,2188869.0,,,,,,
min,37.0,,,,,,
25%,3639766.0,,,,,,


<h2>Evaluating for Missing Data</h2>

The missing values are converted by default. We use the following functions to identify these missing values. There are two methods to detect missing data:
<ol>
    <li><b>.isnull()</b></li>
    <li><b>.notnull()</b></li>
</ol>
The output is a boolean value indicating whether the value that is passed into the argument is in fact missing data.


<h3 id="deal_missing_values">Deal with missing data</h3>
<b>How to deal with missing data?</b>

<ol>
    <li>Drop data<br>
        a. Drop the whole row<br>
        b. Drop the whole column
    </li>
    <li>Replace data<br>
        a. Replace it by mean<br>
        b. Replace it by frequency<br>
        c. Replace it based on other functions
    </li>
</ol>

In [8]:
# Busco nulos en el dadtaframe
missing_data = df.isnull()
missing_data.head(5)
# Si aparecen nulos son True si aparecen no nulos retorna False

Unnamed: 0,reserva,fecha,hora,servicio,mza-lote,casa,Unnamed: 6
0,False,False,False,False,False,False,True
1,False,False,False,False,False,False,True
2,False,False,False,False,False,False,True
3,False,False,False,False,False,False,True
4,False,False,False,False,False,False,True


Arriba se ve la columna 'Unnamed 6' con valores nulos.
Sin embargo más adelante (abajo) encuentro un nulo mas en 'casa' con otra forma de controlarlo.

In [9]:
# Find unnamed columns
unnamed_columns = df.columns[df.columns == '']
unnamed_columns
# Si todas las columnas tienen nombre retorna false Index([], dtype='object'

Index([], dtype='object')

In [10]:
# Find unnamed columns
unnamed_columns = df.columns[df.columns != '']
unnamed_columns
# Si todas las columnas tienen nombre retorna los nombres de las columnas, dtype='object'

Index(['reserva ', 'fecha', 'hora', 'servicio', 'mza-lote', 'casa',
       'Unnamed: 6'],
      dtype='object')

Entonces busco que columnas tiene elementos nulos (Cuales columnas)

In [11]:
# Check if there is null values in each column
df.isna().any(axis=0)

reserva       False
fecha         False
hora          False
servicio      False
mza-lote      False
casa           True
Unnamed: 6     True
dtype: bool

In [12]:
# Check if any column has null values and sum
df.isna().sum(axis=0)

reserva         0
fecha           0
hora            0
servicio        0
mza-lote        0
casa            1
Unnamed: 6    352
dtype: int64

In [13]:
# Check if any column has null values and sum
df.isna().count(axis=0)

reserva       354
fecha         354
hora          354
servicio      354
mza-lote      354
casa          354
Unnamed: 6    354
dtype: int64

In [14]:
# Check the columns names
df.columns

Index(['reserva ', 'fecha', 'hora', 'servicio', 'mza-lote', 'casa',
       'Unnamed: 6'],
      dtype='object')

In [15]:
# Check the name of the column before delete it
print(df['Unnamed: 6'].head())

0    NaN
1    NaN
2    NaN
3    NaN
4    NaN
Name: Unnamed: 6, dtype: object


In [16]:
# Drop the Empty column
df = df.drop('Unnamed: 6', axis=1)

In [17]:
# Check the columns that still exist
df.columns

Index(['reserva ', 'fecha', 'hora', 'servicio', 'mza-lote', 'casa'], dtype='object')

# Find columns with only NaN values # Ahora chequeo si hay alguna columna completa con nulos
columns_with_nulls = df.columns[df.isna().all()]
print(columns_with_nulls)

In [18]:
# Check if any column has null values and sum
df.isna().sum(axis=0)

reserva     0
fecha       0
hora        0
servicio    0
mza-lote    0
casa        1
dtype: int64

Arriba se ve que casa niene un elemento nulo.

In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 354 entries, 0 to 353
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   reserva   354 non-null    int64         
 1   fecha     354 non-null    datetime64[ns]
 2   hora      354 non-null    object        
 3   servicio  354 non-null    object        
 4   mza-lote  354 non-null    object        
 5   casa      353 non-null    object        
dtypes: datetime64[ns](1), int64(1), object(4)
memory usage: 16.7+ KB


In [20]:
df['casa'].describe()

count      353
unique      43
top       G-07
freq        57
Name: casa, dtype: object

Arriba también se ve que hay un elemento nulo podrque 353 es uno menos de 354 (el total del filas)

Looking for NaN element in 'casa' using boolean mask

In [21]:
# Filter obtaining null element row of the dataframe
df_row_with_null =df[df['casa'].isnull()]

In [22]:
# Show the row with null in casa
df_row_with_null

Unnamed: 0,reserva,fecha,hora,servicio,mza-lote,casa
306,2753105,2021-04-05,19:00 CON LUZ,Tenis - Cancha 1,APROBADA,


Se ve que la fila con casa NaN es la 306

In [23]:
df_without_null =df[df['casa'].notnull()]

In [24]:
df_without_null

Unnamed: 0,reserva,fecha,hora,servicio,mza-lote,casa
0,8219516,2023-07-08,16:00:00,Tenis - Cancha 2,APROBADA,M-03
1,8218607,2023-07-08,16:00:00,Tenis - Cancha 1,APROBADA,F-04
2,8217992,2023-07-08,13:00:00,Tenis - Cancha 1,APROBADA,G-12
3,8212103,2023-07-07,11:00:00,Tenis - Cancha 1,APROBADA,A-52
4,8196830,2023-07-04,16:00:00,Tenis - Cancha 1,APROBADA,K-04
...,...,...,...,...,...,...
349,1712511,2020-12-04,18:00 CON LUZ,Tenis - Cancha 1,APROBADA,PRUEBA
350,1712222,2020-12-04,17:00:00,Tenis - Cancha 1,APROBADA,E-08
351,1687608,2020-12-02,19:00 CON LUZ,Tenis - Cancha 2,APROBADA,H-07
352,1687243,2020-12-02,18:00 CON LUZ,Tenis - Cancha 1,APROBADA,M-03


In [25]:
# Check if any column has null values and sum
df_without_null.isna().sum(axis=0)

reserva     0
fecha       0
hora        0
servicio    0
mza-lote    0
casa        0
dtype: int64

In [26]:
df= df_without_null # set df with df_withot_null, becaouse tpyping df is short and easy to work
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 353 entries, 0 to 353
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   reserva   353 non-null    int64         
 1   fecha     353 non-null    datetime64[ns]
 2   hora      353 non-null    object        
 3   servicio  353 non-null    object        
 4   mza-lote  353 non-null    object        
 5   casa      353 non-null    object        
dtypes: datetime64[ns](1), int64(1), object(4)
memory usage: 19.3+ KB


<b>Missing Data Dealing Successfull</b>

# Correct data format

<p>Next step in data cleaning is checking and making sure that all data is in the correct format (int, float, text or other).</p>
​
I use:
<ol>
    <li><b>.astype()</b></li>
    <li><b>.dtype()</b></li>
</ol>

In [27]:
# Check data types
df.dtypes

reserva              int64
fecha       datetime64[ns]
hora                object
servicio            object
mza-lote            object
casa                object
dtype: object

<p>As we can see above, some columns are not of the correct data type.</p>
<p>Numerical variables should have type 'float' or 'int', and variables with strings such as categories should have type 'object'.</p>
<p> 'reserva', 'fecha' are ok.</p>
<P>I will convert data types into a proper format for each column using the "astype()" method.</p> 

# Date and time correction is needed
Es necesario corregir fecha y hora de alquiler de las canchas

Fecha y hora están separadas. Fecha está en formato correcto pero sin la hora. Hora está con formato incorrecto. Hora tiene ademas la indicación de CON LUZ.

Es necesario juntar quitar "CON LUZ" de hora, juntar fecha y hora y dejarlas en formato datetime64[ns]

In [28]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 353 entries, 0 to 353
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   reserva   353 non-null    int64         
 1   fecha     353 non-null    datetime64[ns]
 2   hora      353 non-null    object        
 3   servicio  353 non-null    object        
 4   mza-lote  353 non-null    object        
 5   casa      353 non-null    object        
dtypes: datetime64[ns](1), int64(1), object(4)
memory usage: 19.3+ KB


In [29]:
# Re-Check 'fecha' format
df['fecha']

0     2023-07-08
1     2023-07-08
2     2023-07-08
3     2023-07-07
4     2023-07-04
         ...    
349   2020-12-04
350   2020-12-04
351   2020-12-02
352   2020-12-02
353   2020-11-26
Name: fecha, Length: 353, dtype: datetime64[ns]

With df['fecha']:  Can see date without  hours. Se pueden ver las fechas sin las horas.

Convierto la columna de fechas en string para luego juntarla con la columna de horas.

In [30]:
# Convert 'fecha' in str format
df['fecha'].astype(str)

0      2023-07-08
1      2023-07-08
2      2023-07-08
3      2023-07-07
4      2023-07-04
          ...    
349    2020-12-04
350    2020-12-04
351    2020-12-02
352    2020-12-02
353    2020-11-26
Name: fecha, Length: 353, dtype: object

In [31]:
# Re-Check 'hora' type
df['hora'].astype(str)

0           16:00:00
1           16:00:00
2           13:00:00
3           11:00:00
4           16:00:00
           ...      
349    18:00 CON LUZ
350         17:00:00
351    19:00 CON LUZ
352    18:00 CON LUZ
353         16:00:00
Name: hora, Length: 353, dtype: object

Observe "CON LUZ". Extraction and spliting is needed.

In [32]:
split_hora = df['hora'].astype(str)

In [33]:
splited_hora = split_hora.str.split(" ", expand= True)

In [34]:
splited_hora

Unnamed: 0,0,1,2
0,16:00:00,,
1,16:00:00,,
2,13:00:00,,
3,11:00:00,,
4,16:00:00,,
...,...,...,...
349,18:00,CON,LUZ
350,17:00:00,,
351,19:00,CON,LUZ
352,18:00,CON,LUZ


In [35]:
shora = splited_hora[0].astype("string")

In [36]:
shora

0      16:00:00
1      16:00:00
2      13:00:00
3      11:00:00
4      16:00:00
         ...   
349       18:00
350    17:00:00
351       19:00
352       18:00
353    16:00:00
Name: 0, Length: 353, dtype: string

In [37]:
df['fecha']

0     2023-07-08
1     2023-07-08
2     2023-07-08
3     2023-07-07
4     2023-07-04
         ...    
349   2020-12-04
350   2020-12-04
351   2020-12-02
352   2020-12-02
353   2020-11-26
Name: fecha, Length: 353, dtype: datetime64[ns]

In [38]:
sfecha = df['fecha'].astype("string")

In [39]:
sfecha

0      2023-07-08
1      2023-07-08
2      2023-07-08
3      2023-07-07
4      2023-07-04
          ...    
349    2020-12-04
350    2020-12-04
351    2020-12-02
352    2020-12-02
353    2020-11-26
Name: fecha, Length: 353, dtype: string

In [40]:
s_fechayhora = pd.DataFrame(sfecha + " " + shora)

In [41]:
s_fechayhora

Unnamed: 0,0
0,2023-07-08 16:00:00
1,2023-07-08 16:00:00
2,2023-07-08 13:00:00
3,2023-07-07 11:00:00
4,2023-07-04 16:00:00
...,...
349,2020-12-04 18:00
350,2020-12-04 17:00:00
351,2020-12-02 19:00
352,2020-12-02 18:00


In [42]:
fechayhora = pd.to_datetime(s_fechayhora[0])

In [43]:
fechayhora

0     2023-07-08 16:00:00
1     2023-07-08 16:00:00
2     2023-07-08 13:00:00
3     2023-07-07 11:00:00
4     2023-07-04 16:00:00
              ...        
349   2020-12-04 18:00:00
350   2020-12-04 17:00:00
351   2020-12-02 19:00:00
352   2020-12-02 18:00:00
353   2020-11-26 16:00:00
Name: 0, Length: 353, dtype: datetime64[ns]

Date and Time Ready. Time to update the Dataframe under Analysis.

Fecha y Hora listas para actualizar en el Dataframe que se está analizando.

In [44]:
df['fecha'] = fechayhora

In [45]:
# Ubdate the fecha column with date and time
df['fecha']

0     2023-07-08 16:00:00
1     2023-07-08 16:00:00
2     2023-07-08 13:00:00
3     2023-07-07 11:00:00
4     2023-07-04 16:00:00
              ...        
349   2020-12-04 18:00:00
350   2020-12-04 17:00:00
351   2020-12-02 19:00:00
352   2020-12-02 18:00:00
353   2020-11-26 16:00:00
Name: fecha, Length: 353, dtype: datetime64[ns]

In [46]:
# Rename the 'fecha' column with 'fechayhora' new name
# Warning!! df['fecha'] = df['fecha'].rename('fechayhora', inplace = True) does not work.
# Force renaming with df.rename method and inplace = True)
df.rename(columns = {'fecha' : 'fechayhora'}, inplace = True)

In [47]:
df.tail()

Unnamed: 0,reserva,fechayhora,hora,servicio,mza-lote,casa
349,1712511,2020-12-04 18:00:00,18:00 CON LUZ,Tenis - Cancha 1,APROBADA,PRUEBA
350,1712222,2020-12-04 17:00:00,17:00:00,Tenis - Cancha 1,APROBADA,E-08
351,1687608,2020-12-02 19:00:00,19:00 CON LUZ,Tenis - Cancha 2,APROBADA,H-07
352,1687243,2020-12-02 18:00:00,18:00 CON LUZ,Tenis - Cancha 1,APROBADA,M-03
353,1652644,2020-11-26 16:00:00,16:00:00,Tenis - Cancha 1,APROBADA,N-09


<b>Date and Time Correction Successfull!</b> 
<b> Next "CON LUZ" binning process.</b> 

In [48]:
sluz = splited_hora[2].astype("string")

In [49]:
sluz

0      <NA>
1      <NA>
2      <NA>
3      <NA>
4      <NA>
       ... 
349     LUZ
350    <NA>
351     LUZ
352     LUZ
353    <NA>
Name: 2, Length: 353, dtype: string

In [50]:
sluz.fillna("SOL", inplace = True)

In [51]:
sluz

0      SOL
1      SOL
2      SOL
3      SOL
4      SOL
      ... 
349    LUZ
350    SOL
351    LUZ
352    LUZ
353    SOL
Name: 2, Length: 353, dtype: string

In [52]:
cluz = pd.Categorical(sluz, categories=["SOL", "LUZ"], ordered=False)
cluz

['SOL', 'SOL', 'SOL', 'SOL', 'SOL', ..., 'LUZ', 'SOL', 'LUZ', 'LUZ', 'SOL']
Length: 353
Categories (2, object): ['SOL', 'LUZ']

In [53]:
df['iluminacion'] = cluz

In [54]:
df

Unnamed: 0,reserva,fechayhora,hora,servicio,mza-lote,casa,iluminacion
0,8219516,2023-07-08 16:00:00,16:00:00,Tenis - Cancha 2,APROBADA,M-03,SOL
1,8218607,2023-07-08 16:00:00,16:00:00,Tenis - Cancha 1,APROBADA,F-04,SOL
2,8217992,2023-07-08 13:00:00,13:00:00,Tenis - Cancha 1,APROBADA,G-12,SOL
3,8212103,2023-07-07 11:00:00,11:00:00,Tenis - Cancha 1,APROBADA,A-52,SOL
4,8196830,2023-07-04 16:00:00,16:00:00,Tenis - Cancha 1,APROBADA,K-04,SOL
...,...,...,...,...,...,...,...
349,1712511,2020-12-04 18:00:00,18:00 CON LUZ,Tenis - Cancha 1,APROBADA,PRUEBA,LUZ
350,1712222,2020-12-04 17:00:00,17:00:00,Tenis - Cancha 1,APROBADA,E-08,SOL
351,1687608,2020-12-02 19:00:00,19:00 CON LUZ,Tenis - Cancha 2,APROBADA,H-07,LUZ
352,1687243,2020-12-02 18:00:00,18:00 CON LUZ,Tenis - Cancha 1,APROBADA,M-03,LUZ


In [55]:
df = df.drop(columns=['hora'])

In [56]:
df

Unnamed: 0,reserva,fechayhora,servicio,mza-lote,casa,iluminacion
0,8219516,2023-07-08 16:00:00,Tenis - Cancha 2,APROBADA,M-03,SOL
1,8218607,2023-07-08 16:00:00,Tenis - Cancha 1,APROBADA,F-04,SOL
2,8217992,2023-07-08 13:00:00,Tenis - Cancha 1,APROBADA,G-12,SOL
3,8212103,2023-07-07 11:00:00,Tenis - Cancha 1,APROBADA,A-52,SOL
4,8196830,2023-07-04 16:00:00,Tenis - Cancha 1,APROBADA,K-04,SOL
...,...,...,...,...,...,...
349,1712511,2020-12-04 18:00:00,Tenis - Cancha 1,APROBADA,PRUEBA,LUZ
350,1712222,2020-12-04 17:00:00,Tenis - Cancha 1,APROBADA,E-08,SOL
351,1687608,2020-12-02 19:00:00,Tenis - Cancha 2,APROBADA,H-07,LUZ
352,1687243,2020-12-02 18:00:00,Tenis - Cancha 1,APROBADA,M-03,LUZ


In [57]:
splited_servicio = df['servicio'].str.split(" ", expand= True)

In [58]:
splited_servicio

Unnamed: 0,0,1,2,3
0,Tenis,-,Cancha,2
1,Tenis,-,Cancha,1
2,Tenis,-,Cancha,1
3,Tenis,-,Cancha,1
4,Tenis,-,Cancha,1
...,...,...,...,...
349,Tenis,-,Cancha,1
350,Tenis,-,Cancha,1
351,Tenis,-,Cancha,2
352,Tenis,-,Cancha,1


In [59]:
scancha = splited_servicio[2] + " " + splited_servicio [3]

In [60]:
scancha

0      Cancha 2
1      Cancha 1
2      Cancha 1
3      Cancha 1
4      Cancha 1
         ...   
349    Cancha 1
350    Cancha 1
351    Cancha 2
352    Cancha 1
353    Cancha 1
Length: 353, dtype: object

In [61]:
ccancha = pd.Categorical(scancha, categories=["Cancha 1", "Cancha 2"], ordered=False)

In [62]:
ccancha

['Cancha 2', 'Cancha 1', 'Cancha 1', 'Cancha 1', 'Cancha 1', ..., 'Cancha 1', 'Cancha 1', 'Cancha 2', 'Cancha 1', 'Cancha 1']
Length: 353
Categories (2, object): ['Cancha 1', 'Cancha 2']

In [63]:
df['servicio'] = ccancha

In [64]:
df['servicio']

0      Cancha 2
1      Cancha 1
2      Cancha 1
3      Cancha 1
4      Cancha 1
         ...   
349    Cancha 1
350    Cancha 1
351    Cancha 2
352    Cancha 1
353    Cancha 1
Name: servicio, Length: 353, dtype: category
Categories (2, object): ['Cancha 1', 'Cancha 2']

In [65]:
df

Unnamed: 0,reserva,fechayhora,servicio,mza-lote,casa,iluminacion
0,8219516,2023-07-08 16:00:00,Cancha 2,APROBADA,M-03,SOL
1,8218607,2023-07-08 16:00:00,Cancha 1,APROBADA,F-04,SOL
2,8217992,2023-07-08 13:00:00,Cancha 1,APROBADA,G-12,SOL
3,8212103,2023-07-07 11:00:00,Cancha 1,APROBADA,A-52,SOL
4,8196830,2023-07-04 16:00:00,Cancha 1,APROBADA,K-04,SOL
...,...,...,...,...,...,...
349,1712511,2020-12-04 18:00:00,Cancha 1,APROBADA,PRUEBA,LUZ
350,1712222,2020-12-04 17:00:00,Cancha 1,APROBADA,E-08,SOL
351,1687608,2020-12-02 19:00:00,Cancha 2,APROBADA,H-07,LUZ
352,1687243,2020-12-02 18:00:00,Cancha 1,APROBADA,M-03,LUZ


In [66]:
saprobada = df['mza-lote']

In [67]:
caprobada = pd.Categorical(saprobada, categories=["APROBADA", "NO APROBADA"], ordered=False)

In [68]:
df['mza-lote'] = caprobada

In [69]:
df

Unnamed: 0,reserva,fechayhora,servicio,mza-lote,casa,iluminacion
0,8219516,2023-07-08 16:00:00,Cancha 2,APROBADA,M-03,SOL
1,8218607,2023-07-08 16:00:00,Cancha 1,APROBADA,F-04,SOL
2,8217992,2023-07-08 13:00:00,Cancha 1,APROBADA,G-12,SOL
3,8212103,2023-07-07 11:00:00,Cancha 1,APROBADA,A-52,SOL
4,8196830,2023-07-04 16:00:00,Cancha 1,APROBADA,K-04,SOL
...,...,...,...,...,...,...
349,1712511,2020-12-04 18:00:00,Cancha 1,APROBADA,PRUEBA,LUZ
350,1712222,2020-12-04 17:00:00,Cancha 1,APROBADA,E-08,SOL
351,1687608,2020-12-02 19:00:00,Cancha 2,APROBADA,H-07,LUZ
352,1687243,2020-12-02 18:00:00,Cancha 1,APROBADA,M-03,LUZ


In [70]:
# Reordering coluns: Inster ilumina column to replace inuminacion column between fechayhora and servicio
# df.reindex(columns = ['reserva','fechayhora','iluminacion','mza-lote','casa']) # Ojo: esto no anda bien reserva queda toda en NaN!

df.insert(2, 'ilumina', df['iluminacion'], True)

In [71]:
df

Unnamed: 0,reserva,fechayhora,ilumina,servicio,mza-lote,casa,iluminacion
0,8219516,2023-07-08 16:00:00,SOL,Cancha 2,APROBADA,M-03,SOL
1,8218607,2023-07-08 16:00:00,SOL,Cancha 1,APROBADA,F-04,SOL
2,8217992,2023-07-08 13:00:00,SOL,Cancha 1,APROBADA,G-12,SOL
3,8212103,2023-07-07 11:00:00,SOL,Cancha 1,APROBADA,A-52,SOL
4,8196830,2023-07-04 16:00:00,SOL,Cancha 1,APROBADA,K-04,SOL
...,...,...,...,...,...,...,...
349,1712511,2020-12-04 18:00:00,LUZ,Cancha 1,APROBADA,PRUEBA,LUZ
350,1712222,2020-12-04 17:00:00,SOL,Cancha 1,APROBADA,E-08,SOL
351,1687608,2020-12-02 19:00:00,LUZ,Cancha 2,APROBADA,H-07,LUZ
352,1687243,2020-12-02 18:00:00,LUZ,Cancha 1,APROBADA,M-03,LUZ


In [72]:
# Reordering columns: Drop the Iluminacion column
df = df.drop('iluminacion', axis=1)

In [73]:
df

Unnamed: 0,reserva,fechayhora,ilumina,servicio,mza-lote,casa
0,8219516,2023-07-08 16:00:00,SOL,Cancha 2,APROBADA,M-03
1,8218607,2023-07-08 16:00:00,SOL,Cancha 1,APROBADA,F-04
2,8217992,2023-07-08 13:00:00,SOL,Cancha 1,APROBADA,G-12
3,8212103,2023-07-07 11:00:00,SOL,Cancha 1,APROBADA,A-52
4,8196830,2023-07-04 16:00:00,SOL,Cancha 1,APROBADA,K-04
...,...,...,...,...,...,...
349,1712511,2020-12-04 18:00:00,LUZ,Cancha 1,APROBADA,PRUEBA
350,1712222,2020-12-04 17:00:00,SOL,Cancha 1,APROBADA,E-08
351,1687608,2020-12-02 19:00:00,LUZ,Cancha 2,APROBADA,H-07
352,1687243,2020-12-02 18:00:00,LUZ,Cancha 1,APROBADA,M-03


<h4>Reorder df by date and time</h4>

In [74]:
df.sort_values(by = 'fechayhora', inplace = True)

In [75]:
df

Unnamed: 0,reserva,fechayhora,ilumina,servicio,mza-lote,casa
353,1652644,2020-11-26 16:00:00,SOL,Cancha 1,APROBADA,N-09
352,1687243,2020-12-02 18:00:00,LUZ,Cancha 1,APROBADA,M-03
351,1687608,2020-12-02 19:00:00,LUZ,Cancha 2,APROBADA,H-07
350,1712222,2020-12-04 17:00:00,SOL,Cancha 1,APROBADA,E-08
349,1712511,2020-12-04 18:00:00,LUZ,Cancha 1,APROBADA,PRUEBA
...,...,...,...,...,...,...
4,8196830,2023-07-04 16:00:00,SOL,Cancha 1,APROBADA,K-04
3,8212103,2023-07-07 11:00:00,SOL,Cancha 1,APROBADA,A-52
2,8217992,2023-07-08 13:00:00,SOL,Cancha 1,APROBADA,G-12
1,8218607,2023-07-08 16:00:00,SOL,Cancha 1,APROBADA,F-04


# Adding users column

In [76]:
# Generate a random Series with 354 elements, where each element is either "Propietario" or "Invitado"
choices = ["Propietario", "Invitado"]
random_data = np.random.choice(choices, size=354, replace=True)

# Create a pandas Series from the random data
random_series = pd.Series(random_data)

# Create a column with de serie
df['usuario'] = random_series

In [77]:
df

Unnamed: 0,reserva,fechayhora,ilumina,servicio,mza-lote,casa,usuario
353,1652644,2020-11-26 16:00:00,SOL,Cancha 1,APROBADA,N-09,Invitado
352,1687243,2020-12-02 18:00:00,LUZ,Cancha 1,APROBADA,M-03,Propietario
351,1687608,2020-12-02 19:00:00,LUZ,Cancha 2,APROBADA,H-07,Invitado
350,1712222,2020-12-04 17:00:00,SOL,Cancha 1,APROBADA,E-08,Propietario
349,1712511,2020-12-04 18:00:00,LUZ,Cancha 1,APROBADA,PRUEBA,Propietario
...,...,...,...,...,...,...,...
4,8196830,2023-07-04 16:00:00,SOL,Cancha 1,APROBADA,K-04,Propietario
3,8212103,2023-07-07 11:00:00,SOL,Cancha 1,APROBADA,A-52,Invitado
2,8217992,2023-07-08 13:00:00,SOL,Cancha 1,APROBADA,G-12,Invitado
1,8218607,2023-07-08 16:00:00,SOL,Cancha 1,APROBADA,F-04,Invitado


In [78]:
df['usuario'] = pd.Categorical(df['usuario'], categories=["Propietario", "Invitado"], ordered=False)

# Append new column for cancha's princing: Create price billed

In [79]:
df['precio'] = 0.0

In [80]:
df.dtypes

reserva                int64
fechayhora    datetime64[ns]
ilumina             category
servicio            category
mza-lote            category
casa                  object
usuario             category
precio               float64
dtype: object

# Pricing Table

In [81]:
# Lee el archivo excel con pandas y guardar el resultado en un dataframe
dfp = pd.read_excel("PreciosCanchasTenis2.0.xlsx")

In [82]:
dfp

Unnamed: 0,Fecha,Usuario,Ilumina,Precio
0,2021-01-01,Propietario,SOL,100
1,2021-01-01,Propietario,LUZ,200
2,2021-01-01,Invitado,SOL,100
3,2021-01-01,Invitado,LUZ,200
4,2023-02-01,Propietario,SOL,150
5,2023-02-01,Propietario,LUZ,250
6,2023-02-01,Invitado,SOL,250
7,2023-02-01,Invitado,LUZ,400
8,2023-07-01,Propietario,SOL,200
9,2023-07-01,Propietario,LUZ,300


In [83]:
dfp.dtypes

Fecha      datetime64[ns]
Usuario            object
Ilumina            object
Precio              int64
dtype: object

In [84]:
dfp['Precio'] = dfp['Precio'].astype(float)
dfp['Usuario'] = pd.Categorical(dfp['Usuario'], categories=["Propietario", "Invitado"], ordered=False)
dfp['Ilumina'] = pd.Categorical(dfp['Ilumina'], categories=["SOL","LUZ"], ordered=False)

In [85]:
dfp.dtypes

Fecha      datetime64[ns]
Usuario          category
Ilumina          category
Precio            float64
dtype: object

In [86]:
dfp

Unnamed: 0,Fecha,Usuario,Ilumina,Precio
0,2021-01-01,Propietario,SOL,100.0
1,2021-01-01,Propietario,LUZ,200.0
2,2021-01-01,Invitado,SOL,100.0
3,2021-01-01,Invitado,LUZ,200.0
4,2023-02-01,Propietario,SOL,150.0
5,2023-02-01,Propietario,LUZ,250.0
6,2023-02-01,Invitado,SOL,250.0
7,2023-02-01,Invitado,LUZ,400.0
8,2023-07-01,Propietario,SOL,200.0
9,2023-07-01,Propietario,LUZ,300.0


### DataFrames Inconcistency - Dates Rows

The use of courtyards starts on year 2020 while pricing references information starts on year 2020.
El uso de las canchas comienza en el año 2020 mientras que la información de los precios inicia en el año 2021.

Insert a new rows to start pricing from 2020-01-01.
Insertar nuevas filas para iniciar con los precios 01-01-2020.

In [87]:
# Insert 4 rows - Prepare new 4 rows to add
data = {
    'Fecha': ['2020-01-01', '2020-01-01', '2020-01-01', '2020-01-01'],
    'Usuario': ['Propietario', 'Propietario', 'Invitado', 'Invitado'],
    'Ilumina': ['SOL', 'LUZ', 'SOL', 'LUZ'],
    'Precio': [90, 190, 90, 190]
}

dfp2add = pd.DataFrame(data)

In [88]:
# it's necesary cast the columns
dfp2add['Fecha']= pd.to_datetime(dfp2add['Fecha'])
dfp2add['Usuario']= dfp2add['Usuario'].astype('category')
dfp2add['Ilumina']= dfp2add['Ilumina'].astype('category')
dfp2add['Precio']=dfp2add['Precio'].astype(float)

In [89]:
dfp2add

Unnamed: 0,Fecha,Usuario,Ilumina,Precio
0,2020-01-01,Propietario,SOL,90.0
1,2020-01-01,Propietario,LUZ,190.0
2,2020-01-01,Invitado,SOL,90.0
3,2020-01-01,Invitado,LUZ,190.0


In [90]:
# Insert 4 rows - Add new rows at the begining
frames = [dfp2add, dfp]      # build a list of dataframes to concat in order
result = pd.concat(frames)

In [91]:
result.dtypes

Fecha      datetime64[ns]
Usuario          category
Ilumina          category
Precio            float64
dtype: object

In [92]:
dfp = result

### Canchas Inconcistency - User Category Not Fownd

#### El archivo de canchas no especifica si los usuarios son propietarios o invitados

In [93]:
# Generate a random Series with 354 elements, where each element is either "Propietario" or "Invitado"
choices = ["Propietario", "Invitado"]
random_data = np.random.choice(choices, size=354, replace=True)

# Create a pandas Series from the random data
random_series = pd.Series(random_data)

In [94]:
random_series

0         Invitado
1      Propietario
2         Invitado
3         Invitado
4         Invitado
          ...     
349    Propietario
350    Propietario
351       Invitado
352    Propietario
353    Propietario
Length: 354, dtype: object

In [95]:
df['usuario'] = random_series

In [96]:
df

Unnamed: 0,reserva,fechayhora,ilumina,servicio,mza-lote,casa,usuario,precio
353,1652644,2020-11-26 16:00:00,SOL,Cancha 1,APROBADA,N-09,Propietario,0.0
352,1687243,2020-12-02 18:00:00,LUZ,Cancha 1,APROBADA,M-03,Propietario,0.0
351,1687608,2020-12-02 19:00:00,LUZ,Cancha 2,APROBADA,H-07,Invitado,0.0
350,1712222,2020-12-04 17:00:00,SOL,Cancha 1,APROBADA,E-08,Propietario,0.0
349,1712511,2020-12-04 18:00:00,LUZ,Cancha 1,APROBADA,PRUEBA,Propietario,0.0
...,...,...,...,...,...,...,...,...
4,8196830,2023-07-04 16:00:00,SOL,Cancha 1,APROBADA,K-04,Invitado,0.0
3,8212103,2023-07-07 11:00:00,SOL,Cancha 1,APROBADA,A-52,Invitado,0.0
2,8217992,2023-07-08 13:00:00,SOL,Cancha 1,APROBADA,G-12,Invitado,0.0
1,8218607,2023-07-08 16:00:00,SOL,Cancha 1,APROBADA,F-04,Propietario,0.0


In [97]:
df.columns

Index(['reserva ', 'fechayhora', 'ilumina', 'servicio', 'mza-lote', 'casa',
       'usuario', 'precio'],
      dtype='object')

In [121]:
import pandas as pd

# Convert dates column in df and dfp to datetime if it's not already in datetime format
df['fechayhora'] = pd.to_datetime(df['fechayhora'])
df['usuario'] = pd.Categorical(df['usuario'], categories=["Propietario", "Invitado"], ordered=False)
df['ilumina'] = pd.Categorical(df['ilumina'], categories=["SOL","LUZ"], ordered=False)
dfp['Fecha'] = pd.to_datetime(dfp['Fecha'])
dfp['Usuario'] = pd.Categorical(dfp['Usuario'], categories=["Propietario", "Invitado"], ordered=False)
dfp['Ilumina'] = pd.Categorical(dfp['Ilumina'], categories=["SOL","LUZ"], ordered=False)

# Convert categorical columns to integers using cat.codes
df['usuario'] = df['usuario'].cat.codes
df['ilumina'] = df['ilumina'].cat.codes
dfp['Usuario'] = dfp['Usuario'].cat.codes
dfp['Ilumina'] = dfp['Ilumina'].cat.codes

# Merge the two dataframes based on the nearest date, user, and illuminate
merged_df = pd.merge_asof(df, dfp, left_on='fechayhora', right_on='Fecha', left_by=['usuario', 'ilumina'], right_by=['Usuario', 'Ilumina'])

# Set the 'precio' value in df based on the merged_df
df['precio'] = merged_df['Precio']

# Fill any missing values in 'precio' column with 0.0
df['precio'].fillna(0.0, inplace=True)

# this has one bug. the row 353 is not procesed and remains in 0

In [114]:
df.head(100)



# pd.merge(df, dfp, left_on = ['fechayhora','usuario', 'ilumina'], right_on = ['Fecha', 'Usuario', 'Ilumina'], how = 'left')

Unnamed: 0,reserva,fechayhora,ilumina,servicio,mza-lote,casa,usuario,precio
353,1652644,2020-11-26 16:00:00,-1,Cancha 1,APROBADA,N-09,-1,0.0
352,1687243,2020-12-02 18:00:00,-1,Cancha 1,APROBADA,M-03,-1,500.0
351,1687608,2020-12-02 19:00:00,-1,Cancha 2,APROBADA,H-07,-1,500.0
350,1712222,2020-12-04 17:00:00,-1,Cancha 1,APROBADA,E-08,-1,500.0
349,1712511,2020-12-04 18:00:00,-1,Cancha 1,APROBADA,PRUEBA,-1,500.0
...,...,...,...,...,...,...,...,...
257,3801884,2021-07-17 16:00:00,-1,Cancha 1,APROBADA,H-07,-1,400.0
256,3805788,2021-07-18 17:00:00,-1,Cancha 1,APROBADA,K-04,-1,400.0
255,3835404,2021-07-23 17:00:00,-1,Cancha 1,APROBADA,L-11,-1,400.0
254,3841056,2021-07-23 18:00:00,-1,Cancha 2,APROBADA,G-07,-1,400.0


In [117]:
df_filtered = df[df['precio']==0]

In [118]:
df_filtered

Unnamed: 0,reserva,fechayhora,ilumina,servicio,mza-lote,casa,usuario,precio
353,1652644,2020-11-26 16:00:00,-1,Cancha 1,APROBADA,N-09,-1,0.0


In [120]:
matching_rows_dfp = dfp[
    (dfp['Fecha'] == '2020-11-26 16:00:00') & (dfp['Usuario'] == -1) & (dfp['Ilumina'] == 'Cancha 1')
]
print(matching_rows_dfp)


Empty DataFrame
Columns: [Fecha, Usuario, Ilumina, Precio]
Index: []


In [119]:
dfp

Unnamed: 0,Fecha,Usuario,Ilumina,Precio
0,2020-01-01,-1,-1,90.0
1,2020-01-01,-1,-1,190.0
2,2020-01-01,-1,-1,90.0
3,2020-01-01,-1,-1,190.0
0,2021-01-01,-1,-1,100.0
1,2021-01-01,-1,-1,200.0
2,2021-01-01,-1,-1,100.0
3,2021-01-01,-1,-1,200.0
4,2023-02-01,-1,-1,150.0
5,2023-02-01,-1,-1,250.0


In [None]:
# Convert dates column in df and dfp to datetime if it's not already in datetime format
df['fechayhora'] = pd.to_datetime(df['fechayhora'])
dfp['Fecha'] = pd.to_datetime(dfp['Fecha'])

# Sort dfp by date to ensure consecutive dates are next to each other
dfp = dfp.sort_values(by='Fecha')

# Create a Dataframe with uniques start and end datas of dfp
serie_unique = dfp['Fecha'].unique()                       # gest the array of unique dates
dfp_u = pd.DataFrame({
        'FecIni' : serie_unique, 'FecFin': serie_unique    # initially both dfp_u columns has the samedates
})

# Configure end dates with shift and append on the las row with last rown from the unique date.
dfp_u['FecFin'] = dfp_u['FecFin'].shift(-1)   # shit values up             
n = len(dfp_u)-1
dfp_u['FecFin'][n] = pd.Timestamp.today()  # complete the last value


# Iterate over rows in df DataFrame
i=0
for df_row in df.itertuples():
    
    # Get the current date, user, and illuminate from the row
    date = getattr(df_row,'fechayhora')
    user = getattr(df_row, 'usuario')
    illuminate = getattr(df_row,'ilumina')
    i=i+1
    # debug: print("df fila:", getattr(row, 'Index'))
    
    # Initialize the precio to 0.0 for the current row
    precio = 0.0
    
    # Iterate over consecutive pairs of dates in dfp
    for dfp_row in dfp_u.itertuples():
        start_date = getattr(dfp_row, 'FecIni')
        end_date = getattr(dfp_row, 'FecFin')
        
        # Check if the current date falls between the start and end dates
        if start_date <= date <= end_date :
            # debug: print("dfp index =", getattr(row, 'Index'), "Si! ", start_date, " <= ", date, " < ",  end_date)
            
            # Filter dfp based on the conditions for merging
            dfp_filtered = dfp[
                (dfp['Fecha'] == start_date) &
                (dfp['Usuario'] == user) &
                (dfp['Ilumina'] == illuminate)
            ]

            # Set the 'precio' value in df based on the filtered dfp value
            if not dfp_filtered.empty:
                df.at[getattr(df_row, 'Index'), 'precio'] = dfp_filtered.iloc[0]['Precio']
                print(i, df.at[getattr(df_row, 'Index'), 'precio'])
            else:
                print("no hay precio", df_row, Index)

In [None]:
df

In [None]:
df.tail()