## Licitación
### 1. Duplicados: Expedientes con más de un status

In [6]:
import time
from pathlib import Path

import pandas as pd

#### Leyendo el CSV de licitación y convirtiéndolo a parquet para manejar un formato más eficiente

In [12]:
start_time = time.time()

# Source CSV and the Parquet copy that the following cells read from.
csv_path = Path('../data/Processed/licitacion_sesna_data.csv')
parquet_path = Path('../data/Processed/parquet_files/licitacion_data.parquet')

# Read CSV and save to Parquet
licitacion_df = pd.read_csv(csv_path)

# Make sure the output folder exists before writing: on a fresh checkout the
# 'parquet_files' directory may be missing and the write would fail.
parquet_path.parent.mkdir(parents=True, exist_ok=True)
licitacion_df.to_parquet(parquet_path)

elapsed_time = time.time() - start_time
print(f"Tiempo de ejecución: {elapsed_time} segundos")

Tiempo de ejecución: 35.86907386779785 segundos


#### Leyendo el archivo parquet de licitación y mostrando las primeras filas

In [13]:
# Time how long it takes to load the Parquet copy of the licitacion data.
start_time = time.time()
licitacion_df = pd.read_parquet(
    '../data/Processed/parquet_files/licitacion_data.parquet'
)
elapsed_time = time.time() - start_time

# Report the load time, then preview the first rows as the cell output.
print(f"Tiempo de ejecución: {elapsed_time} segundos")
licitacion_df.head()

Tiempo de ejecución: 7.7934160232543945 segundos


Unnamed: 0,cve_expediente,procurementMethod,procurementMethod_rationale,status,title,description,has_enquiries,number_tenderers,tender_start_date,tender_end_date,...,enquiry_end_date,procuring_entity_id,procuring_entity_name,value_currency_tender,value_amount_tender,award_criteria,framework_agreement,framework_agreement_platform,framework_agreement_title,submission_method
0,1892834,direct,Art. 41 fr. V,complete,SERVICIOS MEDICOS SUBROGADOS DE ESPECIALIZACIO...,SERVICIOS MEDICOS SUBROGADOS DE ESPECIALIZACIO...,False,0,2021-09-28T16:39:00Z,,...,,ISS6001015A3-051GYN085,ISSSTE-Hospital Regional Gral. Ignacio Zaragoz...,,,,,,,electronicSubmission
1,2010079,direct,Art. 41 fr. XIV,complete,TALLER CULTURAL DE YOGA PARA JUBILADOS Y PENSI...,TALLER CULTURAL DE YOGA PARA JUBILADOS Y PENSI...,False,0,2021-09-09T12:08:00Z,,...,,ISS6001015A3-051GYN035,"ISSSTE-Delegación Tabasco, Subdelegación de Ad...",,,,,,,inPerson
2,2039835,direct,Art. 41 fr. III,complete,SD_TGM_12122019 SUMINISTRO DE 301 MONEDEROS EL...,SD_TGM_12122019 SUMINISTRO DE 301 MONEDEROS EL...,False,0,2021-09-24T11:17:00Z,,...,,TGM990109718-004E2D001,TGM-Gerencia de Abastecimientos #004E2D001,,,,,,,electronicSubmission
3,2058673,direct,Art. 41 fr. II,complete,SERVICIO DE LIMPIEZA,Contrato celebrado para brindar el servicio de...,False,0,2021-09-03T10:46:00Z,,...,,RAN920810MU6-015B00973,RAN-Delegación Chihuahua #015B00973,,,,,,,inPerson
4,2071481,direct,Art. 41 fr. XIV,complete,SERVICIOS PROFESIONALES,SERVICIOS PROFESIONALES,False,0,2021-09-02T13:49:00Z,,...,,INI080202BQ8-011MDE001,INIFED-Gerencia de Recuros Materiales y Servic...,,,,,,,inPerson


In [14]:
# Size of the loaded frame as a (rows, columns) tuple.
licitacion_df.shape

(2742267, 23)

In [15]:
# Print a structural summary of the frame: index range, column names and dtypes.
licitacion_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2742267 entries, 0 to 2742266
Data columns (total 23 columns):
 #   Column                        Dtype  
---  ------                        -----  
 0   cve_expediente                int64  
 1   procurementMethod             object 
 2   procurementMethod_rationale   object 
 3   status                        object 
 4   title                         object 
 5   description                   object 
 6   has_enquiries                 bool   
 7   number_tenderers              int64  
 8   tender_start_date             object 
 9   tender_end_date               object 
 10  award_start_date              object 
 11  award_end_date                object 
 12  enquiry_start_date            object 
 13  enquiry_end_date              object 
 14  procuring_entity_id           object 
 15  procuring_entity_name         object 
 16  value_currency_tender         object 
 17  value_amount_tender           float64
 18  award_criteria        

#### Identificando expedientes con más de un status: 15,692 

In [21]:
# Display all distinct status values from the original DataFrame
distinct_statuses = licitacion_df['status'].unique()
print("Distinct Statuses Across DataFrame:")
print(distinct_statuses)

Distinct Statuses Across DataFrame:
['complete' 'unsuccessful' 'active']


In [19]:
# Aggregate the unique statuses for each cve_expediente
agg_df = licitacion_df.groupby('cve_expediente')['status'].agg(lambda x: list(set(x))).reset_index()

# Filter for cve_expediente with more than one distinct status
multiple_statuses_df = agg_df[agg_df['status'].apply(len) > 1]

# Display the cve_expediente with multiple statuses
print("CVE Expediente with Multiple Statuses:")
multiple_statuses_df

CVE Expediente with Multiple Statuses:


Unnamed: 0,cve_expediente,status
381494,2241434,"[complete, active]"
391256,2256332,"[complete, active]"
400330,2271214,"[complete, active]"
403818,2277768,"[unsuccessful, active]"
403887,2277905,"[complete, active]"
...,...,...
565740,2560063,"[complete, active]"
566174,2560969,"[complete, active]"
566205,2561016,"[unsuccessful, active]"
566311,2561237,"[unsuccessful, active]"


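#### Limpiando duplicados por status

Se conserva un solo registro por expediente, eligiendo el status según esta prioridad: `complete`, luego `unsuccessful` y por último `active`.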

In [22]:
start_time = time.time()

# Rank used to pick one row when an expediente appears with several statuses:
# the lowest number wins, so 'complete' is preferred over 'unsuccessful',
# which in turn is preferred over 'active'.
status_priority = {'complete': 1, 'unsuccessful': 2, 'active': 3}
priority_rank = licitacion_df['status'].map(status_priority)

# Attach the rank to a copy (assign) rather than writing a helper column into
# licitacion_df, so the loaded frame keeps its original columns and earlier
# cells that displayed it do not go stale. Then sort by expediente and rank.
sorted_df = (
    licitacion_df
    .assign(status_priority=priority_rank)
    .sort_values(by=['cve_expediente', 'status_priority'])
)

# Keep the first entry (highest priority status) for each 'cve_expediente'
# and drop the helper column, which is no longer needed.
cleaned_df = (
    sorted_df
    .drop_duplicates(subset='cve_expediente', keep='first')
    .drop(columns=['status_priority'])
)

# Invariant of the clean-up: exactly one row per expediente.
assert cleaned_df['cve_expediente'].is_unique, "cve_expediente is not unique after de-duplication"

# Display the shape of the cleaned DataFrame
print(cleaned_df.shape)

elapsed_time = time.time() - start_time
print(f"Tiempo de ejecución: {elapsed_time} segundos")

(567053, 23)
Tiempo de ejecución: 1.9247839450836182 segundos


#### Confirmando 0 expedientes con más de un status

In [24]:
# Aggregate the unique statuses for each cve_expediente in the cleaned frame.
# Sorting makes the lists deterministic (the iteration order of a set of
# strings can change between kernel sessions); key=str keeps the sort safe if
# a non-string value ever appears.
agg_df = (
    cleaned_df
    .groupby('cve_expediente')['status']
    .agg(lambda statuses: sorted(set(statuses), key=str))
    .reset_index()
)

# Filter for cve_expediente with more than one distinct status
multiple_statuses_df = agg_df[agg_df['status'].apply(len) > 1]

# Enforce the claim of this section instead of relying on a visual check of
# the (expected to be empty) table below.
assert multiple_statuses_df.empty, (
    f"{len(multiple_statuses_df)} cve_expediente values still have more than one status"
)

# Display the cve_expediente with multiple statuses
print("CVE Expediente with Multiple Statuses:")
multiple_statuses_df

CVE Expediente with Multiple Statuses:


Unnamed: 0,cve_expediente,status
