## Licitación
### 1. Duplicados

In [1]:
import pandas as pd
import time

pd.set_option('display.max_columns', None)

#### Leyendo el CSV de licitación y convirtiéndolo a parquet para manejar un formato más eficiente

In [None]:
start_time = time.time()

# Read CSV and save to Parquet
licitacion_df = pd.read_csv('../../data/Processed/csv_files/licitacion_data.csv')
licitacion_df.to_parquet('../../data/Processed/parquet_files/licitacion_data.parquet')

elapsed_time = time.time() - start_time
print(f"Tiempo de ejecución: {elapsed_time} segundos")

#### Leyendo el archivo parquet de licitación y mostrando los primeros rows

In [None]:
start_time = time.time()

# Read from Parquet (faster and uses less memory)
licitacion_df = pd.read_parquet('../../data/Processed/parquet_files/licitacion_data.parquet')

elapsed_time = time.time() - start_time
print(f"Tiempo de ejecución: {elapsed_time} segundos")
licitacion_df.head()

In [None]:
licitacion_df.shape

In [None]:
licitacion_df.info()

#### Estadísticas descriptivas para todas las variables

In [None]:
licitacion_df = licitacion_df.astype({
    'cve_expediente': 'string'})

# Obtener estadísticas descriptivas para todas las variables
descriptive_stats = licitacion_df.describe(include='all').transpose()

# Mostrar las estadísticas descriptivas
descriptive_stats

#### ¿Cuántos expedientes se repiten "x" número de veces"?

In [None]:
# Step 1: Count the repetitions of each `cve_expediente`
repetition_counts = licitacion_df.groupby('cve_expediente').size().reset_index(name='repetition_count')
repetition_counts.sort_values(by='repetition_count', ascending=False)

In [None]:
# Step 2: Count the occurrences of these repetition counts
repetition_summary = repetition_counts['repetition_count'].value_counts().reset_index()
repetition_summary.columns = ['repetition_count', 'file_count']
repetition_summary.head()

In [None]:
# Step 3: Sort the summary DataFrame by `repetition_count`
repetition_summary = repetition_summary.sort_values(by='repetition_count').reset_index(drop=True)
repetition_summary.head(10)

# Ejemplo de lectura del dataframe:
# * 7776 expedientes se repiten 2 veces
# * 74 expedientes se repiten 3 veces... así sucesivamente

Mostrando los resultados en una gráfica (A partir de 2 repeticiones)

In [None]:
import matplotlib.pyplot as plt


# Exclude the first row of the DataFrame
repetition_summary_filtered = repetition_summary.iloc[1:]

# Plotting the horizontal bar graph
plt.figure(figsize=(10, 6))  # Adjust the figure size as needed
plt.barh(repetition_summary_filtered['repetition_count'], repetition_summary_filtered['file_count'], color='skyblue')
plt.xlabel('Numero de Expedientes')
plt.ylabel('Numero de Repeticiones')
plt.title('Numero de Expedientes por Conteo de Repeticiones')
plt.grid(axis='x', linestyle='--', alpha=0.7)

# Annotating the bars with the count values
for index, value in enumerate(repetition_summary_filtered['file_count']):
    plt.text(value, repetition_summary_filtered['repetition_count'].iloc[index], str(value))

plt.show()

### 1.1 Expedientes con más de un award_end_date

#### Convirtiendo award_end_date a tipo datetime

In [None]:
# Assuming licitacion_df is your DataFrame and 'award_end_date' is the column with the ISO 8601 dates
licitacion_df['award_end_date'] = pd.to_datetime(licitacion_df['award_end_date'], errors='coerce')

# Check for any entries where dates could not be parsed
invalid_dates = licitacion_df[licitacion_df['award_end_date'].isna()]
if not invalid_dates.empty:
    print("Entries with invalid dates:")
    print(invalid_dates.shape)
else:
    print("All dates converted successfully.")

In [None]:
licitacion_df.info()

#### Priorizando la fecha más reciente para cada expediente
Si hay más de una fila con la fecha más reciente conservamos todas las filas con esa fecha para después priorizar por status.

In [None]:
# Sort the DataFrame by 'cve_expediente' and 'award_end_date' descending
sorted_df = licitacion_df.sort_values(by=['cve_expediente', 'award_end_date'], ascending=[True, False])

# Get the most recent date for each `cve_expediente`
most_recent_dates = sorted_df.groupby('cve_expediente')['award_end_date'].max().reset_index()

# Merge to get only the rows with the most recent `award_end_date`
merged_df = pd.merge(sorted_df, most_recent_dates, on=['cve_expediente', 'award_end_date'], how='inner')

# Optionally, check the shape of the DataFrame
print("DataFrame shape after keeping the most recent dates:", merged_df.shape)

# Display the first few rows of the cleaned DataFrame to verify
merged_df.head()

### 1.2 Expedientes con más de un status

#### Mostrando valores únicos de status

In [None]:
# Display all distinct status values from the original DataFrame
distinct_statuses = merged_df['status'].unique()
print("Distinct Statuses Across DataFrame:")
print(distinct_statuses)

#### Identificando expedientes con más de un status: 12,962 

In [None]:
# Aggregate the unique statuses for each cve_expediente
agg_df = merged_df.groupby('cve_expediente')['status'].agg(lambda x: list(set(x))).reset_index()

# Filter for cve_expediente with more than one distinct status
multiple_statuses_df = agg_df[agg_df['status'].apply(len) > 1]

# Display the cve_expediente with multiple statuses
print("CVE Expediente with Multiple Statuses:")
multiple_statuses_df.head(10)

#### Limpiando duplicados por status

In [None]:
start_time = time.time()

# Define a custom sort key based on status priority
status_priority = {'complete': 1, 'unsuccessful': 2, 'active': 3}
merged_df['status_priority'] = merged_df['status'].map(status_priority)

# Sort the DataFrame by 'cve_expediente' and 'status_priority'
sorted_df = merged_df.sort_values(by=['cve_expediente', 'status_priority'])

# Drop duplicates, keeping the first entry (highest priority status) for each 'cve_expediente'
cleaned_df = sorted_df.drop_duplicates(subset='cve_expediente', keep='first')

# drop the 'status_priority' column as it's no longer needed
cleaned_df = cleaned_df.drop(columns=['status_priority'])

# Display the cleaned DataFrame
print(cleaned_df.shape)

elapsed_time = time.time() - start_time
print(f"Tiempo de ejecución: {elapsed_time} segundos")

#### Confirmando 0 expedientes con más de un status

In [None]:
# Aggregate the unique statuses for each cve_expediente
agg_df = cleaned_df.groupby('cve_expediente')['status'].agg(lambda x: list(set(x))).reset_index()

# Filter for cve_expediente with more than one distinct status
multiple_statuses_df = agg_df[agg_df['status'].apply(len) > 1]

# Display the cve_expediente with multiple statuses
print("CVE Expediente with Multiple Statuses:")
multiple_statuses_df

In [None]:
cleaned_df.to_csv('../../data/Processed/csv_files/licitacion_data.csv', index=False, encoding='utf-8')

print("DataFrame has been saved to 'licitacion_data.csv'")

## 2. Asignación

#### Leyendo el CSV de Asignación y convirtiéndolo a parquet para manejar un formato más eficiente

In [2]:
start_time = time.time()

# Read CSV and save to Parquet
asignacion_df = pd.read_csv('../../data/Processed/csv_files/asignacion_data.csv')
asignacion_df.to_parquet('../../data/Processed/parquet_files/asignacion_data.parquet')

elapsed_time = time.time() - start_time
print(f"Tiempo de ejecución: {elapsed_time} segundos")

Tiempo de ejecución: 45.31891489028931 segundos


#### Leyendo el archivo parquet de Asignación y mostrando los primeros rows

In [3]:
start_time = time.time()

# Read from Parquet (faster and uses less memory)
asignacion_df = pd.read_parquet('../../data/Processed/parquet_files/asignacion_data.parquet')

elapsed_time = time.time() - start_time
print(f"Tiempo de ejecución: {elapsed_time} segundos")
asignacion_df.head()

Tiempo de ejecución: 9.036490440368652 segundos


Unnamed: 0,cve_expediente,cve_contrato,status,description_award,title_award,contract_start_date,contract_end_date,value_amount,value_currency,suppliers_id,suppliers_name,docs_url_awards,docs_title_awards,docs_language_awards,docs_id_awards,docs_format_awards,docs_type_awards,docs_descr_awards,docs_date_published_awards
0,1892834,2038777,active,SERVICIOS MEDICOS SUBROGADOS DE ESPECIALIZACIO...,SERVICIOS MEDICOS SUBROGADOS DE ESPECIALIZACIO...,2019-03-19T00:00:00Z,2019-06-30T23:59:00Z,150000.0,MXN,04B3DC027ADD775B746959A20A80292A,"APLICACIONES INDUSTRIALES DE CALIDAD, S.A. DE ...",,,,,,,,
1,2010079,2186605,active,TALLER CULTURAL DE YOGA PARA JUBILADOS Y PENSI...,TALLER CULTURAL DE YOGA PARA JUBILADOS Y PENSI...,2019-09-01T00:00:00Z,2019-12-31T23:59:00Z,12068.8,MXN,5BECD127FEA6806B2A404D2B14A9CEEE,LEONOR HILDA BARRIOS SANCHEZ,,,,,,,,
2,2039835,2591875,active,SUMINISTRO DE 301 MONEDEROS ELECTRÓNICOS,SUMINISTRO DE 301 MONEDEROS ELECTRÓNICOS,2019-12-18T09:00:00Z,2019-12-20T12:59:00Z,3625000.0,MXN,TIN090211JC9,TOKA INTERNACIONAL S A P I DE CV,,,,,,,,
3,2058673,2248316,active,CONTRATO CELEBRADO PARA EL SERVICIO DE LIMPIEZ...,Servicio de Limpieza,2020-02-02T00:00:00Z,2020-12-31T23:59:00Z,251900.0,MXN,3CEDE48206129D952B3EA0B6EA3E093D,PATRICIA SELENE SOLIS GALLEGOS,,,,,,,,
4,2071481,2264071,active,SERVICIOS PROFESIONALES,SERVICIOS PROFESIONALES,2020-01-16T09:00:00Z,2020-04-30T18:00:00Z,64870.68,MXN,AAB6B3F29F29293964209B17E7D72DAC,LUIS ALBERTO ESPINOZA VAZQUEZ,,,,,,,,


In [None]:
asignacion_df.shape

In [None]:
asignacion_df.info()

### 2.1 Removiendo duplicados al priorizar la fecha más reciente de contract_start_date         

In [4]:
# Step 1: Convert 'contract_start_date' to datetime
asignacion_df['contract_start_date'] = pd.to_datetime(asignacion_df['contract_start_date'], errors='coerce')

# Step 2: Sort the DataFrame by 'cve_contrato' and 'contract_start_date' in descending order
asignacion_df = asignacion_df.sort_values(by=['cve_contrato', 'contract_start_date'], ascending=[True, False])

# Step 3: Drop duplicates based on 'cve_contrato', keeping the first occurrence
asignacion_df = asignacion_df.drop_duplicates(subset='cve_contrato', keep='first')

# Optionally, check the shape of the DataFrame
print("DataFrame shape after removing duplicates:", asignacion_df.shape)

# Display the first few rows of the cleaned DataFrame to verify
asignacion_df.head()

DataFrame shape after removing duplicates: (828542, 19)


Unnamed: 0,cve_expediente,cve_contrato,status,description_award,title_award,contract_start_date,contract_end_date,value_amount,value_currency,suppliers_id,suppliers_name,docs_url_awards,docs_title_awards,docs_language_awards,docs_id_awards,docs_format_awards,docs_type_awards,docs_descr_awards,docs_date_published_awards
591239,975480,964520,active,"CONTRATO PLURIANUAL 2015 ¿ 2018, BAJO LA MODAL...",SERVICIO INTEGRAL DE ARRENDAMIENTO DE EQUIPO D...,2015-06-01 00:00:00+00:00,2018-05-31T23:59:00Z,8474040.0,MXN,TPL030219RE3,TEC PLUSS SA DE CV,,,,,,,,
10457,1016499,1019413,active,"SERVICIO DE ATENCIÓN DE ASUNTOS LABORALES, REV...",SERVICIOS DE ASESORÍA JURÍDICA,2016-01-01 00:00:00+00:00,2016-12-31T23:59:00Z,2844000.0,MXN,VAB140128J79,VERNIS ABOGADOS SC,,,,,,,,
10455,1004872,1025677,active,MANTENIMIENTO PREVENTIVO Y CORRECTIVO DEL PARQ...,MANTENIMIENTO PREVENTIVO Y CORRECTIVO AL PARQU...,2016-02-29 17:01:00+00:00,2016-12-31T12:00:00Z,650000.02,MXN,ASI941004139,AUTOMOTRIZ Y SERVICIO INTEGRAL SA DE CV,,,,,,,,
10459,1036167,1043633,active,,"AGROASEMEX, S.A.",2016-03-01 00:00:00+00:00,2016-12-31T23:59:00Z,3159900.0,MXN,AGR900605VC6,"AGROASEMEX, S.A.",,,,,,,,
10464,1057507,1070559,active,"Servicios informáticos de adecuación, mantenim...","Servicios informáticos de adecuación, mantenim...",2017-01-01 09:00:00+00:00,2017-02-28T23:59:00Z,201116.6,MXN,CSB080408DB8,CONSULTORES Y SOLUCIONES BAJAWARE S DE RL DE CV,,,,,,,,


#### Save the cleaned DataFrame to a CSV file

In [5]:
asignacion_df.to_csv('../../data/Processed/csv_files/asignacion_data.csv', index=False, encoding='utf-8')

print("DataFrame has been saved to 'asignacion_data.csv'")

DataFrame has been saved to 'asignacion_data.csv'


### 3. Documentos Tender
### Duplicados por docs_date_published

#### Leyendo el CSV de licitación y convirtiéndolo a parquet para manejar un formato más eficiente

In [6]:
# Read CSV and save to Parquet
documentos_df = pd.read_csv('../../data/Processed/csv_files/documentos_tender_sesna_data.csv')
documentos_df.to_parquet('../../data/Processed/parquet_files/documentos_tender_sesna_data.parquet')

elapsed_time = time.time() - start_time
print(f"Tiempo de ejecución: {elapsed_time} segundos")

Tiempo de ejecución: 76.13503742218018 segundos


In [7]:
documentos_df = pd.read_parquet('../../data/Processed/parquet_files/documentos_tender_sesna_data.parquet')

In [None]:
documentos_df.shape

In [None]:
documentos_df.info()

#### Limpiando duplicados al priorizar la primera fecha (fecha más antigua) de docs_date_published_tender

In [8]:
# Asegurarte de que la columna de fecha esté en el formato datetime
documentos_df['docs_date_published_tender'] = pd.to_datetime(documentos_df['docs_date_published_tender'])

# Ordenar el DataFrame por la fecha de publicación en orden ascendente
documentos_df = documentos_df.sort_values(by='docs_date_published_tender', ascending=True)

# Eliminar duplicados y mantener el primer registro de cada grupo basado en 'cve_expediente'
documentos_df = documentos_df.drop_duplicates(subset='cve_expediente', keep='first')

# Verificar el resultado
print(documentos_df.shape)
documentos_df.head()

(91899, 9)


Unnamed: 0,cve_expediente,docs_title_tender,docs_type_tender,docs_language_tender,docs_date_published_tender,docs_id_tender,docs_format_tender,docs_description_tender,docs_url_tender
3198,1189249,Conv-IO-009000960-E43-2016.doc,tenderNotice,es,2016-10-25 19:52:00+00:00,29197598,text/html,Convocatoria / Invitación,https://compranet.hacienda.gob.mx/esop/guest/g...
3200,1189263,Conv-IO-009000960-E44-2016.doc,tenderNotice,es,2016-10-25 20:35:00+00:00,29198348,text/html,Convocatoria / Invitación,https://compranet.hacienda.gob.mx/esop/guest/g...
3266,1246041,2017.docx,tenderNotice,es,2017-01-02 15:35:00+00:00,30046494,text/html,Convocatoria / Invitación,https://compranet.hacienda.gob.mx/esop/guest/g...
3268,1246227,2017.docx,tenderNotice,es,2017-01-02 18:24:00+00:00,30047451,text/html,Convocatoria / Invitación,https://compranet.hacienda.gob.mx/esop/guest/g...
3263,1242820,Convocatoria LPN GASES MEDICINALES.docx,tenderNotice,es,2017-01-03 09:07:00+00:00,30049859,text/html,Convocatoria / Invitación,https://compranet.hacienda.gob.mx/esop/guest/g...


#### Limpiando duplicados al priorizar la primera fecha (fecha más antigua) de docs_date_published_tender

In [9]:
documentos_df.to_csv('../../data/Processed/csv_files/documentos_tender_sesna_data.csv', index=False, encoding='utf-8')

print("DataFrame has been saved to 'documentos_tender_sesna_data.csv'")

DataFrame has been saved to 'documentos_tender_sesna_data.csv'


In [10]:
import os
def convert_csv_to_parquet(input_folder, output_folder):
    # Ensure the output folder exists
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    # List all files in the input folder
    for filename in os.listdir(input_folder):
        if filename.endswith('.csv'):
            # Construct full file path
            csv_file = os.path.join(input_folder, filename)
            
            # Read the CSV file
            df = pd.read_csv(csv_file)
            
            # Construct the output file path
            parquet_file = os.path.join(output_folder, filename.replace('.csv', '.parquet'))
            
            # Save the dataframe as a parquet file
            df.to_parquet(parquet_file)
            
            print(f"Converted {csv_file} to {parquet_file}")

# Specify the input and output folders
input_folder = '../../data/Processed/csv_files/'
output_folder = '../../data/Processed/parquet_files/'

# Convert all CSV files in the input folder to Parquet format
convert_csv_to_parquet(input_folder, output_folder)


Converted ../../data/Processed/csv_files/asignacion_data.csv to ../../data/Processed/parquet_files/asignacion_data.parquet
Converted ../../data/Processed/csv_files/comprador_sesna_data.csv to ../../data/Processed/parquet_files/comprador_sesna_data.parquet
Converted ../../data/Processed/csv_files/documentos_tender_sesna_data.csv to ../../data/Processed/parquet_files/documentos_tender_sesna_data.parquet
Converted ../../data/Processed/csv_files/items_adq_sesna_data.csv to ../../data/Processed/parquet_files/items_adq_sesna_data.parquet
Converted ../../data/Processed/csv_files/licitacion_data.csv to ../../data/Processed/parquet_files/licitacion_data.parquet
Converted ../../data/Processed/csv_files/participantes_proveedores.csv to ../../data/Processed/parquet_files/participantes_proveedores.parquet
Converted ../../data/Processed/csv_files/tender_items_sesna_data.csv to ../../data/Processed/parquet_files/tender_items_sesna_data.parquet
