In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from openpyxl import load_workbook
from datetime import datetime

In [None]:
# Load the main survey export into `df`.
# NOTE(review): parse_dates selects columns by position (1 and 85); presumably
# these are "Fecha de la entrevista" and "_submission_time", the two columns
# parsed as dates for the corrected files further down — TODO confirm against
# the export layout.
df = pd.read_excel(
    "inputs/Encuesta_para_inmigrantes_venezolanos_en_Ica_-_all_versions_-_labels_-_2023-02-18-20-40-30.xlsx",
    parse_dates=[1, 85],
)

In [None]:
# Show the (rows, columns) size of the main survey table
df.shape

In [None]:
def _dedupe_column_names(names):
    """Return ``names`` with repeated labels suffixed ``.1``, ``.2``, ...

    The first occurrence of a label is kept as is; its i-th repeat becomes
    ``f"{label}.{i}"``. ``None`` labels (empty header cells) are never renamed.
    """
    occurrences = {}
    unique_names = []
    for name in names:
        if name is None:
            unique_names.append(name)
            continue
        repeats = occurrences.get(name, 0)
        unique_names.append(name if repeats == 0 else f"{name}.{repeats}")
        occurrences[name] = repeats + 1
    return unique_names


def read_filtered_excel(file, sheet_number=0):
    """
    Import filtered excel table into Python

    Only rows that are not hidden in the worksheet (i.e. rows that were not
    filtered out in Excel) are read. The first visible row is used as the
    header; every following visible row becomes a record.

    Source: https://stackoverflow.com/questions/46002159/how-to-import-filtered-excel-table-into-python

    Parameters
    ----------
    file : path-like or file-like
        Workbook passed straight to ``openpyxl.load_workbook``.
    sheet_number : int, default 0
        Position of the worksheet within ``wb.sheetnames``.

    Returns
    -------
    pandas.DataFrame
        Visible rows of the sheet. Repeated header labels are renamed
        ``label.1``, ``label.2``, ... An empty ``DataFrame`` is returned when
        the sheet has no visible rows at all.

    Raises
    ------
    IndexError
        If ``sheet_number`` is outside the range of available sheets.
    """
    wb = load_workbook(file)
    ws = wb[wb.sheetnames[sheet_number]]

    # keep a row only if it has not been filtered out (i.e., it's not hidden)
    visible_rows = [
        [cell.value for cell in row]
        for row in ws
        if not ws.row_dimensions[row[0].row].hidden
    ]

    # An empty (or fully hidden) sheet has no header row to read
    if not visible_rows:
        return pd.DataFrame()

    header, *records = visible_rows

    # Handle duplicated column names
    return pd.DataFrame(records, columns=_dedupe_column_names(header))

In [None]:
# get all xlsx files (searched recursively)
# - sorted, so the concatenation order (and therefore which duplicate counts as
#   "first" later on) is the same on every run and every machine
# - "~$" files are the lock files Excel leaves next to an open workbook; they
#   match *.xlsx but are not readable workbooks
files = sorted(
    path
    for path in Path("inputs/EXCEL CORREGIDOS").rglob("*.xlsx")
    if not path.name.startswith("~$")
)

In [None]:
# Load each corrected workbook, keeping just the rows its filter left visible
fixed_rows = list(map(read_filtered_excel, files))

In [None]:
# Stack the per-file tables into one frame with a fresh 0..n-1 index
fixed_rows_df = pd.concat(fixed_rows, ignore_index=True)

In [None]:
# Show the (rows, columns) size of the combined corrected rows
fixed_rows_df.shape

In [None]:
# Convert the interview date and the submission timestamp to datetime values
fixed_rows_df = fixed_rows_df.assign(
    **{
        "Fecha de la entrevista": lambda frame: pd.to_datetime(
            frame["Fecha de la entrevista"]
        ),
        "_submission_time": lambda frame: pd.to_datetime(frame["_submission_time"]),
    }
)

In [None]:
# Cast every column of the corrected rows to the dtype that column has in the
# main survey frame `df`, so both tables share the same dtypes.
# NOTE(review): astype() with a mapping needs every key of df.dtypes to be a
# column of fixed_rows_df — confirm both tables have the same columns.
fixed_rows_df = fixed_rows_df.astype(df.dtypes.to_dict())

In [None]:
# Verify _id integrity: count corrected rows whose _id repeats an earlier row
fixed_rows_df["_id"].duplicated().sum()

In [None]:
# Keep only first modification (first day row was modified).
# "First" means the earliest row in concatenation order, so which row is kept
# depends on the order in which the corrected files were read.
fixed_rows_df = fixed_rows_df.drop_duplicates(subset="_id", keep="first")

In [None]:
# Composite key: interviewer name (blank when missing) followed by the _id
fixed_rows_df["new_id"] = (
    fixed_rows_df["Nombre del encuestador"]
    .fillna("")
    .add(fixed_rows_df["_id"].astype(str))
)

In [None]:
# Compare the table size with the number of distinct new_id values;
# the row count and the distinct count match when new_id is unique
fixed_rows_df.shape, fixed_rows_df["new_id"].unique().shape

In [None]:
# Build the same composite key (interviewer name + _id) on the main table
df["new_id"] = df["Nombre del encuestador"].fillna("").add(df["_id"].astype(str))

In [None]:
# Compare the main table size with its number of distinct new_id values
df.shape, df["new_id"].unique().shape

In [None]:
# Keep only the corrected rows whose new_id also exists in the main df
# NOTE(review): the later cells shown in this notebook filter and concatenate
# with fixed_rows_df, not with this frame — confirm which one is intended.
fixed_rows_wo_dups = fixed_rows_df.loc[fixed_rows_df["new_id"].isin(df["new_id"])]

In [None]:
# Verify new_id integrity: count rows whose new_id repeats an earlier row
fixed_rows_wo_dups.duplicated("new_id").sum()

In [None]:
# Show the (rows, columns) size of the corrected rows that match the main df
fixed_rows_wo_dups.shape

In [None]:
# Verify _id integrity of main df: count rows whose _id repeats an earlier row
df["_id"].duplicated().sum()

In [None]:
# Count the corrected rows whose new_id is present in the main df.
# All corrected rows are present only if this equals len(fixed_rows_df).
fixed_rows_df["new_id"].isin(df["new_id"].values).sum()

In [None]:
# Main-table rows that were not corrected: drop every new_id found in fixed_rows_df
df_kobo = df.loc[~df["new_id"].isin(fixed_rows_df["new_id"])]

In [None]:
# Verify number of obs before 12/02: count the uncorrected main-table rows
# whose interview date is earlier than 2023-02-12
(df_kobo["Fecha de la entrevista"] < datetime(2023, 2, 12)).sum()

Hay 5 observaciones más en el archivo principal que en los modificados.

In [None]:
# Show the sizes of the two tables that are about to be concatenated
df_kobo.shape, fixed_rows_df.shape

In [None]:
# Count repeated new_id values within each table before concatenating them
df_kobo["new_id"].duplicated().sum(), fixed_rows_df["new_id"].duplicated().sum()

In [None]:
# Final dataset: uncorrected main-table rows followed by the corrected rows,
# re-indexed 0..n-1.
# NOTE(review): this appends fixed_rows_df, which may include rows whose new_id
# is absent from the main df (fixed_rows_wo_dups excludes them) — confirm intent.
complete_df = pd.concat([df_kobo, fixed_rows_df], ignore_index=True)

In [None]:
# Count repeated new_id values in the merged table
complete_df["new_id"].duplicated().sum()

In [None]:
# Count merged rows whose interview date is earlier than 2023-02-12
(complete_df["Fecha de la entrevista"] < datetime(2023, 2, 12)).sum()

In [None]:
# Write the merged table to Excel without the index column.
# The output directory is created first so the export also works on a fresh
# checkout where "outputs/" does not exist yet.
output_path = Path("outputs/complete_df.xlsx")
output_path.parent.mkdir(parents=True, exist_ok=True)
complete_df.to_excel(output_path, index=False)