# <u>MAIN CODE</u>

In [None]:
import time
import os
import pandas as pd
import numpy as np
import re

# remembers the launch directory, lists the source workbooks, prepares the
# container for the per-file dataframes and starts the timer used by the logs
absolute_path = os.getcwd()
urls = [
    "http://www.creciendoconderechos.gob.cl/docs/Rendimiento_Escolar_Basica.xlsx",
    "http://www.creciendoconderechos.gob.cl/docs/Rendimiento_Escolar_Media.xlsx",
]
df_list = list()
before = time.perf_counter()

# creates the requested folder if needed and makes it the working directory
def change_dir(path):
    """Make *path* the current working directory, creating it if missing.

    Args:
        path: Directory to switch into. May be relative to the current
            working directory and may contain several missing levels.

    Raises:
        OSError: If the directory cannot be created or entered, for example
            when *path* already exists but is a regular file.
    """
    # makedirs(exist_ok=True) also builds missing parent folders and does not
    # fail when the folder shows up between an existence check and the
    # creation, which the previous isdir()/mkdir() pair could not guarantee.
    os.makedirs(path, exist_ok=True)
    os.chdir(path)

# switches into ./data_temp (created if missing); the script goes back to
# absolute_path before exporting the final CSV files
change_dir("data_temp")

In [None]:
# builds the header row: six identification columns followed by seven
# indicator columns for every year from 2010 to 2017
hrow = ["region_code", "region_name", "province_code", "province_name", "commune_code", "commune_name"]
hrow += [
    prefix + str(year)
    for year in range(2010, 2018)
    for prefix in ["total_", "prom_num_", "prom_perc_", "rep_num_", "rep_perc_", "drop_num_", "drop_perc_"]
]

# processing
# Each workbook is reshaped into a tidy table with the columns
# commune_code, total, year, promotion_id and education_id, then stored in df_list.

# lookup tables: column-name prefix -> promotion_id, file-name suffix -> education_id
promotion_ids = {"prom": 1, "rep": 2, "drop": 3}
education_ids = {"Basica": 1, "Media": 2}

# the selected columns are the same for every workbook, so the list is built once
sel_cols = ["commune_code"] + [
    name + str(yr)
    for yr in range(2010, 2018)
    for name in ["prom_num_", "rep_num_", "drop_num_"]
]

for url in urls:
    print("{:.2f} s | Opening {}".format(time.perf_counter()-before, url))
    df = pd.read_excel(url, header = None, sheet_name = "Información Base Comunal", skiprows = list(range(5)))
    df.columns = hrow

    # selects necessary columns
    df = df[sel_cols]
    print("{:.2f} s | Columns selected.".format(time.perf_counter()-before))

    # melts columns to make dataframe tidy (everything except commune_code)
    df = pd.melt(df, id_vars = "commune_code", value_vars = sel_cols[1:], var_name = "status_year", value_name = "total")
    print("{:.2f} s | Melted columns.".format(time.perf_counter()-before))

    # creates year column: the four digits embedded in names such as "prom_num_2010"
    # (vectorized, and a raw string so that \d is not an invalid escape sequence)
    df["year"] = df["status_year"].str.extract(r"(\d{4})", expand = False)
    print("{:.2f} s | Year column created.".format(time.perf_counter()-before))

    # creates status column: names matching none of the keys map to NaN
    df["promotion_id"] = df["status_year"].str.extract("(prom|rep|drop)", expand = False).map(promotion_ids)
    print("{:.2f} s | Status column created.".format(time.perf_counter()-before))

    # drops status_year column and NaN rows on commune_code
    df = df.drop(columns = "status_year").dropna(subset = ["commune_code"])
    print("{:.2f} s | Dropped extra columns and NaN values.".format(time.perf_counter()-before))

    # creates education column: the level is the last word of the file name
    # ("Basica" or "Media"); the dot before "xlsx" is escaped to match literally
    level = re.search(r"_(.+?)_(.+?)\.xlsx", url).group(2)
    # assign() returns a new frame, so the constant is not written into a
    # possible view left behind by dropna()
    df = df.assign(education_id = education_ids[level])
    print("{:.2f} s | Education column created.".format(time.perf_counter()-before))

    df_list.append(df)
    print("{:.2f} s | Appended dataframe.".format(time.perf_counter()-before))

In [None]:
# stacks the per-file dataframes collected in df_list into a single table
df = pd.concat(df_list, ignore_index=True)
print("{:.2f} s | Concatenated each dataframe.".format(time.perf_counter()-before))

# attaches the datachile official ID of each commune and keeps only the
# columns needed for the output, renaming the ID column to comuna_id
df_ids = pd.read_csv("https://raw.githubusercontent.com/datachile/datachile-etl/master/official_ids/2017_06_27_comunas_datachile_fixed.csv")
df = df.merge(df_ids, left_on = "commune_code", right_on = "comuna_customs_id")
output_cols = ["comuna_datachile_id", "year", "education_id", "promotion_id", "total"]
df = df[output_cols].rename(columns = {"comuna_datachile_id": "comuna_id"})

# replaces the "-" placeholder found in the total column with 0
df = df.assign(total = df["total"].replace("-",0))

# converts every column to a numeric type, downcasting to the smallest integer type where possible
df = df.apply(pd.to_numeric, downcast = "integer")

# returns to the launch directory, switches into data_final (created if
# missing) and writes the main table there
os.chdir(absolute_path)
change_dir("data_final")
df.to_csv("mds_abandonment_rate.csv", index = False)
print("{:.2f} s | Exported CSV file.".format(time.perf_counter()-before))

# lookup table describing each promotion_id
stat_tb = {
    "promotion_id": [1, 2, 3],
    "name_es": ["Aprobados", "Reprobados", "Abandonos"],
    "name_en": ["Promoted", "Repeated", "Abandoned"],
}
stat_df = pd.DataFrame(stat_tb)
stat_df.to_csv("promotion.csv", index = False)
print("{:.2f} s | Exported promotion.csv".format(time.perf_counter()-before))

# lookup table describing each education_id
edu_tb = {
    "education_id": [1, 2],
    "name_es": ["Enseñanza Básica", "Enseñanza Media"],
    "name_en": ["Elementary School", "High School"],
}
edu_df = pd.DataFrame(edu_tb)
edu_df.to_csv("education.csv", index = False)
print("{:.2f} s | Exported education.csv".format(time.perf_counter()-before))

# leaves the working directory where the script started
os.chdir(absolute_path)