# Cleaning Dataframe

In [5]:
import pandas as pd
import openpyxl

In [10]:
# Function that cleans the dataframe
def Data_Cleaner(data_path: str) -> pd.DataFrame:
    """Load the "DATA" sheet of an Excel workbook and return a tidied copy.

    The cleaning steps are, in order:

    1. drop the unused ``origin`` and ``asylum`` columns,
    2. replace the abbreviations in the ``PT`` column with readable labels
       (values without a mapping are left untouched),
    3. rename the remaining columns to descriptive snake_case names,
    4. remove every row whose origin code is ``TIB``.

    Args:
        data_path: Path of the Excel workbook to read.

    Returns:
        The cleaned dataframe. Removed rows leave gaps in the index; it is
        not reset.
    """
    # Readable labels for the abbreviations found in the "PT" column.
    category_labels = {
        'REF': 'Refugee',
        'ASY': 'Asylum-seekers',
        'ROC': 'People in refugee-like situation',
        'OIP': 'Other people in need of international protection',
    }

    # Descriptive names for the columns that are kept.
    readable_columns = {
        "OriginISO": "country_of_origin_abbr",
        "OriginName": "country_of_origin_name",
        "AsylumISO": "country_of_asylum_abbr",
        "AsylumName": "country_of_asylum_name",
        "AsylumRegion": "region_of_asylum",
        "PT": "category",
        "Year": "year",
        "Count": "count",
    }

    tidy = (
        pd.read_excel(data_path, sheet_name="DATA")
        .drop(columns=['origin', 'asylum'])
        .replace({"PT": category_labels})
        .rename(columns=readable_columns)
    )

    # Remove the rows whose origin code is 'TIB'.
    tib_mask = tidy['country_of_origin_abbr'].isin(['TIB'])
    tidy = tidy.drop(tidy.index[tib_mask])

    # TODO: delete 'Not classified' (noted as empty)
    # TODO: fix West Bank and Gaza problem

    return tidy

def Clean_Population_Data(data_path: str) -> pd.DataFrame:
    """Load the population CSV and drop the columns and rows that are not used.

    The first four lines of the file are skipped, so the fifth line is read
    as the header row.

    Args:
        data_path: Path of the CSV file to read.

    Returns:
        The dataframe without the "Indicator Name" / "Indicator Code"
        columns, without any header-less "Unnamed: N" columns, and without
        the rows for "West Bank and Gaza" and "Not classified". Removed rows
        leave gaps in the index; it is not reset.

    Raises:
        KeyError: If "Indicator Name", "Indicator Code" or "Country Name"
            is missing from the file.
    """
    data = pd.read_csv(data_path, skiprows=4)

    # pandas labels a header-less column "Unnamed: N", where N is the
    # column's position. The position changes with the number of columns in
    # the export, so detect these columns instead of hard-coding one label.
    unnamed_columns = [
        name for name in data.columns if str(name).startswith("Unnamed:")
    ]
    columns_to_drop = ["Indicator Name", "Indicator Code", *unnamed_columns]
    data_new = data.drop(columns=columns_to_drop)

    # Not classified has no data, West Bank and Gaza has to be analyzed by itself
    excluded = data_new['Country Name'].isin(['West Bank and Gaza', 'Not classified'])
    data_nn = data_new.drop(data_new.index[excluded])

    # TODO: delete the year 2024 from asylum
    # TODO: Gaza should be deleted from the df but exported to a new csv file

    return data_nn

def Clean_Flags_Data(data_path: str) -> pd.DataFrame:
    """Load the flags CSV and return it without the unused columns.

    Args:
        data_path: Path of the CSV file to read.

    Returns:
        The dataframe with the "Country" and "Alpha-2 code" columns removed;
        every other column is kept as read.
    """
    unused_columns = ["Country", "Alpha-2 code"]
    return pd.read_csv(data_path).drop(columns=unused_columns)

# Run each cleaner on its raw input file and write the result to Data/Clean.
# Paths use forward slashes so they resolve on both Windows and POSIX systems.
# Every export passes index=False so the row index is not written as an
# extra unnamed column.
clean_data_asylum = Data_Cleaner("./Data/Raw/UNHCR_Flow_Data.xlsx")
clean_data_asylum.to_csv("./Data/Clean/Asylum_data.csv", index=False)

clean_data_population = Clean_Population_Data("./Data/Raw/API_SP.POP.TOTL_DS2_en_csv_v2_87.csv")
clean_data_population.to_csv("./Data/Clean/Population_data.csv", index=False)

clean_flags_data = Clean_Flags_Data("./Data/Raw/flags_iso.csv")
clean_flags_data.to_csv("./Data/Clean/Flags.csv", index=False)

