In [210]:
from countrymerger import *
import pandas as pd
import numpy as np
import warnings
from datetime import date

def convert_country_df(data: pd.DataFrame, data_id_col: str, separator: str = None, standard_to_convert: str = "STATE_en_UN", warning = True):
    """Convert the country identifiers in one column of a DataFrame to another standard.

    Every cell of ``data_id_col`` is split on ``separator`` (when one is given)
    and each identifier is replaced as a whole token, using the mapping built
    by ``get_id_dict``. The separators and the whitespace around identifiers
    are left exactly as they were. Identifiers that ``get_id_dict`` could not
    resolve map to themselves and therefore stay unchanged.

    ---
    Parameters:
        data (Pandas.DataFrame): Data with country identifiers
        data_id_col (str): Country id column name
        separator (str): separator of country identifiers in a cell (can be string or None)
        standard_to_convert (str): standard to convert to from the list of supported standards, to see the list consult countrymerger.KEY_COLUMNS. Default is STATE_en_UN for UN standard country names (http://unterm.un.org).
        warning (bool): issue warnings when unable to identify a country id.

    ---
    Returns:
        data_replaced (Pandas.DataFrame): copy of ``data`` whose ``data_id_col``
            holds the converted identifiers as strings; ``data`` itself is not modified.
    """
    replacement_dict = get_id_dict(data[data_id_col], separator, standard_to_convert, warning)
    data_replaced = data.copy()
    if not replacement_dict:
        # Nothing to convert (e.g. the column holds no identifiers): leave the column untouched.
        return data_replaced

    def _convert_cell(cell):
        # Cells are stringified (a missing value becomes "nan"); this matches the
        # previous behaviour, so downstream string operations keep working.
        text = str(cell)
        pieces = text.split(separator) if separator is not None else [text]
        converted = []
        for piece in pieces:
            identifier = piece.strip()
            # Replace whole identifiers only. Substring replacement would also
            # rewrite parts of longer names (e.g. "Niger" inside "Nigeria"), and
            # an empty key would insert text between every character.
            if identifier and identifier in replacement_dict:
                lead = len(piece) - len(piece.lstrip())
                piece = piece[:lead] + str(replacement_dict[identifier]) + piece[lead + len(identifier):]
            converted.append(piece)
        return separator.join(converted) if separator is not None else converted[0]

    data_replaced[data_id_col] = data_replaced[data_id_col].apply(_convert_cell)
    return data_replaced


def get_id_set(country_series: pd.Series, separator: str = None):
    """Return the set of distinct country identifiers found in a series.

    Parameters:
        country_series (Pandas.Series): cells holding one identifier, or several
            identifiers joined by ``separator``. Missing cells are skipped.
        separator (str): separator between identifiers inside a cell; ``None``
            means every cell holds a single identifier.

    Returns:
        country_set (set): identifiers as strings with surrounding whitespace
            removed. Empty tokens (blank cells, trailing separators) are dropped
            because an empty string is not an identifier.
    """
    country_set = set()
    for cell in country_series:
        if pd.isna(cell):
            continue
        text = str(cell)
        # str.split returns the whole text as a single item when the separator
        # is absent, so no separate "separator not in cell" branch is needed.
        tokens = text.split(separator) if separator is not None else [text]
        for token in tokens:
            identifier = token.strip()
            if identifier:
                country_set.add(identifier)
    return country_set

def get_id_dict(country_series: pd.Series, separator: str, standard_to_convert: str = "STATE_en_UN", warning = True):
    """Return a dict mapping each country id found in a series to its converted id.

    The identifiers are collected with ``get_id_set`` and looked up
    case-insensitively in every column of the key table returned by
    ``loadKeyDf(load_extra=True)``. The first row containing the identifier
    wins, and the value of that row in the ``standard_to_convert`` column
    becomes the converted id.

    Parameters:
        country_series (Pandas.Series): country ids (possibly with separators)
        separator (str): separator between ids inside a cell, or None
        standard_to_convert (str): name of the key-table column to convert to
        warning (bool): warn about identifiers that are not in the key table

    Returns:
        conversion_dict (dict): ``{old id: new id}``; unknown ids map to themselves.
    """
    country_set = get_id_set(country_series, separator)

    keys_df = loadKeyDf(load_extra=True)
    converting_x = keys_df.columns.get_loc(standard_to_convert)
    keys = keys_df.values

    # Index the key table once instead of comparing the whole table against
    # every identifier. Rows are visited top to bottom and setdefault keeps the
    # first hit, i.e. the lowest row index containing the string.
    first_row_by_key = {}
    for row_number, row in enumerate(keys):
        for cell in row:
            if isinstance(cell, str):
                first_row_by_key.setdefault(cell.lower(), row_number)

    conversion_dict = {}
    for country in country_set:
        row_number = first_row_by_key.get(country.lower())
        if row_number is None:
            if warning:
                warnings.warn(f"Unknown identifier {country}, keeping as is")
            conversion_dict[country] = country  # purging unknown identifiers is currently not supported
        else:
            # NOTE(review): the converted value may itself be missing when the
            # matched row has no entry in the target standard - confirm how that
            # should be handled.
            conversion_dict[country] = keys[row_number, converting_x]
    return conversion_dict

In [211]:
# Load the security-council CSV from the input data folder.
df = pd.read_csv("../data_in/int_law - security_council.csv")
# Convert the identifiers in the 'Country' column (several per cell, separated
# by '||') to the "COW_Country_Code" standard; `df` itself is left unchanged.
df_converted = convert_country_df(df, 'Country', '||', "COW_Country_Code")

In [212]:
# Parse 'Date' into pandas datetimes; map_data later calls .date() on each of these values.
df_converted['Date'] = pd.to_datetime(df_converted['Date'])

In [335]:
def get_undoc_type(elems, meeting_record, type_identifier="RES/", return_nontype=False):
    """Select document URLs of one type from a '||'-separated string of URLs.

    Parameters:
        elems: string of URLs separated by '||' (or a missing value).
        meeting_record: meeting-record reference inserted into the note that is
            attached to non-adopted resolutions.
        type_identifier (str): substring that marks a URL as being of the
            requested type. The special cases below only trigger when it is
            exactly "/S/RES/" or "/S/PRST/"; the default "RES/" performs plain
            substring selection.
        return_nontype (bool): when True, return the URLs that do NOT contain
            ``type_identifier`` instead (no special cases are applied).

    Returns:
        The selected URLs joined with " || ", or ``np.nan`` when ``elems`` is
        missing or nothing was selected.
    """
    if pd.isna(elems):
        return np.nan

    # Split inline so the function does not depend on a helper defined in a
    # later notebook cell (which fails on a fresh top-to-bottom run).
    urls = [url.strip() for url in str(elems).split("||")]

    selected = []
    for url in urls:
        matches_type = type_identifier in url

        if return_nontype:
            if not matches_type:
                selected.append(url)
            continue

        if matches_type:
            if type_identifier == "/S/RES/":
                selected.append(url.replace(" (", "("))  # fixing URLs
            else:
                selected.append(url)
        elif type_identifier == "/S/RES/":
            # special treatment of non-adopted resolutions: a URL without
            # "/S/RES/" but containing "/S/20" or "/S/19" (presumably a
            # year-numbered document symbol such as /S/2021/990) is kept and flagged.
            # NOTE(review): a missing meeting_record is rendered as "nan" here.
            if "/S/20" in url or "/S/19" in url:
                selected.append(url + f" (NOT ADOPTED, consult meeting record: {meeting_record})")
        elif type_identifier == "/S/PRST/":
            # special treatment of 1992 press releases / notes
            if ("/S/23" in url or "/S/24" in url) and "/S/RES/" not in url:
                selected.append(url)

    if not selected:
        return np.nan
    return " || ".join(selected)

# Derive two columns from 'URL Security Council Outcome / Vote', row by row:
# URLs selected with the "/S/PRST/" identifier go to 'URL Press Statement' and
# URLs selected with the "/S/RES/" identifier go to 'URL Resolution'. The row's
# 'URL Meeting Record' is passed along for the note added to non-adopted resolutions.
df_converted['URL Press Statement'] = df_converted.apply(lambda x: get_undoc_type(x['URL Security Council Outcome / Vote'], x['URL Meeting Record'],"/S/PRST/"), axis=1)
df_converted['URL Resolution'] = df_converted.apply(lambda x: get_undoc_type(x['URL Security Council Outcome / Vote'], x['URL Meeting Record'],"/S/RES/"), axis=1)


In [360]:
def separate_elems(elems: str, separator: str):
    """Split a string on a separator and return the whitespace-stripped pieces.

    When ``separator`` is None or does not occur in ``elems``, the result is a
    one-item list holding the stripped input.
    """
    has_separator = separator is not None and separator in elems
    if not has_separator:
        return [elems.strip()]
    return [piece.strip() for piece in elems.split(separator)]


def map_data(df_master, df_slave, col_master_id, col_slave_id, col_master_data, col_slave_data,
             col_master_year, col_master_month, col_slave_date,
             separator_res, separator_master=None, separator_slave=None):
    """Map data from one df to another, with support for multiple identifiers with separators.

    For every row of ``df_master`` a reference date is built (the 15th of the
    row's year/month). Each identifier of the row is then matched against the
    identifiers in ``df_slave[col_slave_id]``, and the non-missing
    ``col_slave_data`` values of the matching rows closest in time are
    collected and joined with ``separator_res``.

    Closeness works by narrowing windows. Matches further than 360 days from
    the reference date are discarded; the window is then tightened to 180, 90,
    30 and 20 days for as long as the tighter window still contains at least
    one non-missing value.

    Parameters:
        df_master (Pandas.DataFrame): frame to enrich. It is modified in place:
            a 'Date' column and ``col_master_data`` are written to it.
        df_slave (Pandas.DataFrame): frame providing the values.
        col_master_id, col_slave_id (str): identifier columns of the two frames.
        col_master_data (str): column of ``df_master`` receiving the result.
        col_slave_data (str): column of ``df_slave`` holding the values.
        col_master_year, col_master_month (str): columns used for the reference date.
        col_slave_date (str): column of ``df_slave`` with datetime-like values
            (``.date()`` is called on each).
        separator_res (str): separator used to join the collected values.
        separator_master (str): separator between identifiers in a master cell
            (None: one identifier per cell).
        separator_slave (str): separator between identifiers in a slave cell
            (None: one identifier per cell).

    Returns:
        df_master (Pandas.DataFrame): the same object that was passed in.
        Rows without any match receive an empty string.
    """
    # Date windows in days: hard outer limit first, then progressively tighter ones.
    widest_window_days = 360
    narrowing_windows_days = (180, 90, 30, 20)

    def _split_ids(cell, separator):
        """Return the non-empty, stripped identifiers of a cell (empty list for a missing cell)."""
        if pd.isna(cell):
            return []
        text = str(cell)
        pieces = text.split(separator) if separator is not None else [text]
        ids = []
        for piece in pieces:
            identifier = piece.strip()
            # Integer codes that passed through a float column read "404.0";
            # compare them as "404" so both spellings match.
            if identifier.endswith(".0") and identifier[:-2].isdigit():
                identifier = identifier[:-2]
            if identifier:
                ids.append(identifier)
        return ids

    def _days_apart(stamp, reference_day):
        """Absolute distance in days; NaN for a missing date so the row fails every window test."""
        if pd.isna(stamp):
            return float("nan")
        return abs((stamp.date() - reference_day).days)

    # Identifier sets of the slave rows do not depend on the master row: build them once.
    slave_id_sets = [set(_split_ids(cell, separator_slave)) for cell in df_slave[col_slave_id]]

    def _values_for_master_ids(id_master, date_master):
        """Collect the slave values for all identifiers of one master row."""
        documents = []
        for master_id in _split_ids(id_master, separator_master):
            # Exact identifier match: substring/regex matching would let code
            # "2" also hit "20", "200", "220", ...
            is_match = pd.Series([master_id in ids for ids in slave_id_sets],
                                 index=df_slave.index, dtype=bool)
            data = df_slave[is_match].copy()  # copy: a helper column is added below
            if data.empty:
                continue
            data['delta_days'] = data[col_slave_date].apply(lambda stamp: _days_apart(stamp, date_master))
            data = data[data['delta_days'] < widest_window_days]
            for window_days in narrowing_windows_days:
                narrower = data[data['delta_days'] < window_days]
                if narrower[col_slave_data].dropna().empty:
                    # The tighter window holds no value: keep the previous, wider selection.
                    break
                data = narrower
            documents.extend(str(value) for value in data[col_slave_data].dropna())
        return separator_res.join(documents)

    df_master['Date'] = [date(int(year), int(month), 15)
                         for year, month in zip(df_master[col_master_year], df_master[col_master_month])]
    df_master[col_master_data] = [_values_for_master_ids(id_master, date_master)
                                  for id_master, date_master in zip(df_master[col_master_id], df_master['Date'])]
    return df_master

In [361]:
# Load the law-dyad table, keeping only rows that have an 'old_id'.
law_df = pd.read_csv("../data_in/int_law - law_dyad.csv").dropna(subset=['old_id'])

# Fill three source columns of law_df from the converted security-council data.
# Each pair is (column written in law_df, column read from df_converted); the
# order is: meeting records, resolutions, press statements.
for target_col, source_col in [
    ('source_unsc_transcript', 'URL Meeting Record'),
    ('source_unsc_resolution', 'URL Resolution'),
    ('source_unsc_prst', 'URL Press Statement'),
]:
    law_df = map_data(df_master=law_df, df_slave=df_converted,
                      col_master_id='refobject_ccode', col_slave_id='Country',
                      col_master_data=target_col, col_slave_data=source_col,
                      col_master_year='year_start', col_master_month='month_start',
                      col_slave_date='Date',
                      separator_res=' || ', separator_master=';', separator_slave='||')

In [362]:
# Write the enriched table to the output folder. With to_csv's default
# index=True the row index is written as well, as the first (unnamed) column.
law_df.to_csv("../data_out/int_law - law_dyad.csv")

In [269]:
# Spot check: show one derived 'URL Resolution' entry (position 102 among the non-missing values).
df_converted['URL Resolution'].dropna().iloc[102]

'https://undocs.org/en/S/2021/990 (NOT ADOPTED, consult meeting record: https://undocs.org/en/S/PV.8926)'

In [67]:
# Inspect the column names of law_df.
law_df.columns

Index(['i_dyad_id', 'old_id', 'i_name_en', 'pivot_state', 'coalition',
       'refobject', 'refobject_ccode', 'year_start', 'oldsources_lawself',
       'oldsources_lawgreat', 'source_unsc_resolution',
       'source_unsc_transcript', 'source_court_of_law', 'US_source',
       'Russia_source', 'China_source', 'UK_source', 'France_source',
       'India_source', 'Germany_source', 'court_of_law', 'ro_source', 'unsc',
       'unga'],
      dtype='object')