# UCDP parser

This parser extracts the information we deem necessary from the [UCDP dataset](http://ucdp.uu.se) and discards the rest.

In [1]:
import pandas as pd

In [2]:
# Defining IO paths (relative paths).
# NOTE: despite the `.csv` extension, the two input files are read and the
# output file is written with compression='gzip' further down in this notebook.
raw_ucdp_dataset_file = '../data/raw/raw_ucdp.csv'
parsed_ucdp_dataset_file = '../data/parsed/parsed_ucdp.csv'
parsed_country_nationality_dataset_file = '../data/parsed/parsed_country_nationality.csv'

In [3]:
# Load the raw UCDP dataset (indexed by its 'id' column) ...
ucdp_df = pd.read_csv(
    raw_ucdp_dataset_file,
    compression='gzip',
    index_col='id',
)

# ... and the already parsed country/nationality dataset (indexed by 'ID')
country_nationality_df = pd.read_csv(
    parsed_country_nationality_dataset_file,
    compression='gzip',
    index_col='ID',
)

In [4]:
# Raw UCDP column -> column name used in the parsed dataset.
# Only the columns listed here are kept in the main dataframe.
interesting_column_rename_dict = {
    'year': 'Year',
    'type_of_violence': 'Type',
    'conflict_name': 'Conflict Name',
    'country': 'Country',
    'date_start': 'Date Start',
    'date_end': 'Date End',
}

# Raw and renamed column lists, derived from the mapping so they cannot
# drift out of sync
ucdp_interesting_columns = list(interesting_column_rename_dict.keys())
ucdp_interesting_column_names = list(interesting_column_rename_dict.values())

# Raw columns that are not kept as-is but are passed to
# get_total_casualties() to compute the casualties of each row
ucdp_auxiliary_columns = [
    'side_a',
    'side_b',
    'deaths_a',
    'deaths_b',
    'deaths_civilians',
    'deaths_unknown',
]

# Split off the auxiliary dataframe first, since the next statement drops
# those columns from the main dataframe
auxiliary_df = ucdp_df[ucdp_auxiliary_columns]

# Keep only the interesting columns and give them their final names
ucdp_df = ucdp_df[ucdp_interesting_columns].rename(
    columns=interesting_column_rename_dict
)

Sample:

In [5]:
# Preview the first rows after column selection and renaming
ucdp_df.head()

Unnamed: 0_level_0,Year,Type,Conflict Name,Country,Date Start,Date End
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
4,2010,1,Yemen (North Yemen):Government,Yemen (North Yemen),2010-09-25,2010-09-25
5,2011,3,Government of Yemen (North Yemen) - Civilians,Yemen (North Yemen),2011-02-19,2011-02-19
6,2011,1,Yemen (North Yemen):Government,Yemen (North Yemen),2011-04-16,2011-04-16
7,2012,1,Yemen (North Yemen):Government,Yemen (North Yemen),2012-06-04,2012-06-04
10,2012,1,Yemen (North Yemen):Government,Yemen (North Yemen),2012-12-09,2012-12-09


In [6]:
def get_total_casualties(auxiliary_df):
    '''
    Calculate the total amount of casualties for every row of the
    given dataframe.

    Parameters:
        auxiliary_df: dataframe holding the columns 'side_a', 'side_b',
            'deaths_a', 'deaths_b', 'deaths_civilians' and
            'deaths_unknown'.

    Returns:
        A dataframe with a single 'Casualties' column, indexed exactly
        like the input dataframe (also when the input is empty).

    The total is deaths_a + deaths_b + deaths_unknown, plus
    deaths_civilians unless one of the two sides is 'Civilians'.

    NOTE(review): confirm against the UCDP codebook that leaving out
    deaths_civilians when a side is 'Civilians' does not undercount
    those rows.
    '''
    # Deaths attributed to either side, or to nobody, always count
    totals = (
        auxiliary_df['deaths_a']
        + auxiliary_df['deaths_b']
        + auxiliary_df['deaths_unknown']
    )

    # Civilian deaths are only added when neither side is 'Civilians'
    no_civilian_side = (
        (auxiliary_df['side_a'] != 'Civilians')
        & (auxiliary_df['side_b'] != 'Civilians')
    )
    civilian_deaths = auxiliary_df['deaths_civilians'].where(no_civilian_side, 0)

    return (totals + civilian_deaths).to_frame(name='Casualties')

In [7]:
# Compute the total casualties of every row of the auxiliary dataframe
casualties_df = get_total_casualties(auxiliary_df)

# Attach the 'Casualties' column to the main dataframe, matching rows on
# their index
ucdp_df = pd.merge(
    ucdp_df,
    casualties_df,
    how='inner',
    left_index=True,
    right_index=True,
)

Sample:

In [8]:
# Preview the first rows, now including the 'Casualties' column
ucdp_df.head()

Unnamed: 0_level_0,Year,Type,Conflict Name,Country,Date Start,Date End,Casualties
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
4,2010,1,Yemen (North Yemen):Government,Yemen (North Yemen),2010-09-25,2010-09-25,2
5,2011,3,Government of Yemen (North Yemen) - Civilians,Yemen (North Yemen),2011-02-19,2011-02-19,0
6,2011,1,Yemen (North Yemen):Government,Yemen (North Yemen),2011-04-16,2011-04-16,0
7,2012,1,Yemen (North Yemen):Government,Yemen (North Yemen),2012-06-04,2012-06-04,5
10,2012,1,Yemen (North Yemen):Government,Yemen (North Yemen),2012-12-09,2012-12-09,5


In [9]:
# Parse country names: keep only the text before the first '(' so that
# e.g. 'Yemen (North Yemen)' becomes 'Yemen'
ucdp_df['Country'] = ucdp_df['Country'].apply(
    lambda name: name.split('(')[0].strip()
)

# Build a lookup table with one 'Common Name' column and one 'Code' column
# (the code is the index of the country/nationality dataset)
country_codes_df = country_nationality_df['Common Name'].to_frame()
country_codes_df['Code'] = country_codes_df.index
# reset_index returns a new dataframe, so the result has to be assigned
country_codes_df = country_codes_df.reset_index(drop=True)

# Replace country names with codes.
# NOTE: this is an inner merge, so rows whose country name has no exact
# match in 'Common Name' are dropped. Because it merges on columns, the
# result gets a fresh 0..n-1 index instead of the original 'id' index.
ucdp_df = ucdp_df.merge(
    country_codes_df,
    left_on='Country',
    right_on='Common Name',
    how='inner',
).drop(['Country', 'Common Name'], axis=1)
ucdp_df = ucdp_df.rename(columns={'Code': 'Country'})

Sample:

In [10]:
# Preview the first rows after replacing country names with country codes
ucdp_df.head()

Unnamed: 0,Year,Type,Conflict Name,Date Start,Date End,Casualties,Country
0,2010,1,Yemen (North Yemen):Government,2010-09-25,2010-09-25,2,YE
1,2011,3,Government of Yemen (North Yemen) - Civilians,2011-02-19,2011-02-19,0,YE
2,2011,1,Yemen (North Yemen):Government,2011-04-16,2011-04-16,0,YE
3,2012,1,Yemen (North Yemen):Government,2012-06-04,2012-06-04,5,YE
4,2012,1,Yemen (North Yemen):Government,2012-12-09,2012-12-09,5,YE


In [11]:
# Move the current index into a regular column so that it is part of the
# written data, then save the dataframe as a gzip-compressed csv file
ucdp_df = ucdp_df.reset_index()
ucdp_df.to_csv(
    parsed_ucdp_dataset_file,
    index=False,
    encoding='utf-8',
    compression='gzip',
)