# Parsing military deployment

In [2]:
import numpy as np
import re
import pandas as pd
import polars as pl
import warnings
warnings.filterwarnings("ignore")

In [9]:
# Load the raw paragraphs workbook.
# A forward slash is used as the separator so the relative path resolves on
# Windows, Linux and macOS alike (a backslash only works on Windows).
paragraphs = pd.read_excel("Data/paragraphs.xlsx")
paragraphs.head()

Unnamed: 0,country_co,paragraph,year,COUNTRY_CO,deployer,cowc1,ccode1,year_deployer,year_ccode1
0,Canada CAN,\nCanadian Dollar $\n2011\n2012\n2013\nGDP \n...,2012,CANADA CAN,Canada,CAN,20,2012_Canada,2012_20
1,United States US,\nUnited States Dollar $\n2011\n2012\n2013\nG...,2012,UNITED STATES US,United States,USA,2,2012_United States,2012_2
2,Albania ALB,\nAlbanian Lek \n2011\n2012\n2013\nGDP \nlek ...,2012,ALBANIA ALB,Albania,ALB,339,2012_Albania,2012_339
3,Albania ALB,\nAlbanian Lek \n2011\n2012\n2013\nGDP \nlek ...,2012,ALBANIA ALB,Albania,ALB,339,2012_Albania,2012_339
4,Austria A,UT \nEuro € \n2011\n2012\n2013\nGDP \n€\n301bn...,2012,AUSTRIA A,Austria,AUS,305,2012_Austria,2012_305


### Retrieving deployment section

In [15]:
def find_deployment(paragraph):
    """Return the text that follows a ``DEPLOYMENT`` heading in ``paragraph``.

    The heading must start on a new line (``"\\nDEPLOYMENT"``). The returned
    text is cut off at the first ``"\\nFOREIGN FORCES"`` heading, if one
    follows. When ``DEPLOYMENT`` occurs more than once, only the text between
    the first and the second occurrence is returned.

    Parameters
    ----------
    paragraph : str
        Full text of one entry.

    Returns
    -------
    str
        The deployment section, or the sentinel ``'no deployment'`` when the
        heading is absent.
    """
    # Raw strings: '\s' inside a normal string literal is an invalid escape
    # sequence and triggers a SyntaxWarning on recent Python versions.
    deployment_heading = re.compile(r'\nDEPLOYMENT')
    foreign_forces_heading = re.compile(r'\nFOREIGN\sFORCES')

    parts = deployment_heading.split(paragraph)
    if len(parts) < 2:
        # No heading found: split() returned the input unchanged.
        return 'no deployment'

    # split()[0] is the whole string when the second heading is absent, so no
    # separate existence check is needed.
    return foreign_forces_heading.split(parts[1])[0]

In [16]:
%%time
# Extract the deployment section of every paragraph; the function is passed
# directly because wrapping it in a lambda adds nothing.
paragraphs['deployment'] = paragraphs['paragraph'].apply(find_deployment)

CPU times: total: 46.9 ms
Wall time: 43.6 ms


### Retrieving foreign forces section

In [17]:
def find_foreign_forces(paragraph):
    """Return the text that follows a ``FOREIGN FORCES`` heading.

    The heading must start on a new line; the two words may be separated by
    any single whitespace character. When the heading occurs more than once,
    only the text between the first and the second occurrence is returned.

    Parameters
    ----------
    paragraph : str
        Full text of one entry.

    Returns
    -------
    str
        The foreign-forces section, or the sentinel ``'no foreign forces'``
        when the heading is absent.
    """
    # Raw string: '\s' inside a normal string literal is an invalid escape
    # sequence and triggers a SyntaxWarning on recent Python versions.
    foreign_forces_heading = re.compile(r'\nFOREIGN\sFORCES')

    parts = foreign_forces_heading.split(paragraph)
    if len(parts) < 2:
        # No heading found: split() returned the input unchanged.
        return 'no foreign forces'
    return parts[1]

In [18]:
%%time
# Extract the foreign-forces section of every paragraph; the function is
# passed directly instead of through a pass-through lambda.
paragraphs['foreign_forces_section'] = paragraphs['paragraph'].apply(find_foreign_forces)

CPU times: total: 31.2 ms
Wall time: 30.7 ms


### Parsing deployment section

In [19]:
# Keep only the rows where a deployment section was found.
# .copy() makes this an independent frame: later cells add columns to it, and
# assigning into a filtered view of `paragraphs` is the SettingWithCopy hazard.
deployment_data = paragraphs[paragraphs['deployment'] != 'no deployment'].copy()

In [20]:
# Lookup table of host names; its `host_balance` column is used further down
# to split deployment sections and as the merge key.
host = pd.read_excel("host.xlsx")
host.head(5)

Unnamed: 0,host_balance,host,ccode2,cowc2
0,AFGHANISTAN,Afghanistan,700,AFG
1,ALBANIA,Albania,339,ALB
2,ALGERIA,Algeria,615,ALG
3,ANDORRA,Andorra,232,AND
4,ANGOLA,Angola,540,ANG


In [22]:
%%time
# NOTE(review): `parse_deployment_section` is not defined in any visible cell
# of this notebook (execution count 21 is missing), so this line presumably
# fails with NameError on a fresh kernel - confirm and either restore the
# definition or drop this cell. The resulting `deployment_parsed` column is not
# among the columns selected for `deployment_data_clear` below.
deployment_data['deployment_parsed'] = deployment_data['deployment'].apply(lambda x: parse_deployment_section(x) )

CPU times: total: 188 ms
Wall time: 197 ms


In [23]:
def _split_deployment_by_host(deployment_section):
    """Split a deployment section on the names listed in ``host['host_balance']``.

    Returns the non-empty, whitespace-stripped pieces in their original order;
    because the pattern is wrapped in a capturing group, the matched names are
    kept as pieces of their own. A leading piece containing the word
    ``'provisions'`` is dropped. An empty or whitespace-only section yields an
    empty list.
    """
    # The join puts '\n' in front of every name except the first one, so the
    # first name matches anywhere while the others must start a new line.
    # NOTE(review): names are inserted into the regex unescaped - confirm that
    # none of them contains regex metacharacters.
    pattern = re.compile(r'(' + '|\n'.join(host['host_balance'])   + ')')
    pieces = [piece.strip() for piece in pattern.split(deployment_section) if piece]
    pieces = [piece for piece in pieces if piece]
    # Guard on `pieces`: indexing [0] on an empty list would raise IndexError.
    if pieces and 'provisions' in pieces[0]:
        pieces = pieces[1:]
    return pieces


def get_hosts(deployment_section):
    """Return every even-indexed piece (0, 2, 4, ...) of the split section.

    These are the host names when the section starts with a host name.

    Parameters
    ----------
    deployment_section : str
        Text of one deployment section.

    Returns
    -------
    list of str
        Possibly empty.
    """
    return _split_deployment_by_host(deployment_section)[0::2]


def get_additional_info(deployment_section):
    """Return every odd-indexed piece (1, 3, 5, ...) of the split section.

    These are the texts following each host name when the section starts with
    a host name.

    Parameters
    ----------
    deployment_section : str
        Text of one deployment section.

    Returns
    -------
    list of str
        Possibly empty.
    """
    return _split_deployment_by_host(deployment_section)[1::2]

In [24]:
%%time
# Host names found in each deployment section.
deployment_data['host_balance'] = deployment_data['deployment'].apply(get_hosts)

# Text attached to each of those hosts, in the same order.
deployment_data['Additional_Info'] = deployment_data['deployment'].apply(get_additional_info)

CPU times: total: 375 ms
Wall time: 375 ms


In [25]:
# Number of hosts and number of info texts extracted per row.
deployment_data['host_len'] = deployment_data['host_balance'].apply(len)
deployment_data['add_info'] = deployment_data['Additional_Info'].apply(len)

# A row needs manual checking when the two counts disagree.
deployment_data['need_check'] = np.where(
    deployment_data['host_len'].ne(deployment_data['add_info']), 1, 0
)

# Absolute number of flagged rows and their share (in percent) of all rows.
error_abolute = len(deployment_data[deployment_data['need_check'].eq(1)])
error_percentage = round(100 * error_abolute / len(deployment_data), 2)

print(
    "Функция ошибается всего на {} случаях деплоймента - т е одной страны в отдельный год."
    "\nЭто {} от удачного числа спарсенных деплойментов".format(error_abolute, error_percentage)
)

print("")

# Keep only the consistent rows and the columns needed downstream.
deployment_data_clear = deployment_data.loc[
    deployment_data['need_check'].ne(1),
    ['deployment', 'ccode1', 'deployer', 'host_balance', 'Additional_Info', 'year'],
]

Функция ошибается всего на 1 случаях деплоймента - т е одной страны в отдельный год.
Это 0.06 от удачного числа спарсенных деплойментов



In [29]:
# Flatten the per-row info lists first; they line up positionally with the
# exploded hosts because every kept row has equally long lists.
add_info = deployment_data_clear['Additional_Info'].explode().to_list()

# One row per (deployer, host) pair, enriched with the host lookup columns.
balance_parsed = (
    deployment_data_clear
    .explode('host_balance')
    .assign(Additional_Info=add_info)
    .merge(host, how='left', on='host_balance')
)
balance_parsed[['ccode1', 'deployer', 'host', 'ccode2', 'cowc2', 'Additional_Info']]

Unnamed: 0,ccode1,deployer,host,ccode2,cowc2,Additional_Info
0,20,Canada,Afghanistan,700,AFG,NATO • ISAF (NTM-A) • Operation Attention 529
1,20,Canada,Arabian Sea,no_ccode2,no_cowc2,& GULF OF ADEN\nCombined Maritime Forces • CTF...
2,20,Canada,Cyprus,352,CYP,UN • UNFICYP (Operation Snowgoose) 1
3,20,Canada,Democratic Republic of the Congo,490,DRC,UN • MONUSCO (Operation Crocodile) 8 obs
4,20,Canada,Egypt,651,EGY,MFO (Operation Calumet) 28
...,...,...,...,...,...,...
9906,552,Zimbabwe,Democratic Republic of the Congo,490,DRC,: UN • \nMONUSCO 3
9907,552,Zimbabwe,Mozambique,541,MZM,: SADC • SAMIM 1
9908,552,Zimbabwe,Mozambique,541,MZM,: SADC • SAMIM 1
9909,552,Zimbabwe,South Sudan,626,SSD,: UN • UNMISS 14
