# Parsing country paragraphs of Military Balance

Before extracting any essential information about a country's military capabilities, it is easier to first __split__ each Military Balance (MB) annual report into per-country paragraphs that contain the data of interest. This helps avoid errors in the later stages.

In [75]:
import polars as pl
import numpy as np
import fitz
import re
import pandas as pd

#### Reading balances

In [13]:
def read_balance(report_year:int, path:str):
    """
    Load one Military Balance report from a PDF and return its text.

    `path` is used as a plain string prefix: the file opened is
    `path + "<report_year>.pdf"`, so a directory has to be passed together
    with its trailing separator.

    Returns one string holding the text of every page, with consecutive
    pages separated by a newline.
    """
    pdf_location = ''.join([path, str(report_year), ".pdf"])
    with fitz.open(pdf_location) as document:
        page_texts = [pdf_page.get_text() for pdf_page in document]
    return '\n'.join(page_texts)

In [14]:
%%time
# One text blob per report file. The range end is exclusive, so the files
# read are 2013.pdf through 2023.pdf (eleven reports) from "balances pdf/".
balances_2013_2022 = [
    read_balance(report_year, "balances pdf/")
    for report_year in range(2013, 2024)
]

CPU times: total: 20 s
Wall time: 20.3 s


#### Loading countries headings

I manually collected all possible country headings (they may vary from one balance to another) in a txt file. Some countries have problematic headings because of the PDF format, so for those countries I use the currency name as the splitting token instead of the heading.

Each heading consists of the country name plus an abbreviation code (the part that changes between balances). For instance, the Lithuanian headings are:
 - Lithuania L
 - Lithuania LTU

In [184]:
# Build `country_co`: the sorted list of heading tokens (each prefixed with a
# newline) that is later used to split a report into country sections.
with open('country_co.txt', encoding = 'utf-8') as file:
    lines = file.readlines()

# Normalise every heading: drop digits, turn newlines/tabs into spaces, trim.
# The set removes repeated headings. Blank or digit-only lines collapse to the
# empty string, which is discarded here, so a file without blank lines no
# longer raises and a bare '\n' token can never reach the split pattern.
heading_names = {
    re.sub(r'[\n\t]', ' ', re.sub(r'\d', '', line)).strip()
    for line in lines
}
heading_names.discard('')

# Headings matching this pattern are left out of the automatic list; hand-written
# replacement tokens for them (currency names or full headings) are appended below.
regex = re.compile(r'(FYROM|Macedonia|Palesti|Gaza|DPRK|Jericho\sPA|DRC|Democratic\sPeopl|Democratic\sRepublic\sof|DRC|Congo\sDRC|Democratic\sRepublic\sof\sCongo\sDRC)')

filtered_country_co = [
    heading
    for heading in ('\n' + name for name in heading_names)
    if regex.search(heading) is None
]
filtered_country_co.extend(['\nMacedonian Denar',
                            '\nNorth Korean Won',
                            '\nPalestinian Autonomous Areas',
                            '\nCongolese Franc',
                            "\nChina, People's Republic of PRC",
                            "\nCote D'Ivoire CIV"
                           ])

# '\nSouth Sudan' is dropped from the final list (the notebook does not record
# why); filtering instead of list.remove() avoids a ValueError when it is absent.
country_co = sorted(
    heading for heading in filtered_country_co if heading != '\nSouth Sudan'
)


In [185]:
def split_paragraphs(balance, year, country_co_df=None, headings=None):
    """
    Split the text of one report into per-country paragraphs.

    Parameters
    ----------
    balance : str
        Full text of one report.
    year : int
        Year label stored in the `year` column and used in the composite keys.
    country_co_df : pandas.DataFrame, optional
        Lookup table with the columns `country_co`, `deployer`, `cowc1` and
        `ccode1`. When omitted it is read from 'country_co_df.xlsx'. Passing
        it in avoids re-reading the file on every call; the frame is not
        modified.
    headings : iterable of str, optional
        Heading tokens to split on. Defaults to the module-level `country_co`.

    Returns
    -------
    pandas.DataFrame
        One row per distinct heading (first occurrence kept) with the columns
        `country_co`, `paragraph`, `year`, `COUNTRY_CO`, `deployer`, `cowc1`,
        `ccode1`, `year_deployer`, `year_ccode1`.
    """
    if headings is None:
        headings = country_co

    # Longest tokens first: regex alternation takes the first alternative that
    # matches, so 'Lithuania L' must not be tried before 'Lithuania LTU'.
    # re.escape keeps characters such as '(' or '.' in a heading literal.
    ordered_headings = sorted(set(headings), key=lambda token: (-len(token), token))
    pattern = re.compile('(' + '|'.join(re.escape(token) for token in ordered_headings) + ')')

    # With one capture group the split result is
    # [preamble, heading, text, heading, text, ...].
    parts = pattern.split(balance)
    # Pair each heading with the text that follows it BEFORE discarding empty
    # texts, so two adjacent headings cannot shift the heading/paragraph columns.
    sections = [
        (heading, text)
        for heading, text in zip(parts[1::2], parts[2::2])
        if text
    ]
    # The final heading/paragraph pair of the report is discarded.
    # NOTE(review): presumably trailing non-country material — confirm.
    sections = sections[:-1]

    df = pd.DataFrame(sections, columns=['country_co', 'paragraph'])
    df['year'] = year
    df['country_co'] = df['country_co'].str.replace('\n', '', regex=False)
    df['COUNTRY_CO'] = df['country_co'].str.upper()
    df = df.drop_duplicates(subset='country_co')

    if country_co_df is None:
        country_co_df = pd.read_excel('country_co_df.xlsx')
    # assign() returns a copy, so a caller-supplied lookup table is left untouched.
    lookup = country_co_df.assign(COUNTRY_CO=country_co_df['country_co'].str.upper())

    # Case-insensitive match of headings against the lookup table.
    df = df.merge(lookup[['COUNTRY_CO', 'deployer', 'cowc1', 'ccode1']], how='left', on='COUNTRY_CO')
    df['year_deployer'] = df['year'].astype(str) + '_' + df['deployer'].astype(str)
    df['year_ccode1'] = df['year'].astype(str) + '_' + df['ccode1'].astype(str)
    return df

In [186]:
%%time
# Split every loaded report and stack the per-report tables into one frame.
# Year labels start at 2012 and advance by one per report, in list order.
yearly_tables = []
year_balance = 2012

for balance_text in balances_2013_2022:
    yearly_tables.append(split_paragraphs(balance_text, year_balance))

    print("Balance of {}  parsed successfully".format( year_balance))
    year_balance +=1

# Concatenate once at the end instead of growing a DataFrame inside the loop;
# an empty input list still yields an empty frame rather than an error.
countries_table = pd.concat(yearly_tables) if yearly_tables else pd.DataFrame()
res = countries_table

Balance of 2012  parsed successfully
Balance of 2013  parsed successfully
Balance of 2014  parsed successfully
Balance of 2015  parsed successfully
Balance of 2016  parsed successfully
Balance of 2017  parsed successfully
Balance of 2018  parsed successfully
Balance of 2019  parsed successfully
Balance of 2020  parsed successfully
Balance of 2021  parsed successfully
Balance of 2022  parsed successfully
CPU times: total: 906 ms
Wall time: 941 ms


In [188]:
# Write the combined table to Excel without the index column
# (`index` is documented as a bool, so pass False explicitly).
# NOTE(review): the leading backslash makes this path relative to the drive
# root rather than to the notebook folder — confirm that is intended.
res.to_excel(r"\data\paragraphs.xlsx", index=False)