In [11]:
import os

import geopandas as gpd
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [6]:
# Raw monthly edit counts for all contributors; 'Unnamed: 0' is the unnamed
# first column of the CSV and is not needed.
absolute_country_edits = (
    pd.read_csv("../../data/fig2_absolute_country_edits_monthly_raw.csv")
    .drop(columns=['Unnamed: 0'])
)

In [19]:
# Raw monthly edit counts for corporate contributors; 'Unnamed: 0' is the
# unnamed first column of the CSV and is not needed.
corporate_country_edits = (
    pd.read_csv("../../data/fig2_corporate_country_edits_monthly_raw.csv")
    .drop(columns=['Unnamed: 0'])
)

In [20]:
# Give the corporate count its own column name so it can sit next to the
# overall `total` after merging.
corporate_country_edits = corporate_country_edits.rename(
    columns=dict(total="corporate")
)

In [21]:
def get_group(df, year):
    """Return the per-country column sums of ``df`` for a single year.

    The rows whose ``year`` column equals ``year`` are selected (a ``KeyError``
    is raised by pandas if that year is absent) and then summed per
    ``ISO_A3`` code. The result is indexed by ``ISO_A3``.
    """
    rows_for_year = df.groupby('year').get_group(year)
    per_country = rows_for_year.groupby('ISO_A3')
    return per_country.sum()
    

In [22]:
# Years covered by the analysis (2019 through 2023 inclusive).
years = list(range(2019, 2024))

In [23]:
# Collects one per-country totals frame per year.
frames = list()

In [24]:
# Collects one per-country corporate-edits frame per year.
frames_corporate = list()

In [25]:
# Build the per-year totals in a single expression so that re-running this
# cell replaces `frames` instead of appending a second copy of every year.
# Each frame is indexed by ISO_A3 and holds one column named after its year;
# the summed 'month'/'year' helper columns carry no meaning and are dropped.
frames = [
    get_group(absolute_country_edits, year)
    .drop(columns=['month', 'year'])
    .rename(columns={'total': year})
    for year in years
]

In [26]:
# Build the per-year corporate counts in a single expression so that
# re-running this cell replaces `frames_corporate` instead of appending
# duplicates. The count column keeps its 'corporate' name; the summed
# 'month'/'year' helper columns carry no meaning and are dropped.
frames_corporate = [
    get_group(corporate_country_edits, year).drop(columns=['month', 'year'])
    for year in years
]

In [27]:
# For every year, left-join the corporate counts onto the overall totals per
# country and rename the year-labelled totals column to a uniform 'total'.
y19, y20, y21, y22, y23 = [
    pd.merge(frames[position], frames_corporate[position], on='ISO_A3', how='left')
    .reset_index()
    .rename(columns={label: 'total'})
    for position, label in enumerate((2019, 2020, 2021, 2022, 2023))
]

In [28]:
# Corporate share of all edits per country, in percent, for each yearly frame.
for yearly_frame in (y19, y20, y21, y22, y23):
    yearly_frame['percentage'] = (yearly_frame['corporate'] / yearly_frame['total']) * 100


In [29]:
# Missing values (e.g. countries without a corporate row after the left join)
# are treated as zero in every yearly frame.
y19, y20, y21, y22, y23 = (
    yearly_frame.fillna(0) for yearly_frame in (y19, y20, y21, y22, y23)
)

In [34]:
# HDI table. The folder defaults to the original local directory, but can be
# redirected with the HDI_DATA_DIR environment variable so the notebook is
# not tied to one user's machine.
hdi = gpd.read_file(
    os.path.join(
        os.environ.get(
            "HDI_DATA_DIR",
            r"C:\Users\Lilly\Documents\anaconda_uni\ba\Analysis\1_World\UpdatedVersions",
        ),
        "HDI-2021.csv",
    )
)

In [35]:
# World Bank region table. The folder defaults to the original local
# directory, but can be redirected with the HDI_DATA_DIR environment variable
# so the notebook is not tied to one user's machine.
world_regions = gpd.read_file(
    os.path.join(
        os.environ.get(
            "HDI_DATA_DIR",
            r"C:\Users\Lilly\Documents\anaconda_uni\ba\Analysis\1_World\UpdatedVersions",
        ),
        "world-regions-according-to-the-world-bank.csv",
    )
)

In [36]:
# Align the country-name column with the one used by the HDI table so the two
# can be merged on 'Country'.
world_regions = world_regions.rename(columns=dict(Entity="Country"))

In [37]:
# Attach the HDI columns to every World Bank region row and normalise names.
# NOTE(review): the join key is the country *name*; the displayed result has
# rows without HDI/group (e.g. Venezuela, Vietnam), presumably because the
# two sources spell some names differently - confirm and map them if needed.
hdiRegions = (
    pd.merge(world_regions, hdi, on='Country', how='left')
    .rename(columns={
        "Code": "ISO_A3",
        "World Region according to the World Bank": "WorldRegions",
    })
    .drop(columns=['Year'])
    # Whether reading a plain CSV yields an (empty) geometry column depends on
    # the reader engine, so tolerate its absence instead of raising KeyError.
    .drop(columns=['geometry_x', 'geometry_y'], errors='ignore')
)

In [38]:
# Inspect the merged country / region / HDI lookup table.
hdiRegions

Unnamed: 0,Country,ISO_A3,WorldRegions,HDI 2021,group
0,Afghanistan,AFG,South Asia,0478,low
1,Albania,ALB,Europe and Central Asia,0796,high
2,Algeria,DZA,Middle East and North Africa,0745,high
3,American Samoa,ASM,East Asia and Pacific,,
4,Andorra,AND,Europe and Central Asia,0858,very high
...,...,...,...,...,...
212,Venezuela,VEN,Latin America and Caribbean,,
213,Vietnam,VNM,East Asia and Pacific,,
214,Yemen,YEM,Middle East and North Africa,0455,low
215,Zambia,ZMB,Sub-Saharan Africa,0565,medium


In [32]:
def calcPercentageHDI(df, year, newCorpoColumnName, newTotalCorpoName, hdi_table=None):
    """Compute the corporate share of edits (in percent) per HDI group.

    Parameters
    ----------
    df : DataFrame
        One row per country with the columns ``ISO_A3``, ``corporate`` and
        ``total``.
    year : hashable
        Name given to the resulting percentage column.
    newCorpoColumnName, newTotalCorpoName : str
        Names used for the intermediate summed columns; they are dropped
        before returning and do not appear in the result.
    hdi_table : DataFrame, optional
        Lookup with the columns ``ISO_A3`` and ``group``. Defaults to the
        notebook-level ``hdiRegions`` table.

    Returns
    -------
    DataFrame
        Indexed by ``group`` with the single column ``year`` holding
        ``sum(corporate) / sum(total) * 100``. Countries without a group are
        left out.
    """
    if hdi_table is None:
        hdi_table = hdiRegions

    table = pd.merge(df, hdi_table, on='ISO_A3', how='left')

    # Missing counts mean "no edits". Use an explicit 64-bit integer: plain
    # `int` maps to a 32-bit type on Windows with NumPy < 2, which edit
    # counts can overflow.
    for count_column in ("corporate", "total"):
        table[count_column] = table[count_column].fillna(0).astype("int64")

    table = table.rename(columns={"corporate": newCorpoColumnName,
                                  "total": newTotalCorpoName})

    # Sum both counts per HDI group; rows whose group is missing are dropped.
    tableNew = table.groupby("group", dropna=True)[
        [newCorpoColumnName, newTotalCorpoName]
    ].sum()
    tableNew[year] = (tableNew[newCorpoColumnName] / tableNew[newTotalCorpoName]) * 100

    return tableNew.drop(columns=[newCorpoColumnName, newTotalCorpoName])

In [146]:
# Corporate share per HDI group for 2019.
h2019 = calcPercentageHDI(
    df=y19,
    year='2019',
    newCorpoColumnName='Corporate19',
    newTotalCorpoName='Total2019',
)

In [148]:
# Corporate share per HDI group for each remaining year.
h2020 = calcPercentageHDI(
    df=y20, year='2020', newCorpoColumnName='Corporate20', newTotalCorpoName='Total2020'
)
h2021 = calcPercentageHDI(
    df=y21, year='2021', newCorpoColumnName='Corporate21', newTotalCorpoName='Total2021'
)
h2022 = calcPercentageHDI(
    df=y22, year='2022', newCorpoColumnName='Corporate22', newTotalCorpoName='Total2022'
)
h2023 = calcPercentageHDI(
    df=y23, year='2023', newCorpoColumnName='Corporate23', newTotalCorpoName='Total2023'
)

In [149]:
# One row per HDI group, one percentage column per year (joined on the
# group index, keeping the groups present in 2019).
HDIAgg = h2019.join(
    [h2020, h2021, h2022, h2023],
    how='left',
)

In [3]:
# Show the corporate share (percent) per HDI group and year.
HDIAgg

Unnamed: 0,group,2019,2020,2021,2022,2023
0,high,19.342836,26.825003,30.870386,24.667697,12.64141
1,low,1.023491,1.779802,2.838133,3.553033,7.487605
2,medium,4.679001,20.112227,24.51619,18.84367,12.044844
3,very high,2.1899,2.672881,5.108599,3.018649,2.739404


In [151]:
# Persist the per-group yearly shares next to the notebook.
HDIAgg.to_csv(path_or_buf='corporate_HDI.csv')

### HDI for sum t0-t1 results

In [53]:
# Country boundaries together with the per-period edit/contributor statistics.
df = gpd.read_file(filename='../../data/world_boundaries_with_stats_centroid.gpkg')

# List the available columns before selecting the ones used below.
display(df.columns)

Index(['ISO_A2', 'ISO_A2_EH', 'ISO_A3', 'ISO_A3_EH', 'ISO_N3', 'ISO_N3_EH',
       'NAME_EN', 'country_iso_a3', 'post_t1_avg_monthly_edits_non_corporate',
       'post_t1_avg_monthly_edits_corporate',
       'post_t1_avg_monthly_contributors_non_corporate',
       'post_t1_avg_monthly_contributors_corporate',
       'post_t1_sum_edits_non_corporate', 'post_t1_sum_edits_corporate',
       'pre_t0_avg_monthly_edits_non_corporate',
       'pre_t0_avg_monthly_edits_corporate',
       'pre_t0_avg_monthly_contributors_non_corporate',
       'pre_t0_avg_monthly_contributors_corporate',
       'pre_t0_sum_edits_non_corporate', 'pre_t0_sum_edits_corporate',
       't0_avg_monthly_edits_non_corporate', 't0_avg_monthly_edits_corporate',
       't0_avg_monthly_contributors_non_corporate',
       't0_avg_monthly_contributors_corporate', 't0_sum_edits_non_corporate',
       't0_sum_edits_corporate', 't1_avg_monthly_edits_non_corporate',
       't1_avg_monthly_edits_corporate',
       't1_avg_monthly

In [54]:
# Columns needed for the HDI aggregation of the summed edits.
columns = ["country_iso_a3", "sum_edits_non_corporate", "sum_edits_corporate", "share_sum_edits_corporate"]

# Countries ranked by their corporate share of edits, highest first.
display(
    df[columns].sort_values(by="share_sum_edits_corporate", ascending=False)
)

Unnamed: 0,country_iso_a3,sum_edits_non_corporate,sum_edits_corporate,share_sum_edits_corporate
0,URY,204125.0,805613.0,0.798
1,MEX,5815039.0,12991241.0,0.691
2,EGY,2843884.0,5876043.0,0.674
3,QAT,215098.0,381910.0,0.640
4,ARE,1322746.0,1742396.0,0.568
...,...,...,...,...
172,,,,
173,,,,
174,,,,
175,,,,


In [55]:
# Rename to the column names that calcPercentageHDI expects
# ('ISO_A3', 'corporate'); 'nc' holds the non-corporate edits.
country = df[columns].rename(
    columns={
        'country_iso_a3': 'ISO_A3',
        'sum_edits_non_corporate': 'nc',
        'sum_edits_corporate': 'corporate',
    }
)

In [56]:
# Total edits = non-corporate + corporate. A count missing on only one side
# is treated as zero (matching the later fillna(0)); with plain `+` such a
# row would get a NaN total, which then becomes 0 while 'corporate' stays
# positive. Rows missing both counts stay NaN here.
country['total'] = country['nc'].add(country['corporate'], fill_value=0)

In [59]:
# Rows without statistics count as zero edits.
country = country.fillna(value=0)

In [60]:
# Corporate share per HDI group for the summed edits.
# NOTE(review): the '2019' label and the *19 helper names look copy-pasted
# from the yearly cells; confirm the intended column name for this result.
tot1_HDI = calcPercentageHDI(
    df=country,
    year='2019',
    newCorpoColumnName='Corporate19',
    newTotalCorpoName='Total2019',
)

In [61]:
# Show the corporate share (percent) per HDI group for the summed edits.
tot1_HDI

Unnamed: 0_level_0,2019
group,Unnamed: 1_level_1
high,30.305508
low,3.519919
medium,19.764207
very high,6.240535
