In [None]:
import pandas as pd
import geopandas as gpd
import numpy as np 
import matplotlib.pyplot as plt 
import duckdb
import shapely.wkb
from shapely.wkb import loads

import glob
import os

In [5]:
#data files
# Glob over the partitioned parquet files; hive_partitioning=1 asks DuckDB to expose the
# key=value directory names (year, month) as columns, which the queries below filter on.
parquet_glob = r"D:\Datenbank\ohsome-stats-yearly\year=*\month=*\*.parquet"
db = duckdb.read_parquet(parquet_glob, hive_partitioning=1)

In [6]:
# All corporate hashtags.
# The order is significant: the SQL CASE built from this list assigns an edit to the
# FIRST hashtag that matches.
hashtag_list = [
    'amap', 'adt', 'bolt', 'DigitalEgypt',
    'expedia', 'gojek', 'MSFTOpenMaps', 'grab',
    'Kaart', 'Kontur', 'mbx', 'RocketData',
    'disputed_by_claimed_by', 'Snapp', 'stackbox', 'Telenav',
    'Lightcyphers', 'tomtom', 'TIDBO', 'WIGeoGIS-OMV',
    'نشان', 'mapbox', 'Komoot', 'AppLogica',
]

## Data Download

In [4]:
# Directory that holds the exported user tables
directory_path =  r"C:\Users\lilly\Documents\bachelorarbeit\analysis\UserNameID-v2"

# Every file in that directory whose name ends in .xls
excel_files = glob.glob(os.path.join(directory_path, "*.xls"))

# Map each file name (without its directory) to the DataFrame parsed from it.
# NOTE(review): the files carry an .xls extension but are parsed with read_csv, so they
# are presumably delimited text rather than real Excel workbooks - confirm.
dataframes_dict = {
    os.path.basename(path): pd.read_csv(path)
    for path in excel_files
}

In [2]:
# get the user-ids in working format
def getListID(filename):
    """Return the 'User ID' values of a loaded table as one comma-separated string.

    `filename` is the key under which the table is stored in `dataframes_dict`.
    Every ID is wrapped in single quotes (e.g. "'1','2','3'").
    """
    user_table = dataframes_dict[filename]
    quoted_ids = (f"'{user_id}'" for user_id in user_table['User ID'])
    return ','.join(quoted_ids)

In [None]:
# Quoted, comma-separated user IDs read from "MetaUser.xls"; yealyEdits inserts this
# string into the IN (...) filter of its second query.
Meta = getListID("MetaUser.xls")

In [11]:
def yealyEdits(year, filename):
    """Count one year's edits per corporate hashtag, add the Meta count, and save as CSV.

    Every row of the module-level ``db`` relation for ``year`` is assigned to the first
    entry of ``hashtag_list`` that occurs (case-insensitively) in its ``hashtags`` value,
    or to 'nc' when none matches. A second query counts the rows whose ``user_id`` is in
    the module-level ``Meta`` ID string; that count is appended as the row 'meta'.

    Parameters
    ----------
    year : int
        Value of the ``year`` column to analyse.
    filename : str
        Path of the CSV to write (columns 'corporation' and 'edits'; the DataFrame
        index is written as an unnamed first column).

    Returns
    -------
    None
        Errors are caught and printed, not raised, so a failed run can leave an
        older CSV of the same name in place.
    """
    try:
        # The year is interpolated into SQL below, so force it to a plain integer.
        year = int(year)

        # One WHEN branch per hashtag; the first matching branch wins, so the order of
        # hashtag_list matters. Single quotes are doubled so that a hashtag can never
        # terminate the SQL string literal.
        # NOTE(review): '_' is a single-character wildcard in (I)LIKE, so
        # 'disputed_by_claimed_by' matches slightly more than its literal text.
        branches = []
        for hashtag in hashtag_list:
            literal = hashtag.replace("'", "''")
            branches.append(f"WHEN hashtags ILIKE '%{literal}%' THEN '{literal}'")
        case_statement = "\n".join(branches)

        # Edits of the year grouped by matched hashtag ('nc' = no corporate hashtag)
        query = f"""
            SELECT
                CASE
                    {case_statement}
                    ELSE 'nc'
                END AS corporation,
                COUNT(*) AS edits
            FROM db
            WHERE year = {year}
            GROUP BY corporation
            ORDER BY edits DESC
        """
        hashtags = duckdb.sql(query).to_df()

        # Edits of the year made by the user IDs listed in Meta
        q2 = f"""
            SELECT COUNT(*) AS meta
            FROM db
            WHERE year = {year} AND user_id IN ({Meta})
            
        """
        meta = duckdb.sql(q2).to_df()

        # Keep the count numeric: a string here would turn the whole 'edits' column
        # of the combined table into a mixed object column.
        meta_count = int(meta['meta'].values[0])
        meta_df = pd.DataFrame(data={'corporation': ['meta'], 'edits': [meta_count]})

        # 'meta' is appended after the hashtag rows; the original row labels are kept
        table = pd.concat([hashtags, meta_df])

        table.to_csv(filename, header = True)

    except Exception as e:
        print(f"An error occurred: {e}")

In [None]:
# Export the per-corporation edit counts of every study year, one CSV per year
# (totalEdits_2016.csv ... totalEdits_2023.csv).
for year in range(2016, 2024):
    yealyEdits(year, f"totalEdits_{year}.csv")

## Data Preparation

### Absolute Numbers

In [39]:
#extracting the total corporate and non-corporate information yearly - since the files have saved it per corporation
def extrctingCEdata(csv, yearColumn): 
    """Summarise one per-corporation CSV into a single corporate / non-corporate row.

    Parameters
    ----------
    csv : str
        Path of a CSV with an 'edits' column and, normally, a 'corporation' column in
        which the non-corporate bucket is labelled 'nc'.
    yearColumn : int
        Value stored in the 'year' column of the result.

    Returns
    -------
    pandas.DataFrame
        One row with the columns 'year', 'total', 'CE' (corporate edits),
        'NCE' (non-corporate edits) and 'percentage' (CE as a percentage of total).
    """
    df = pd.read_csv(csv)

    # The unnamed first column is the index that to_csv wrote; tolerate files without it
    df = df.drop(columns=['Unnamed: 0'], errors='ignore')

    edits = df['edits']
    total = edits.sum()

    # Pick the non-corporate bucket by its label instead of assuming it is the first
    # (largest) row - any corporation row that outgrows 'nc' would otherwise be
    # counted as non-corporate.
    if 'corporation' in df.columns and (df['corporation'] == 'nc').any():
        non_corporate = edits[df['corporation'] == 'nc'].sum()
    else:
        # no labelled 'nc' row: fall back to the first row, as before
        non_corporate = edits.head(1).sum()
    corporate = total - non_corporate

    #calculating the percentage of corporate edits based on total edits
    percentage = (corporate / total) * 100 if total else float('nan')

    d = {'year' : [yearColumn], 'total': [total], 'CE': [corporate], 'NCE': [non_corporate], 'percentage': [percentage]}

    # Global display setting (affects every later DataFrame output): two decimals
    pd.set_option('display.float_format', '{:.2f}'.format)

    return pd.DataFrame(data=d)

In [40]:
# One single-row summary (year, total, CE, NCE, percentage) per study year,
# each read from that year's totalEdits_<year>.csv.
y16, y17, y18, y19, y20, y21, y22, y23 = (
    extrctingCEdata(f'totalEdits_{year}.csv', year)
    for year in range(2016, 2024)
)


In [41]:
# Stack the eight single-row yearly summaries into one table (one row per year).
frames = [y16, y17, y18, y19, y20, y21, y22, y23]

yearly_data = pd.concat(frames)

In [42]:
# Every stacked row still carries its own index label; renumber the rows and discard the old labels.
yearly_data = yearly_data.reset_index(drop=True)

In [43]:
# Show the combined yearly summary.
yearly_data

Unnamed: 0,year,total,CE,NCE,percentage
0,2016,163538605,54507,163484098,0.03
1,2017,192499784,2343678,190156106,1.22
2,2018,226081100,5260401,220820699,2.33
3,2019,231941235,13146644,218794591,5.67
4,2020,286643044,29510527,257132517,10.3
5,2021,309738042,34363469,275374573,11.09
6,2022,274127413,20133005,253994408,7.34
7,2023,173064319,9644921,163419398,5.57


In [44]:
# Persist the yearly summary; the DataFrame index is written as an unnamed first column.
yearly_data.to_csv('fig1_2016-2023_yearly_data_ce_nce.csv', header = True)

### top 10 edits per corporation

In [46]:
# Raw per-corporation table of every year, read back unchanged from totalEdits_<year>.csv.
h16, h17, h18, h19, h20, h21, h22, h23 = (
    pd.read_csv(f'totalEdits_{year}.csv')
    for year in range(2016, 2024)
)

In [47]:
def datprepCE(year):
    """Load totalEdits_<year>.csv and return it with the 'edits' column named after the year.

    The unnamed index column stored in the file is removed, so the result has the
    columns 'corporation' and `year`.
    """
    return (
        pd.read_csv(f'totalEdits_{year}.csv')
        .drop(columns=['Unnamed: 0'])
        .rename(columns={'edits': year})
    )

In [48]:
# Study years, plus an empty frame that only has the 'corporation' join column;
# the yearly tables are merged onto it one by one.
years = list(range(2016, 2024))
dat = {'corporation': []}
df_t = pd.DataFrame(dat)

In [49]:
# Outer-join the yearly tables on 'corporation' so that every year becomes one column.
# Start from a fresh empty frame on every run: merging into an already merged df_t
# (e.g. when this cell is executed a second time) would add every year column again.
df_t = pd.DataFrame(data=dat)
for year in years:
    d = datprepCE(year)
    df_t = pd.merge(df_t, d, on = 'corporation', how = 'outer')

In [50]:
# Work on a copy of the merged table: the column axis (the years) is named 'year'
# and the corporations become the row index.
timeline = (
    df_t.copy()
    .rename_axis('year', axis=1)
    .set_index('corporation')
)

In [51]:
# A corporation that is missing from a year's table has no value after the outer merge; count it as 0.
timeline = timeline.fillna(0)

In [52]:
# Add the yearly counts of the 'mbx' row onto the 'mapbox' row.
# This is an in-place addition: running the cell twice adds 'mbx' twice.
timeline.loc['mapbox'] += timeline.loc['mbx']

In [53]:
# The 'mbx' counts now live in the 'mapbox' row, so remove the separate 'mbx' row.
timeline = timeline.drop(index=['mbx'])

In [14]:
# Total edits per corporation, summed over all year columns.
edit_count = timeline.sum(axis=1)

In [15]:
# Corporations ranked by their total edits, largest first.
edit_count.sort_values(ascending = False)

corporation
nc                       1743176390.00
meta                       35917491.00
adt                        31329879.00
Kaart                      30623363.00
amap                        6302076.00
MSFTOpenMaps                2116685.00
tomtom                      2101635.00
DigitalEgypt                1771911.00
grab                        1619427.00
mapbox                       860369.00
Telenav                      507189.00
bolt                         410599.00
نشان                         293964.00
RocketData                   275810.00
Lightcyphers                 182052.00
expedia                       63878.00
TIDBO                         37167.00
Kontur                        22691.00
stackbox                      10877.00
AppLogica                      4971.00
disputed_by_claimed_by         1805.00
WIGeoGIS-OMV                   1439.00
gojek                           992.00
Komoot                          798.00
Snapp                            84.00
dtype: float64

In [17]:
# Transposed copy of the timeline: one row per year, one column per corporation.
Other = timeline.copy().T

In [18]:
# Keep only the corporations whose yearly edits are later combined into the single 'Other' column.
Other = Other[['Telenav', 'bolt', 'نشان','RocketData', 'Lightcyphers', 'expedia', 'TIDBO', 'Kontur', 'stackbox', 'AppLogica', 'disputed_by_claimed_by', 'WIGeoGIS-OMV', 'gojek', 'Komoot', 'Snapp']]

In [19]:
# Yearly edit sum over the corporations kept in `Other`.
Other.transpose().sum()

year
2016     6234.00
2017    32982.00
2018    42399.00
2019   346339.00
2020   286941.00
2021   350070.00
2022   551924.00
2023   197427.00
dtype: float64

In [20]:
# One row per year ('year' becomes a regular column), restricted to the largest contributors.
large = timeline.transpose().reset_index()[
    ['year', 'meta', 'adt', 'Kaart', 'amap', 'MSFTOpenMaps', 'tomtom', 'DigitalEgypt', 'grab', 'mapbox']
]

In [21]:
# Combined yearly edits of the smaller corporations selected into `Other`.
# Computed from `Other` (one row per year, in the same year order as the rows of `large`)
# instead of pasting previously printed sums, so the column cannot go stale when the
# underlying counts change. Assigned by position, hence to_numpy().
large['Other'] = Other.sum(axis=1).to_numpy()

In [22]:
# Make sure the 'Other' column is numeric; values that cannot be parsed become NaN.
large['Other'] = pd.to_numeric(large['Other'], errors='coerce')

### prep for percentage per corporation

In [24]:
# Yearly corporate-edit totals ('CE'), taken from the table created for the first graph.
# NOTE(review): the original comment spoke of "PFs" - abbreviation unclear, confirm.
newTotal = yearly_data.copy()

totalCE = newTotal[['year', 'CE']]

In [25]:
# Attach the per-corporation yearly counts to the yearly corporate totals (joined on 'year').
top10List = totalCE.merge(large, on='year', how='left')

In [130]:
# Columns for which a percentage share is computed: the nine largest contributors plus 'Other'.
corpor10 = [
    'meta', 'adt', 'Kaart', 'amap', 'MSFTOpenMaps',
    'tomtom', 'grab', 'DigitalEgypt', 'mapbox', 'Other',
]

In [131]:
# calculating the percentage each corporation held based on the total corporate edits that year
newDF = top10List
def calc_perc(hashtag):
    """Add '<hashtag>_perc' (share of the yearly 'CE' total, in percent) to top10List.

    The new column is written into the module-level `top10List` in place. The return
    value is a separate frame that additionally lacks the absolute `hashtag` column.
    """
    share_column = f'{hashtag}_perc'
    top10List[share_column] = top10List[hashtag] / top10List['CE'] * 100
    return top10List.drop(columns=[f'{hashtag}'])

In [132]:
# Add a '<name>_perc' column to top10List for every entry of corpor10.
# The frame returned by calc_perc is discarded; only its in-place update of top10List is used.
for tags in corpor10: 
    calc_perc(tags)

In [139]:
# Keep 'year' and the '<name>_perc' share columns; drop the CE total and the absolute counts.
percCE = newDF.drop(columns={'CE','meta', 'adt', 'Kaart', 'amap', 'MSFTOpenMaps', 'tomtom', 'grab', 'DigitalEgypt', 'mapbox', 'Other'})

In [140]:
# Strip the '_perc' suffix so every share column carries the plain corporation name.
# The mapbox key must be the lower-case 'mapbox_perc' (the column is created from the
# name 'mapbox'); a capitalised 'Mapbox_perc' matches nothing and leaves the column unrenamed.
# Keys without a matching column (e.g. 'bolt_perc') are ignored by rename.
percCE = percCE.rename(columns = {'amap_perc': 'amap',
                                  'meta_perc': 'meta',
                                  'Kaart_perc':'Kaart',
                                  'tomtom_perc':'tomtom',
                                  'adt_perc': 'adt', 
                                  'grab_perc': 'grab',
                                  'bolt_perc':'bolt', 
                                  'MSFTOpenMaps_perc':'MSFTOpenMaps',
                                  'DigitalEgypt_perc':'DigitalEgypt',
                                  'Other_perc':'Other',
                                  'mapbox_perc': 'mapbox'
                                 })

### percentage edits per corporation based on total edits
Same as above, but instead of the 'CE' column in totalCE it now uses the 'total' column in total_edits.

In [40]:
# Yearly totals of all edits ('total'), taken from the table created for the first graph.
# NOTE(review): the original comment spoke of "PFs" - abbreviation unclear, confirm.
newTotal = yearly_data.copy()

total_edits = newTotal[['year', 'total']]

In [41]:
# Attach the per-corporation yearly counts to the yearly totals of all edits (joined on 'year').
top10List_new = total_edits.merge(large, on='year', how='left')

In [43]:
# Columns for which a percentage share is computed: the nine largest contributors plus 'Other'.
corpor10 = [
    'meta', 'adt', 'Kaart', 'amap', 'MSFTOpenMaps',
    'tomtom', 'grab', 'DigitalEgypt', 'mapbox', 'Other',
]

In [None]:
# adjusted for edits: shares are relative to ALL edits of the year, not only the corporate ones
newDF_edits = top10List_new
def calc_perc_total(hashtag):
    """Add '<hashtag>_perc' (share of the yearly 'total', in percent) to top10List_new.

    The new column is written into the module-level `top10List_new` in place. The return
    value is a separate frame that additionally lacks the absolute `hashtag` column.
    """
    share_column = f'{hashtag}_perc'
    top10List_new[share_column] = top10List_new[hashtag] / top10List_new['total'] * 100
    return top10List_new.drop(columns=[f'{hashtag}'])

In [45]:
# Add a '<name>_perc' column to top10List_new for every entry of corpor10.
# The frame returned by calc_perc_total is discarded; only its in-place update is used.
for tags in corpor10: 
    calc_perc_total(tags)

In [47]:
# Keep 'year' and the '<name>_perc' share columns; drop the overall total and the absolute counts.
percTotal = newDF_edits.drop(columns={'total','meta', 'adt', 'Kaart', 'amap', 'MSFTOpenMaps', 'tomtom', 'grab', 'DigitalEgypt', 'mapbox', 'Other'})

In [48]:
# Strip the '_perc' suffix so every share column carries the plain corporation name.
# Names without a matching '<name>_perc' column are simply ignored by rename.
percTotal = percTotal.rename(columns={
    f'{name}_perc': name
    for name in (
        'amap', 'meta', 'Kaart', 'tomtom', 'adt', 'grab',
        'bolt', 'MSFTOpenMaps', 'DigitalEgypt', 'Other', 'mapbox',
    )
})

In [50]:
# Persist the per-corporation shares of all edits; the DataFrame index is written as an unnamed first column.
percTotal.to_csv('fig1_top10_corpos_percentage_edits_based_on_total_edits.csv', header = True)