In [1]:
import pandas as pd
import geopandas as gpd
import numpy as np 
import matplotlib.pyplot as plt 
import duckdb
import shapely.wkb
from shapely.wkb import loads

import glob
import os

In [2]:
# Data files: open all ohsome-stats parquet files matched by the glob as one DuckDB relation.
# The path encodes `year=` / `month=` directories; with hive_partitioning enabled the
# queries further down can refer to `year` and `month` as columns.
db = duckdb.read_parquet(r"D:\Datenbank\ohsome-stats-yearly\year=*\month=*\*.parquet", hive_partitioning=1) 

In [3]:
# Hashtag fragments that mark a changeset as corporate. Each entry is later
# embedded in an `ILIKE '%...%'` condition on the `hashtags` column.
hashtag_list = [
    'amap',
    'adt',
    'bolt',
    'DigitalEgypt',
    'expedia',
    'gojek',
    'MSFTOpenMaps',
    'grab',
    'Kaart',
    'Kontur',
    'mbx',
    'RocketData',
    'disputed_by_claimed_by',
    'Snapp',
    'stackbox',
    'Telenav',
    'Lightcyphers',
    'tomtom',
    'TIDBO',
    'WIGeoGIS-OMV',
    'نشان',
    'mapbox',
    'Komoot',
    'AppLogica',
]

In [4]:
# Folder that holds the user-list files (one file per company).
directory_path = r"C:\Users\lilly\Documents\bachelorarbeit\analysis\UserNameID-v2"

# Every file in that folder whose name ends in .xls.
excel_files = glob.glob(os.path.join(directory_path, "*.xls"))

# Parsed tables keyed by the file's base name (e.g. "MetaUser.xls").
# NOTE(review): the files carry an .xls extension but are parsed with
# pd.read_csv — presumably they are CSV text saved under an .xls name; confirm.
dataframes_dict = {}
for path in excel_files:
    dataframes_dict[os.path.basename(path)] = pd.read_csv(path)

In [5]:
# Turn the user IDs of one loaded file into a string of quoted SQL literals.
def getListID(filename):
    """Return the 'User ID' values of a loaded table as ``'id1','id2',...``.

    filename: key into the module-level ``dataframes_dict``.
    Returns one string in which every ID is wrapped in single quotes and the
    IDs are separated by commas, ready to be placed inside ``IN (...)``.
    """
    id_column = dataframes_dict[filename]['User ID']
    return ','.join(f"'{user_id}'" for user_id in id_column)

# Quoted, comma-separated user IDs taken from the "MetaUser.xls" table.
Meta = getListID("MetaUser.xls")

In [6]:
def generate_ce_statement_meta(hashtags_list):
    """Build the SQL condition that selects corporate edits.

    A row qualifies when its ``hashtags`` column contains any of the given
    fragments (case-insensitive ``ILIKE`` substring match) or when its
    ``user_id`` is one of the IDs in the module-level ``Meta`` string.

    Parameters
    ----------
    hashtags_list : iterable of str
        Hashtag fragments to search for; may be empty.

    Returns
    -------
    str
        All conditions joined with `` OR ``, without surrounding parentheses.
        Wrap the result in ``(...)`` before combining it with ``AND``.
    """
    conditions = [f"hashtags ILIKE '%{tag}%'" for tag in hashtags_list]
    # The user-id clause is joined like every other condition, so the
    # separator always has a space on both sides and an empty hashtag list
    # never produces a statement that begins with a dangling "OR".
    conditions.append(f"user_id IN ({Meta})")
    return " OR ".join(conditions)

In [7]:
# SQL condition (hashtag matches OR listed user IDs) identifying corporate edits.
corporate_where = generate_ce_statement_meta(hashtag_list)

In [2]:
# Load the world administrative boundaries shapefile into a GeoDataFrame.
b3 = gpd.read_file(r"C:\Users\Lilly\Downloads\world-administrative-boundaries\world-administrative-boundaries.shp")

In [35]:
# Keep only the country code and the geometry, and name the code column
# 'ISO_A3' so it matches the key used by the edit-count tables.
border = b3[['iso3', 'geometry']].rename(columns={'iso3': 'ISO_A3'})

**TOTAL**

In [14]:
# Count the rows of `db` per country, year and month for every year after 2018.
# The `country` column is exposed under the name ISO_A3.
# NOTE(review): the printed schema shows `year` as varchar, so `year > 2018`
# relies on an implicit cast — confirm this is intended.
country_total = duckdb.sql("""
    SELECT country AS ISO_A3, COUNT(*) AS total, year, month
    FROM db
    WHERE year > 2018 
    GROUP BY ISO_A3, year, month

""")
# Print a preview of the result, then convert it to a pandas DataFrame.
country_total.show()
country_total_df = country_total.to_df()

┌─────────┬────────┬─────────┬─────────┐
│ ISO_A3  │ total  │  year   │  month  │
│ varchar │ int64  │ varchar │ varchar │
├─────────┼────────┼─────────┼─────────┤
│ ZMB     │ 112133 │ 2022    │ 3       │
│ LAO     │   2317 │ 2022    │ 3       │
│ BOL     │  43184 │ 2022    │ 3       │
│ MDV     │   2610 │ 2022    │ 3       │
│ WSM     │    170 │ 2022    │ 3       │
│ GIN     │   6439 │ 2022    │ 3       │
│ LKA     │   8386 │ 2022    │ 3       │
│ SLE     │  34366 │ 2022    │ 3       │
│ NOR     │ 137800 │ 2022    │ 4       │
│ SVN     │  46407 │ 2022    │ 4       │
│  ·      │      · │  ·      │ ·       │
│  ·      │      · │  ·      │ ·       │
│  ·      │      · │  ·      │ ·       │
│ COM     │     46 │ 2021    │ 5       │
│ LKA     │  59289 │ 2022    │ 11      │
│ HND     │   1855 │ 2022    │ 12      │
│ HRV     │  37977 │ 2022    │ 2       │
│ BGR     │  14140 │ 2020    │ 7       │
│ CAN     │ 294966 │ 2021    │ 8       │
│ SLE     │   3820 │ 2022    │ 2       │
│ GRC     │ 4808

In [15]:
# Cache the raw monthly counts on disk. No `index` argument is given, so the
# index is written as well; it comes back as 'Unnamed: 0' when the file is reloaded.
country_total_df.to_csv('fig2_absolute_country_edits_monthly_raw.csv', header = True)

In [4]:
# Reload the cached monthly counts instead of re-running the DuckDB query.
country_total_df = pd.read_csv('fig2_absolute_country_edits_monthly_raw.csv')

In [6]:
# Remove the 'Unnamed: 0' column that holds the index saved with the CSV.
# Note: this cell is not re-runnable on its own — a second run raises because the column is gone.
country_total_df = country_total_df.drop(columns='Unnamed: 0')

In [7]:
# Work on a copy so the reloaded raw table stays unchanged.
df_total = country_total_df.copy()

In [8]:
# Combine the year and month columns into a monthly pandas Period ('time'),
# so rows can be ordered and filtered chronologically.
year_month = df_total["year"].astype(str) + '-' + df_total["month"].astype(str)
df_total['time'] = pd.to_datetime(year_month, format='%Y-%m').dt.to_period('M')


In [9]:
# 'time' now carries the date information, so the separate parts can go.
df_total = df_total.drop(columns=['year', 'month'])

In [10]:
# Order the rows chronologically and keep only the months before 2023-07.
before_cutoff = df_total['time'] < '2023-07'
df_total = df_total.sort_values(by='time').loc[before_cutoff]

In [11]:
# Keep only the months after 2019-05, i.e. from 2019-06 onward.
df_total = df_total[df_total['time'] > '2019-05']

In [12]:
# Per-country table without the month column; drop() returns a new frame,
# so df_total itself is left as it is.
total = df_total.drop(columns=['time'])

In [13]:
# Add up the monthly counts per country; ISO_A3 becomes the index.
total = total.groupby('ISO_A3').sum()

In [14]:
# Show the 20 countries with the highest overall count.
total.sort_values(by ='total', ascending = False).head(20)

Unnamed: 0_level_0,total
ISO_A3,Unnamed: 1_level_1
USA,167025695
DEU,75883752
FRA,52754006
RUS,41587488
IDN,38793596
GBR,35714606
POL,34486362
BRA,31709726
JPN,26543286
IND,26146765


**CORPORATE**

In [10]:
# Same per-country / year / month count as for the totals, restricted to rows
# that satisfy the corporate condition. The condition is wrapped in parentheses
# so its OR terms are evaluated before the AND with the year filter.
corporate_country = duckdb.sql(f"""
    SELECT country AS ISO_A3, COUNT(*) AS total, year, month
    FROM db
    WHERE ({corporate_where}) AND year > 2018 
    GROUP BY ISO_A3, year, month

""")
# Print a preview of the result, then convert it to a pandas DataFrame.
corporate_country.show()
corporate_country_df = corporate_country.to_df()

┌─────────┬───────┬─────────┬─────────┐
│ ISO_A3  │ total │  year   │  month  │
│ varchar │ int64 │ varchar │ varchar │
├─────────┼───────┼─────────┼─────────┤
│ GMB     │    73 │ 2022    │ 1       │
│ ALB     │   865 │ 2022    │ 1       │
│ CIV     │   417 │ 2022    │ 1       │
│ SLE     │    50 │ 2022    │ 1       │
│ NER     │    24 │ 2022    │ 1       │
│ LBN     │  4112 │ 2022    │ 1       │
│ MSR     │     7 │ 2022    │ 1       │
│ FLK     │     1 │ 2022    │ 1       │
│ KHM     │   226 │ 2022    │ 1       │
│ MOZ     │   149 │ 2022    │ 1       │
│  ·      │     · │  ·      │ ·       │
│  ·      │     · │  ·      │ ·       │
│  ·      │     · │  ·      │ ·       │
│ GUY     │    32 │ 2023    │ 6       │
│ BTN     │    12 │ 2023    │ 6       │
│ STP     │     3 │ 2023    │ 8       │
│ VUT     │     3 │ 2023    │ 5       │
│ VGB     │     4 │ 2023    │ 8       │
│ STP     │     1 │ 2023    │ 4       │
│ MUS     │    25 │ 2023    │ 2       │
│ MDV     │     9 │ 2023    │ 4       │


In [11]:
# Cache the raw monthly corporate counts on disk. No `index` argument is given,
# so the index is written as well and reappears as 'Unnamed: 0' on reload.
corporate_country_df.to_csv('fig2_corporate_country_edits_monthly_raw.csv', header = True)

In [17]:
# Reload the cached corporate counts and discard the index column that was
# saved along with the data.
corporate_country_df = (
    pd.read_csv('fig2_corporate_country_edits_monthly_raw.csv')
    .drop(columns=['Unnamed: 0'])
)

In [18]:
# Work on a copy so the reloaded raw table stays unchanged.
df_corporate = corporate_country_df.copy()

In [19]:
# Combine the year and month columns into a monthly pandas Period ('time'),
# so rows can be ordered and filtered chronologically.
year_month = df_corporate["year"].astype(str) + '-' + df_corporate["month"].astype(str)
df_corporate['time'] = pd.to_datetime(year_month, format='%Y-%m').dt.to_period('M')


In [20]:
# 'time' now carries the date information, so the separate parts can go.
df_corporate = df_corporate.drop(columns=['year', 'month'])

In [21]:
# Order the rows chronologically and keep only the months before 2023-07.
before_cutoff = df_corporate['time'] < '2023-07'
df_corporate = df_corporate.sort_values(by='time').loc[before_cutoff]

In [23]:
# Keep only the months after 2019-05, i.e. from 2019-06 onward.
df_corporate = df_corporate[df_corporate['time'] > '2019-05']

In [24]:
# Inspect the filtered monthly corporate counts (2019-06 through 2023-06).
df_corporate

Unnamed: 0,ISO_A3,total,time
8368,TKM,390,2019-06
6106,BRB,22,2019-06
8367,UGA,6,2019-06
8370,NZL,9,2019-06
7787,MLT,1,2019-06
...,...,...,...
7409,NZL,165,2023-06
6919,TCD,114,2023-06
6918,KHM,2708,2023-06
6917,NER,135,2023-06


In [25]:
# Per-country table without the month column; drop() returns a new frame,
# so df_corporate itself is left as it is.
corporate = df_corporate.drop(columns=['time'])

In [26]:
# Add up the monthly corporate counts per country; ISO_A3 becomes the index.
corporate = corporate.groupby('ISO_A3').sum()

In [27]:
# Show the 20 countries with the highest corporate count.
corporate.sort_values(by ='total', ascending = False).head(20)

Unnamed: 0_level_0,total
ISO_A3,Unnamed: 1_level_1
IDN,13883393
MEX,12915669
IND,10995041
BRA,10129313
USA,6723148
TZA,4917891
VNM,4077101
PHL,3729634
COL,3002557
CHL,2798311


In [28]:
# Rename the count column so it does not collide with 'total' when the two
# per-country tables are merged.
corporate = corporate.rename(columns={'total':'corporate'})


**GeoJSON**

In [29]:
# Attach each country's corporate count to its overall count. The left join
# keeps every country present in `total`; a country without any corporate rows
# ends up with NaN in 'corporate' and therefore NaN in 'percentage'.
# NOTE(review): consider whether such countries should be reported as 0 instead.
combi = total.merge(corporate, on='ISO_A3', how='left')
# Corporate share of all edits, in percent.
combi['percentage'] = combi['corporate'].div(combi['total']).mul(100)

In [30]:
# Turn the ISO_A3 index back into an ordinary column.
combi = combi.reset_index()

In [32]:
# Countries ranked by corporate share, highest first.
combi.sort_values(by = 'percentage', ascending = False)

Unnamed: 0,ISO_A3,total,corporate,percentage
18,BHR,161936,136129.0,84.063457
200,URY,1023256,743692.0,72.678978
123,MEX,18922786,12915669.0,68.254585
159,QAT,624800,376576.0,60.271447
40,COL,5831297,3002557.0,51.490380
...,...,...,...,...
155,POL,34486362,17094.0,0.049567
116,LUX,994546,160.0,0.016088
172,SMR,55003,6.0,0.010908
119,MCO,12178,1.0,0.008212


In [117]:
# Save the per-country totals, corporate counts and shares. No `index` argument
# is given, so the index is written too and shows up as 'Unnamed: 0' on reload.
combi.to_csv('fig2_t0_t1_total_corporate_edits.csv', header = True)

In [36]:
# Join the per-country figures onto the boundary polygons. The left join keeps
# every polygon, so areas without a matching ISO_A3 get NaN in the data columns.
gdf = border.merge(combi, on = 'ISO_A3', how = 'left')

In [37]:
# Inspect the joined geometry + statistics table.
gdf

Unnamed: 0,ISO_A3,geometry,total,corporate,percentage
0,UGA,"POLYGON ((33.92110 -1.00194, 33.92027 -1.00111...",10707841.0,107930.0,1.007953
1,UZB,"POLYGON ((70.97081 42.25467, 70.98054 42.26205...",3332947.0,177514.0,5.326037
2,IRL,"MULTIPOLYGON (((-9.97014 54.02083, -9.93833 53...",5342883.0,14587.0,0.273017
3,ERI,"MULTIPOLYGON (((40.13583 15.75250, 40.12861 15...",157529.0,1431.0,0.908404
4,,"MULTIPOLYGON (((-26.24361 -58.49473, -26.24889...",,,
...,...,...,...,...,...
251,BEL,"POLYGON ((6.01180 50.75727, 6.05472 50.72361, ...",10140149.0,7072.0,0.069743
252,WSM,"MULTIPOLYGON (((-171.42920 -14.01625, -171.441...",53139.0,964.0,1.814110
253,AIA,"POLYGON ((-63.15375 18.16528, -63.16778 18.164...",18326.0,764.0,4.168940
254,ISR,"POLYGON ((35.62364 33.24573, 35.63249 33.24637...",1505869.0,4464.0,0.296440


In [38]:
# Export the joined layer as GeoJSON.
gdf.to_file('fig2_corporate_edits_period_t0_t1.geojson', driver="GeoJSON")

In [39]:
# Reload the saved per-country table of totals and corporate shares.
t = pd.read_csv("fig2_t0_t1_total_corporate_edits.csv")

In [44]:
# Column sums of the saved table. The preceding sort does not affect the sums.
# Only the 'total' and 'corporate' sums are of interest here: the string column
# ISO_A3 is simply concatenated, and summing 'Unnamed: 0' (saved index) or
# 'percentage' (per-country shares) does not give an interpretable figure.
t.sort_values(by = 'percentage', ascending = False).sum()

Unnamed: 0                                                23005
ISO_A3        BHRURYMEXQATCOLCHLMYSVNMAREDOMINDGRCIDNSWZOMNK...
total                                                1111799727
corporate                                           100501914.0
percentage                                          1853.314679
dtype: object

In [46]:
# Share, in percent, of 78204569 in the overall corporate count 100501914
# (the 'corporate' column sum of the saved table).
# NOTE(review): the origin of 78204569 is not shown in this notebook —
# presumably the corporate count of a subset of countries; confirm and derive it in code.
(78204569 / 100501914) * 100

77.81400959189693