In [2]:
import pandas as pd
import geopandas as gpd
import numpy as np 
import matplotlib.pyplot as plt 
import duckdb
import shapely.wkb
from shapely.wkb import loads

import glob
import os

## Data Download

In [2]:
# Relation over all parquet files below the year=*/month=* folders; with hive
# partitioning enabled the folder names become the `year` and `month` columns.
db = duckdb.read_parquet(
    r"D:\Datenbank\ohsome-stats-yearly\year=*\month=*\*.parquet",
    hive_partitioning=True,
)

In [3]:
# Hashtag substrings that mark an edit as corporate; each entry is matched
# case-insensitively with ILIKE '%<entry>%' when the SQL filter is built.
hashtag_list = [
    'amap',
    'adt',
    'bolt',
    'DigitalEgypt',
    'expedia',
    'gojek',
    'MSFTOpenMaps',
    'grab',
    'Kaart',
    'Kontur',
    'mbx',
    'RocketData',
    'disputed_by_claimed_by',
    'Snapp',
    'stackbox',
    'Telenav',
    'Lightcyphers',
    'tomtom',
    'TIDBO',
    'WIGeoGIS-OMV',
    'نشان',
    'mapbox',
    'Komoot',
    'AppLogica',
]

In [4]:
# Folder that holds the user name / user ID tables
directory_path = r"C:\Users\lilly\Documents\bachelorarbeit\analysis\UserNameID-v2"

# Every file in that folder whose name ends in .xls
excel_files = glob.glob(os.path.join(directory_path, "*.xls"))

# Map each file name (without its folder) to the DataFrame parsed from it.
# NOTE(review): the files carry an .xls extension but are parsed with
# pd.read_csv, so they are presumably plain-text CSV exports - confirm;
# genuine Excel workbooks would need pd.read_excel instead.
dataframes_dict = {
    os.path.basename(path): pd.read_csv(path)
    for path in excel_files
}

In [5]:
# Turn the user IDs of one loaded table into the text of a SQL IN (...) list
def getListID(filename):
    """Return the 'User ID' column of a loaded table as one SQL-ready string.

    Parameters
    ----------
    filename : key into ``dataframes_dict`` (the file's base name).

    Returns
    -------
    str
        Every value of the 'User ID' column wrapped in single quotes and
        joined with commas, e.g. ``'1','2','3'``.
    """
    id_column = dataframes_dict[filename]['User ID']
    quoted_ids = [f"'{user_id}'" for user_id in id_column]
    return ','.join(quoted_ids)

# Quoted, comma-separated user IDs taken from the "MetaUser.xls" table;
# this string is spliced into the `user_id IN (...)` part of the SQL filter.
Meta = getListID("MetaUser.xls")

In [6]:
def generate_ce_statement_meta(hashtags_list, user_ids=None):
    """Build the SQL boolean expression that selects corporate edits.

    A row matches when its ``hashtags`` value contains any of the given tags
    (case-insensitive ``ILIKE '%tag%'``) or when its ``user_id`` is one of the
    listed IDs.

    Parameters
    ----------
    hashtags_list : iterable of hashtag substrings to look for.
    user_ids : str, optional
        Ready-made content of the ``IN (...)`` list, e.g. ``"'1','2'"``.
        Defaults to the notebook-level ``Meta`` string, which keeps the
        original one-argument call working unchanged.

    Returns
    -------
    str
        The conditions joined with `` OR ``. When there is neither a hashtag
        nor a user ID the literal ``FALSE`` is returned, so the expression is
        still valid SQL and matches nothing.
    """
    if user_ids is None:
        # Backward-compatible default: the ID list prepared earlier in the notebook.
        user_ids = Meta

    conditions = []
    for tag in hashtags_list:
        # Double single quotes so a tag containing ' cannot break the SQL string literal.
        # NOTE(review): '_' and '%' inside a tag still act as LIKE wildcards
        # (e.g. 'disputed_by_claimed_by') - confirm that this is intended.
        escaped_tag = str(tag).replace("'", "''")
        conditions.append(f"hashtags ILIKE '%{escaped_tag}%'")

    if user_ids:
        # An empty list would produce the invalid `IN ()`, so it is left out.
        conditions.append(f"user_id IN ({user_ids})")

    if not conditions:
        return "FALSE"
    return " OR ".join(conditions)

In [41]:
# SQL condition matching any listed hashtag or any user ID in `Meta`;
# it is inserted into the WHERE clause of the corporate query.
corporate_where = generate_ce_statement_meta(hashtag_list)

In [3]:
# Load the world administrative boundaries shapefile with geopandas.
b3 = gpd.read_file(r"C:\Users\Lilly\Downloads\world-administrative-boundaries\world-administrative-boundaries.shp")

In [145]:
# Keep only the ISO-3 code and the geometry, and rename the code column to
# 'ISO_A3' so it carries the same name as the key of the edit statistics.
border = b3[['iso3', 'geometry']].rename(columns={'iso3': 'ISO_A3'})

**TOTAL**

In [14]:
# Count all rows of `db` per country and per month for the years after 2018.
# NOTE(review): each row of `db` is presumably one edit/contribution - confirm
# against the dataset schema.
# NOTE(review): `year` is reported as varchar in the output, so `year > 2018`
# relies on an implicit cast - confirm it compares as intended.
country_total = duckdb.sql("""
    SELECT country AS ISO_A3, COUNT(*) AS total, year, month
    FROM db
    WHERE year > 2018 
    GROUP BY ISO_A3, year, month

""")
# Print a preview of the result, then convert it to a pandas DataFrame.
country_total.show()
country_total_df = country_total.to_df()

┌─────────┬────────┬─────────┬─────────┐
│ ISO_A3  │ total  │  year   │  month  │
│ varchar │ int64  │ varchar │ varchar │
├─────────┼────────┼─────────┼─────────┤
│ RUS     │ 706627 │ 2019    │ 1       │
│ MRT     │  24870 │ 2019    │ 1       │
│ GMB     │   3885 │ 2019    │ 1       │
│ GIN     │ 122957 │ 2019    │ 1       │
│ JPN     │ 473778 │ 2019    │ 1       │
│ BRA     │ 488209 │ 2019    │ 1       │
│ COL     │  35008 │ 2019    │ 1       │
│ MEX     │ 183595 │ 2019    │ 1       │
│ IND     │ 231672 │ 2019    │ 1       │
│ KAZ     │  67453 │ 2019    │ 1       │
│  ·      │     ·  │  ·      │ ·       │
│  ·      │     ·  │  ·      │ ·       │
│  ·      │     ·  │  ·      │ ·       │
│ KIR     │   1902 │ 2019    │ 10      │
│ SSD     │   3340 │ 2019    │ 10      │
│ BHR     │    719 │ 2019    │ 10      │
│ IRQ     │  17355 │ 2019    │ 10      │
│ ROU     │  54999 │ 2019    │ 10      │
│ NER     │   5338 │ 2019    │ 10      │
│ MDA     │  29345 │ 2019    │ 10      │
│ NAM     │   88

In [15]:
# Work on a copy so that the query result `country_total_df` stays unchanged.
df_total = country_total_df.copy()

In [16]:
# Merge 'year' and 'month' into a single monthly Period column 'time', which
# allows rows to be sorted and filtered chronologically.
df_total["time"] = pd.to_datetime(
    df_total["year"].astype(str) + '-' + df_total["month"].astype(str),
    format='%Y-%m',
).dt.to_period('M')


In [19]:
# 'year' and 'month' are now redundant with 'time'.
df_total = df_total.drop(columns=['year', 'month'])

In [27]:
# Order chronologically, then keep only the months before 2023-07
# (the last month kept is 2023-06).
df_total = df_total.sort_values(by='time')
df_total = df_total.loc[df_total['time'] < '2023-07']

In [28]:
# Keep only the months after 2019-05 (the first month kept is 2019-06).
df_total = df_total.loc[df_total['time']>'2019-05']

In [30]:
# New frame without the 'time' column, leaving the country code and the counts;
# `df_total` itself is not modified.
total = df_total.drop(columns=['time'])

In [32]:
# Add up the monthly counts per country; 'ISO_A3' becomes the index.
total = total.groupby('ISO_A3').sum()

In [39]:
# Show the 20 countries with the highest total count.
total.sort_values(by ='total', ascending = False).head(20)

Unnamed: 0_level_0,total
ISO_A3,Unnamed: 1_level_1
USA,167025695
DEU,75883752
FRA,52754006
RUS,41587488
IDN,38793596
GBR,35714606
POL,34486362
BRA,31709726
JPN,26543286
IND,26146765


**CORPORATE**

In [50]:
# Same per-country, per-month count as the total query, restricted to the rows
# that satisfy `corporate_where` (hashtag match or listed user ID).
# The filter is wrapped in parentheses so its OR terms are combined before the
# AND with the year condition.
corporate_country = duckdb.sql(f"""
    SELECT country AS ISO_A3, COUNT(*) AS total, year, month
    FROM db
    WHERE ({corporate_where}) AND year > 2018 
    GROUP BY ISO_A3, year, month

""")
# Print a preview of the result, then convert it to a pandas DataFrame.
corporate_country.show()
corporate_country_df = corporate_country.to_df()

┌─────────┬───────┬─────────┬─────────┐
│ ISO_A3  │ total │  year   │  month  │
│ varchar │ int64 │ varchar │ varchar │
├─────────┼───────┼─────────┼─────────┤
│ MNE     │     1 │ 2021    │ 8       │
│ SGP     │    54 │ 2021    │ 8       │
│ GNQ     │    13 │ 2021    │ 8       │
│ LBN     │     1 │ 2021    │ 8       │
│ SWZ     │     6 │ 2022    │ 1       │
│ TGO     │    11 │ 2021    │ 1       │
│ OMN     │    13 │ 2021    │ 3       │
│ HUN     │     1 │ 2021    │ 6       │
│ IRL     │     9 │ 2021    │ 7       │
│ TCD     │     1 │ 2021    │ 7       │
│  ·      │     · │  ·      │ ·       │
│  ·      │     · │  ·      │ ·       │
│  ·      │     · │  ·      │ ·       │
│ CHE     │     4 │ 2021    │ 2       │
│ IRL     │     7 │ 2021    │ 3       │
│ NAM     │     1 │ 2021    │ 3       │
│ MOZ     │    47 │ 2021    │ 4       │
│ KNA     │    48 │ 2021    │ 4       │
│ LBY     │    10 │ 2021    │ 5       │
│ BMU     │     3 │ 2021    │ 5       │
│ SYC     │     9 │ 2021    │ 6       │


In [57]:
# Work on a copy so that the query result `corporate_country_df` stays unchanged.
df_corporate = corporate_country_df.copy()

In [58]:
# Merge 'year' and 'month' into a single monthly Period column 'time', which
# allows rows to be sorted and filtered chronologically.
df_corporate["time"] = pd.to_datetime(
    df_corporate["year"].astype(str) + '-' + df_corporate["month"].astype(str),
    format='%Y-%m',
).dt.to_period('M')

In [59]:
# 'year' and 'month' are now redundant with 'time'.
df_corporate = df_corporate.drop(columns=['year', 'month'])

In [60]:
# Order chronologically, then keep only the months before 2023-07
# (the last month kept is 2023-06).
df_corporate = df_corporate.sort_values(by='time')
df_corporate = df_corporate.loc[df_corporate['time'] < '2023-07']

In [63]:
# Keep only the months after 2019-05 (the first month kept is 2019-06).
df_corporate = df_corporate.loc[df_corporate['time']>'2019-05']

In [64]:
# New frame without the 'time' column, leaving the country code and the counts;
# `df_corporate` itself is not modified.
corporate = df_corporate.drop(columns=['time'])

In [65]:
# Add up the monthly counts per country; 'ISO_A3' becomes the index.
corporate = corporate.groupby('ISO_A3').sum()

In [68]:
# Show the 20 countries with the highest corporate count.
corporate.sort_values(by ='total', ascending = False).head(20)

Unnamed: 0_level_0,total
ISO_A3,Unnamed: 1_level_1
IDN,13883393
MEX,12915669
IND,10995041
BRA,10129313
USA,6723148
TZA,4917891
VNM,4077101
PHL,3729634
COL,3002557
CHL,2798311


In [69]:
# Rename the count column so it does not clash with the 'total' column of
# `total` when the two tables are merged.
corporate = corporate.rename(columns={'total':'corporate'})

**GeoJSON**

In [111]:
# Left join: every country present in `total` is kept, with its corporate
# count attached where one exists.
combi = pd.merge(total, corporate, on = 'ISO_A3', how = 'left')
# A country missing from `corporate` has no corporate edits in the period. The
# join leaves NaN there, which would also turn the percentage into NaN, so
# record an explicit 0 instead.
combi['corporate'] = combi['corporate'].fillna(0).astype('int64')
# Share of corporate edits among all edits, in percent.
combi['percentage'] = (combi['corporate'] / combi['total'])*100

In [112]:
# Move the index into a regular column (presumably 'ISO_A3' after the merge on
# that index name - confirm) so it can serve as the key for the merge with `border`.
combi = combi.reset_index()

In [117]:
# Save the per-country table. After reset_index() the index is only a running
# row number, so it is not written (it would become an unnamed first column).
combi.to_csv('t0_t1_total_corporate_edits.csv', header = True, index = False)

In [146]:
# Attach the edit statistics to the country geometries. The left join keeps
# every boundary, including those without a matching 'ISO_A3' in `combi`.
gdf = border.merge(combi, on = 'ISO_A3', how = 'left')

In [148]:
# Export the joined geometries and statistics as a GeoJSON file.
gdf.to_file('corporate_edits_period_t0_t1_v3.geojson', driver="GeoJSON")