In [1]:
import pandas as pd
import geopandas as gpd
import numpy as np 
import matplotlib.pyplot as plt 
import duckdb
import seaborn as sns
import statsmodels.api as sm

import glob
import os

In [19]:
# Open each country's hive-partitioned parquet dataset (one `h3_cell_id=*` folder per cell).
# The variable names `idn`, `col` and `are` are load-bearing: the SQL strings further down
# refer to them by name (`FROM idn`, `FROM col`, `FROM are`).
# NOTE(review): all three paths are absolute and machine-specific.
idn = duckdb.read_parquet(r"D:\corporate_extended\IDN_data\IDN_data\h3_cell_id=*\*.parquet", hive_partitioning=1)

# NOTE(review): unlike the other two datasets, this glob only matches files whose name ends
# in `data_0.parquet` - confirm that leaving out any other part files is intended.
col = duckdb.read_parquet(r"D:\corporate_extended\COL_data\COL_data\h3_cell_id=*\*data_0.parquet", hive_partitioning=1)

are = duckdb.read_parquet(r"C:\Users\Lilly\Downloads\ARE_data\ARE_data\h3_cell_id=*\*.parquet", hive_partitioning=1)

In [3]:
# Hashtag fragments that mark an edit as corporate. Each entry is embedded in a
# case-insensitive substring pattern (`hashtags ILIKE '%<tag>%'`) by the statement builders,
# so an entry also matches longer hashtags that merely contain it, and the `_` characters in
# 'disputed_by_claimed_by' act as single-character wildcards there.
hashtag_list = [
    'amap', 'adt', 'bolt', 'DigitalEgypt', 'expedia', 'gojek',
    'MSFTOpenMaps', 'grab', 'Kaart', 'Kontur', 'mbx', 'RocketData',
    'disputed_by_claimed_by', 'Snapp', 'stackbox', 'Telenav',
    'Lightcyphers', 'tomtom', 'TIDBO', 'WIGeoGIS-OMV', 'نشان',
    'mapbox', 'Komoot', 'AppLogica',
]

In [8]:
def getListID(filename):
    """Return the `User ID` column of one loaded table as a SQL-ready list.

    Parameters
    ----------
    filename : str
        Key into the module-level `dataframes_dict` (the file's base name).

    Returns
    -------
    str
        Comma-separated, single-quoted ids, e.g. "'11','22'", ready to be placed
        inside `IN (...)`.

    Raises
    ------
    KeyError
        If `filename` is not in `dataframes_dict` or the table has no `User ID` column.
    """
    user_ids = dataframes_dict[filename]['User ID'].dropna()

    # A single missing cell makes pandas read the whole column as float, which would
    # render ids as '11.0' (and the gap itself as 'nan'). Restore whole numbers.
    if user_ids.dtype.kind == 'f' and (user_ids % 1 == 0).all():
        user_ids = user_ids.astype('int64')

    return ','.join(f"'{user_id}'" for user_id in user_ids)

In [9]:
# Folder that holds the exported user-name/ID tables
directory_path =  r"C:\Users\lilly\Documents\bachelorarbeit\analysis\UserNameID-v2"

# Every file in that folder with an .xls extension
excel_files = glob.glob(os.path.join(directory_path, "*.xls"))

# Map each file's base name to its parsed table.
# NOTE(review): the files carry an .xls extension but are parsed with pandas' CSV reader,
# so they are presumably plain delimited text - confirm.
dataframes_dict = {
    os.path.basename(path): pd.read_csv(path)
    for path in excel_files
}

# Quoted, comma-separated user ids taken from the "MetaUser.xls" table
Meta = getListID("MetaUser.xls")

In [2]:
def generate_ce_statement_meta(hashtags_list):
    """Build the SQL predicate that selects corporate edits (CE).

    A row matches when its `hashtags` value contains any of the given tags
    (case-insensitive substring match) or its `user_id` is one of the ids in the
    module-level `Meta` string.

    Parameters
    ----------
    hashtags_list : iterable of str
        Tags to embed as `hashtags ILIKE '%<tag>%'` conditions. May be empty, in
        which case only the user-id condition is returned.

    Returns
    -------
    str
        The conditions joined with " OR " (no surrounding parentheses).
    """
    conditions = [f"hashtags ILIKE '%{tag}%'" for tag in hashtags_list]
    # Appending before joining gives every condition a proper " OR " separator and
    # avoids a leading "OR" when no hashtags are supplied.
    conditions.append(f"user_id IN ({Meta})")
    return " OR ".join(conditions)

def generate_nonce_statement_meta(hashtags_list):
    """Build the SQL predicate that selects non-corporate edits (NCE).

    A row matches when its `hashtags` value contains none of the given tags
    (case-insensitive substring match) and its `user_id` is not one of the ids in
    the module-level `Meta` string.

    Parameters
    ----------
    hashtags_list : iterable of str
        Tags to embed as `hashtags NOT ILIKE '%<tag>%'` conditions. May be empty,
        in which case only the user-id condition is returned.

    Returns
    -------
    str
        The conditions joined with " AND " (no surrounding parentheses).
    """
    # NOTE(review): in SQL, `NULL NOT ILIKE ...` evaluates to NULL, so rows whose
    # hashtags value is NULL would not satisfy this predicate - confirm whether the
    # data contains NULL hashtags and whether those edits should count as non-corporate.
    conditions = [f"hashtags NOT ILIKE '%{tag}%'" for tag in hashtags_list]
    # Appending before joining gives every condition a proper " AND " separator and
    # avoids a leading "AND" when no hashtags are supplied.
    conditions.append(f"user_id NOT IN ({Meta})")
    return " AND ".join(conditions)

In [12]:
# SQL predicate for corporate edits: any hashtag from hashtag_list, or a Meta user id
meta_where_corpo = generate_ce_statement_meta(hashtag_list)

# SQL predicate for non-corporate edits: none of those hashtags and not a Meta user id
meta_where_non_corpo = generate_nonce_statement_meta(hashtag_list)

In [5]:
def create_string(df):
    """Render the `h3_cell_id` column as a comma-separated list of single-quoted values.

    The result (e.g. "'a','b'") is meant to be placed inside a SQL `IN (...)` clause;
    an empty column yields an empty string.
    """
    quoted_cells = ["'{}'".format(cell) for cell in df['h3_cell_id']]
    return ','.join(quoted_cells)

In [6]:
def addTime(df):
    """Replace the `year` and `month` columns with a single monthly `time` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain `year` and `month` columns. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the remaining columns followed by `time`, a monthly
        period (`period[M]`) built from year and month.
    """
    # Work on a copy so the caller's frame does not silently gain a `time` column.
    with_time = df.copy()

    year_month = with_time["year"].astype(str) + '-' + with_time["month"].astype(str)
    with_time["time"] = pd.to_datetime(year_month, format='%Y-%m').dt.to_period('M')

    return with_time.drop(columns=["year", "month"])

def Edits_monthly(db, where_statement, corporate_list, new_column_name):
    """Count edits per cell and month for the rows selected by `where_statement`.

    Parameters
    ----------
    db : str
        Table name placed after `FROM` (the callers pass 'are', 'col' or 'idn').
    where_statement : str
        SQL predicate, e.g. the corporate or the non-corporate statement.
    corporate_list : str
        Comma-separated quoted cell ids; only these cells are counted.
    new_column_name : str
        Name given to the count column in the result.

    Returns
    -------
    pandas.DataFrame
        Columns `new_column_name`, `name` (the cell id) and `time` (monthly period),
        covering years after 2014 only.
    """
    # Local names are deliberately unlike the table names used in the query text.
    query = f"""
        SELECT year, month, COUNT(*) AS totalEdits, h3_cell_id AS name
        FROM {db}
        WHERE ({where_statement}) AND h3_cell_id IN ({corporate_list}) AND year > 2014
        GROUP BY month, year, h3_cell_id
        ORDER BY year, month ASC
    """
    monthly_counts = duckdb.sql(query).to_df()

    return addTime(monthly_counts).rename(columns={'totalEdits': new_column_name})

def df_timeframes_merged(db, CEwhere_statment, nonCEwhere_statment, corporate_list, f, m, l):
    """Combine monthly corporate and non-corporate counts and split them by timeframe.

    Parameters
    ----------
    db, corporate_list
        Passed through to `Edits_monthly`.
    CEwhere_statment, nonCEwhere_statment : str
        SQL predicates for corporate (`CE`) and non-corporate (`NCE`) edits.
    f, m, l
        First, middle and last month of the study window; compared against the
        monthly `time` column.

    Returns
    -------
    tuple of pandas.DataFrame
        `(total, pre, post)` where total covers f..l, pre covers f..m and post
        covers the months after m up to l. Each has columns `NCE`, `name`, `time`, `CE`.

    Notes
    -----
    The outer merge leaves NaN (not 0) where a cell/month has edits of only one kind.
    """
    corporate = Edits_monthly(db, CEwhere_statment, corporate_list, 'CE')
    non_corporate = Edits_monthly(db, nonCEwhere_statment, corporate_list, 'NCE')

    combined = non_corporate.merge(corporate, how="outer", on=['name', 'time'])

    month = combined['time']
    from_first = month >= f
    up_to_middle = month <= m
    after_middle = month > m
    up_to_last = month <= l

    tf_total = combined.loc[from_first & up_to_last].reset_index(drop=True)
    tf_pre = combined.loc[from_first & up_to_middle].reset_index(drop=True)
    tf_post = combined.loc[after_middle & up_to_last].reset_index(drop=True)

    return tf_total, tf_pre, tf_post

def calculating_columns(calc_type, df, name_column, new_column_name):
    """Aggregate one column per cell using the mean or the median.

    Parameters
    ----------
    calc_type : {'mean', 'median'}
        Aggregation to apply.
    df : pandas.DataFrame
        Must contain `name` and `name_column`.
    name_column : str
        Column to aggregate (NaN values are skipped by pandas).
    new_column_name : str
        Name of the aggregated column in the result.

    Returns
    -------
    pandas.DataFrame
        Two columns: `name` and `new_column_name`, one row per distinct `name`.

    Raises
    ------
    ValueError
        If `calc_type` is neither 'mean' nor 'median'. Previously this case printed
        a message and returned None, which only failed later inside `pd.merge`.
    """
    if calc_type not in ('mean', 'median'):
        raise ValueError(f"calc_type must be 'mean' or 'median', got {calc_type!r}")

    per_cell = df.groupby('name')[name_column].agg(calc_type).reset_index()
    return per_cell.rename(columns={name_column: new_column_name})


def calculate_median_mean_combitable(db, CEwhere_statment, nonCEwhere_statment, corporate_list, f, m, l):
    """Build the per-cell table of monthly-edit medians and means before and after `m`.

    The monthly counts come from `df_timeframes_merged`; t0 is the period f..m and
    t1 the period after m up to l.

    Returns
    -------
    pandas.DataFrame
        One row per cell (`name`) with the columns, in this order:
        t0_nc_median, t1_nc_median, t0_c_median, t1_c_median,
        t0_nc_mean, t1_nc_mean, t0_c_mean, t1_c_mean
        (`nc` = non-corporate edits, `c` = corporate edits).
        A value is NaN when the cell has no counts of that kind in that period.

    Notes
    -----
    Months without any edit of a kind are missing rather than zero in the monthly
    table, so each statistic is taken over the months that do have such edits.
    """
    _, df_pre, df_post = df_timeframes_merged(db, CEwhere_statment, nonCEwhere_statment, corporate_list, f, m, l)

    periods = (('t0', df_pre), ('t1', df_post))
    edit_kinds = (('NCE', 'nc'), ('CE', 'c'))

    merged_table = None
    # Loop order reproduces the column order: statistic, then edit kind, then period.
    for statistic in ('median', 'mean'):
        for source_column, kind_tag in edit_kinds:
            for period_tag, period_df in periods:
                column_name = f"{period_tag}_{kind_tag}_{statistic}"
                per_cell = calculating_columns(statistic, period_df, source_column, column_name)
                if merged_table is None:
                    merged_table = per_cell
                else:
                    merged_table = pd.merge(merged_table, per_cell, on='name', how='outer')

    return merged_table

In [7]:
# Study window as month-year strings: `first`..`middle` is the earlier period (t0),
# the months after `middle` up to `last` are the later period (t1).
first, middle, last = '06-2019', '05-2021', '05-2023'

### United Arab Emirates

In [14]:
# Every H3 cell of the UAE dataset that has at least one row matching the corporate predicate.
# The result is converted straight to a DataFrame; the former intermediate `fetchall()`
# executed the query once more only to discard the rows.
corpolist_are = duckdb.sql(f"""
    SELECT distinct h3_cell_id
    FROM are
    WHERE ({meta_where_corpo})

""").to_df()

# Drop rows without a cell id before building the id lists
cl_are = corpolist_are.dropna()
a_corporate_list = cl_are['h3_cell_id'].tolist()

# workable lists for the cell ids with corporate edits
corporate_list_are = create_string(cl_are)

In [None]:
# Per-cell median/mean table for the UAE, computed for all corporate cells in one query
are_meta = calculate_median_mean_combitable('are',meta_where_corpo, meta_where_non_corpo,corporate_list_are, first, middle, last)

In [16]:
# Persist the UAE table; written relative to the current working directory, row index included
are_meta.to_csv('are_mean_median_meta_hex_v2.csv')

In [17]:
# Show the UAE table as the cell output
are_meta

Unnamed: 0,name,t0_nc_median,t1_nc_median,t0_c_median,t1_c_median,t0_nc_mean,t1_nc_mean,t0_c_mean,t1_c_mean
0,86438415fffffff,1.0,1.0,1.0,2.5,1.0,1.0,1.0,2.500000
1,8643841afffffff,,,1.0,,,,1.0,
2,86438442fffffff,1.0,,1.0,,1.0,,1.0,
3,864384507ffffff,3.0,,2.0,,3.0,,2.0,
4,864384537ffffff,1.0,,1.0,,1.0,,1.0,
...,...,...,...,...,...,...,...,...,...
1510,86534dcc7ffffff,,4.0,,5.0,,4.0,,5.000000
1511,86534dcdfffffff,,1.0,,5.0,,9.0,,12.666667
1512,86534dce7ffffff,,,,1.0,,,,1.000000
1513,86534dcf7ffffff,,1.0,,1.0,,1.0,,8.333333


### Colombia

In [20]:
# Every H3 cell of the Colombia dataset that has at least one row matching the corporate predicate.
# The result is converted straight to a DataFrame; the former intermediate `fetchall()`
# executed the query once more only to discard the rows.
corpolist_col = duckdb.sql(f"""
    SELECT distinct h3_cell_id
    FROM col
    WHERE ({meta_where_corpo})

""").to_df()

# Drop rows without a cell id before building the id lists
cl_col = corpolist_col.dropna()
a_corporate_list = cl_col['h3_cell_id'].tolist()

# workable lists for the cell ids with corporate edits
corporate_list_col = create_string(cl_col)

In [21]:
# Independent copy of the full Colombian cell list (before dropna) used for chunking below
col_df = corpolist_col.copy()

In [22]:
# Batch processing: cut the Colombian cell list into consecutive slices of at most `n` rows
n = 2000  # rows per chunk
chunk_starts = range(0, len(col_df), n)
list_df_col = [col_df.iloc[start:start + n] for start in chunk_starts]

In [23]:
# One quoted cell-id string per chunk. Unpacking into exactly seven names makes this cell
# fail loudly when the number of chunks is not seven; indexing chunks 0..6 by hand would
# silently leave out every chunk after the seventh.
(
    col_corpo_list_1,
    col_corpo_list_2,
    col_corpo_list_3,
    col_corpo_list_4,
    col_corpo_list_5,
    col_corpo_list_6,
    col_corpo_list_7,
) = [create_string(chunk.dropna()) for chunk in list_df_col]

In [27]:
# Colombia, chunk 1 (rows 0-1999 of col_df): per-cell median/mean table
c1 = calculate_median_mean_combitable('col', meta_where_corpo, meta_where_non_corpo, col_corpo_list_1, first, middle, last)

In [28]:
# Colombia, chunk 2 (rows 2000-3999 of col_df): per-cell median/mean table
c2 = calculate_median_mean_combitable('col', meta_where_corpo, meta_where_non_corpo, col_corpo_list_2, first, middle, last)

In [29]:
# Colombia, chunk 3 (rows 4000-5999 of col_df): per-cell median/mean table
c3 = calculate_median_mean_combitable('col', meta_where_corpo, meta_where_non_corpo, col_corpo_list_3, first, middle, last)

In [30]:
# Colombia, chunk 4 (rows 6000-7999 of col_df): per-cell median/mean table
c4 = calculate_median_mean_combitable('col', meta_where_corpo, meta_where_non_corpo, col_corpo_list_4, first, middle, last)

In [31]:
# Colombia, chunk 5 (rows 8000-9999 of col_df): per-cell median/mean table
c5 = calculate_median_mean_combitable('col', meta_where_corpo, meta_where_non_corpo, col_corpo_list_5, first, middle, last)

In [32]:
# Colombia, chunk 6 (rows 10000-11999 of col_df): per-cell median/mean table
c6 = calculate_median_mean_combitable('col', meta_where_corpo, meta_where_non_corpo, col_corpo_list_6, first, middle, last)

In [33]:
# Colombia, chunk 7 (rows 12000-13999 of col_df): per-cell median/mean table
c7 = calculate_median_mean_combitable('col', meta_where_corpo, meta_where_non_corpo, col_corpo_list_7, first, middle, last)

In [34]:
# Stack the per-chunk tables into one Colombia table and write it to disk.
frames = [c1, c2, c3, c4, c5, c6, c7]

# Each chunk table carries its own 0..k row labels; ignore_index renumbers the rows so the
# combined frame (and the index column written to the CSV) has unique labels.
col_meta_result = pd.concat(frames, ignore_index=True)
col_meta_result.to_csv("col_mean_median_meta_hex_v2.csv", header = True)

### Indonesia

In [36]:
# Every H3 cell of the Indonesia dataset that has at least one row matching the corporate predicate.
# The result is converted straight to a DataFrame; the former intermediate `fetchall()`
# executed the query once more only to discard the rows.
corpolist_idn = duckdb.sql(f"""
    SELECT distinct h3_cell_id
    FROM idn
    WHERE ({meta_where_corpo})

""").to_df()

# Drop rows without a cell id before building the id lists
cl_idn = corpolist_idn.dropna()
a_corporate_list = cl_idn['h3_cell_id'].tolist()

# workable lists for the cell ids with corporate edits
corporate_list_idn = create_string(cl_idn)

In [37]:
# Independent copy of the full Indonesian cell list (before dropna) used for chunking below
df_idn = corpolist_idn.copy()

In [41]:
# Batch processing: cut the Indonesian cell list into consecutive slices of at most `n` rows
n = 2000  # rows per chunk
chunk_starts = range(0, len(df_idn), n)
list_df_idn = [df_idn.iloc[start:start + n] for start in chunk_starts]

In [42]:
# One quoted cell-id string per chunk. Unpacking into exactly thirteen names makes this cell
# fail loudly when the number of chunks is not thirteen; indexing chunks 0..12 by hand would
# silently leave out every chunk after the thirteenth.
(
    idn_corpo_list_1,
    idn_corpo_list_2,
    idn_corpo_list_3,
    idn_corpo_list_4,
    idn_corpo_list_5,
    idn_corpo_list_6,
    idn_corpo_list_7,
    idn_corpo_list_8,
    idn_corpo_list_9,
    idn_corpo_list_10,
    idn_corpo_list_11,
    idn_corpo_list_12,
    idn_corpo_list_13,
) = [create_string(chunk.dropna()) for chunk in list_df_idn]

In [43]:
# Indonesia, chunk 1 (rows 0-1999 of df_idn): per-cell median/mean table
i1 = calculate_median_mean_combitable('idn', meta_where_corpo, meta_where_non_corpo, idn_corpo_list_1, first, middle, last)

In [44]:
# Indonesia, chunk 2 (rows 2000-3999 of df_idn): per-cell median/mean table
i2 = calculate_median_mean_combitable('idn',meta_where_corpo, meta_where_non_corpo, idn_corpo_list_2, first, middle, last)

In [45]:
# Indonesia, chunk 3 (rows 4000-5999 of df_idn): per-cell median/mean table
i3 = calculate_median_mean_combitable('idn',meta_where_corpo, meta_where_non_corpo, idn_corpo_list_3, first, middle, last)

In [46]:
# Indonesia, chunk 4 (rows 6000-7999 of df_idn): per-cell median/mean table
i4 = calculate_median_mean_combitable('idn',meta_where_corpo, meta_where_non_corpo, idn_corpo_list_4, first, middle, last)

In [47]:
# Indonesia, chunk 5 (rows 8000-9999 of df_idn): per-cell median/mean table
i5 = calculate_median_mean_combitable('idn',meta_where_corpo, meta_where_non_corpo, idn_corpo_list_5, first, middle, last)

In [48]:
# Indonesia, chunks 6-13: one per-cell median/mean table per chunk of df_idn.
# Each result is bound to its own name as soon as its query finishes, so tables that were
# already computed stay available if a later call in this cell fails.
i6 = calculate_median_mean_combitable('idn',meta_where_corpo, meta_where_non_corpo, idn_corpo_list_6, first, middle, last)
i7 = calculate_median_mean_combitable('idn',meta_where_corpo, meta_where_non_corpo, idn_corpo_list_7, first, middle, last)
i8 = calculate_median_mean_combitable('idn',meta_where_corpo, meta_where_non_corpo, idn_corpo_list_8, first, middle, last)
i9 = calculate_median_mean_combitable('idn',meta_where_corpo, meta_where_non_corpo, idn_corpo_list_9, first, middle, last)
i10 = calculate_median_mean_combitable('idn',meta_where_corpo, meta_where_non_corpo, idn_corpo_list_10, first, middle, last)
i11 = calculate_median_mean_combitable('idn',meta_where_corpo, meta_where_non_corpo, idn_corpo_list_11, first, middle, last)
i12 = calculate_median_mean_combitable('idn',meta_where_corpo, meta_where_non_corpo, idn_corpo_list_12, first, middle, last)
i13 = calculate_median_mean_combitable('idn',meta_where_corpo, meta_where_non_corpo, idn_corpo_list_13, first, middle, last)

In [49]:
# Stack the per-chunk tables into one Indonesia table and write it to disk.
frames = [i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13]

# Each chunk table carries its own 0..k row labels; ignore_index renumbers the rows so the
# combined frame (and the index column written to the CSV) has unique labels.
idn_result = pd.concat(frames, ignore_index=True)
idn_result.to_csv("idn_combi_table_result_meta_v2.csv", header = True)