### 1) Hall-Tideman Index Calculation

In [1]:
def calculate_HTI(shares):
    N = len(shares) # num_firms
    # sort tag shares in descending order
    sorted_shares = sorted(shares, reverse = True)
    # calculate summation part of the denominator
    sum_part = sum(sorted_shares[i] * (N-(i+1)) for i in range(N))
    # calculate HTI
    HTI = 1/(N+sum_part)
    return HTI

In [2]:
# Example usage
market_shares = [0.40, 0.30, 0.20, 0.10]
hti = calculate_HTI(market_shares)
print(f"Hall-Tideman Index: {hti:.4f}")


Hall-Tideman Index: 0.1667


In [3]:
# Import Modules
import pandas as pd
import numpy as np
import sqlite3
from nltk import FreqDist
import pickle
import math
import matplotlib.pyplot as plt
import statsmodels.formula.api as sm
with open('/data1/StackOverflow/Tag_Analysis/df_tags_2023.pickle', 'rb') as fr:
    df_tags = pickle.load(fr)
def wc(text):
    """
    Cleaning function to be used with our first wordcloud
    """
    
    if text:
        tags = text.replace('><',' ')
        tags = tags.replace('-','')
        tags = tags.replace('.','DOT')
        tags = tags.replace('c++','Cpp')
        tags = tags.replace('c#','Csharp')
        tags = tags.replace('>','')
        return tags.replace('<','')
    else:
        return 'None'
    
def clean_tags(text):
    """
    Cleaning function for tags
    """
    
    if text:
        tags = text.replace('><',' ')
        tags = tags.replace('>','')
        return tags.replace('<','')
    else:
        return 'None'
    
def tag_freq(data):
    tags = data['tags'].str.replace('[\["\]]', '', regex=True)
    tags = [tag for i in tags.apply(lambda x: wc(x)) for tag in i.split(', ')]
    result = FreqDist(tags)
    return result
# Preprocessing
df_tags['creation_date'] = pd.to_datetime(df_tags['creation_date'])
df_tags['year_month'] = df_tags['creation_date'].dt.to_period('D')
df_tags['year_month'] = df_tags['year_month'].astype(str)
year_month = df_tags.year_month.unique()

In [4]:
# load pickle
with open('/data1/StackOverflow/diff_in_diff/daily_tagShare_modified.pickle', 'rb') as fr:
    all_keys = pickle.load(fr)

In [5]:
all_keys

Unnamed: 0,index,tag,tag_2020_11_30,tagShare_2020_11_30,tag_2020_12_01,tagShare_2020_12_01,tag_2020_12_02,tagShare_2020_12_02,tag_2020_12_03,tagShare_2020_12_03,...,tag_2023_08_30,tagShare_2023_08_30,tag_2023_08_31,tagShare_2023_08_31,tag_2023_09_01,tagShare_2023_09_01,tag_2023_09_02,tagShare_2023_09_02,tag_2023_09_03,tagShare_2023_09_03
0,amazonwebservices,52132,47.0,0.315352,59.0,0.368819,57.0,0.352505,86.0,0.493657,...,47.0,0.409087,57.0,0.503890,38.0,0.389744,25.0,0.41625,13.0,0.689655
1,forloop,16939,19.0,0.127483,21.0,0.131275,27.0,0.166976,32.0,0.183686,...,11.0,0.095744,4.0,0.035361,4.0,0.041026,3.0,0.04995,3.0,0.159151
2,foreach,3614,9.0,0.060386,5.0,0.031256,3.0,0.018553,6.0,0.034441,...,3.0,0.026112,2.0,0.017680,1.0,0.010256,1.0,0.01665,,
3,terraform,11491,13.0,0.087225,12.0,0.075014,10.0,0.061843,8.0,0.045922,...,13.0,0.113152,17.0,0.150283,13.0,0.133333,2.0,0.03330,,
4,awsglue,2203,3.0,0.020129,5.0,0.031256,2.0,0.012369,1.0,0.005740,...,2.0,0.017408,,,2.0,0.020513,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51948,applespeech,1,,,,,,,,,...,,,,,,,,,1.0,0.053050
51949,jsonloader,1,,,,,,,,,...,,,,,,,,,1.0,0.053050
51950,livescript,1,,,,,,,,,...,,,,,,,,,1.0,0.053050
51951,nxhtml,1,,,,,,,,,...,,,,,,,,,1.0,0.053050


In [6]:
arr = all_keys.iloc[:,2]
arr[~np.isnan(arr)]

0       47.0
1       19.0
2        9.0
3       13.0
4        3.0
        ... 
3721     1.0
3722     1.0
3723     1.0
3724     1.0
3725     1.0
Name: tag_2020_11_30, Length: 3726, dtype: float64

In [7]:
HTI_Score = []
for i in range(3, all_keys.shape[1], 2):
    arr = all_keys.iloc[:,i]
    arr = arr[~np.isnan(arr)]
    HTI_Score.append(calculate_HTI(arr))
result_HTI = pd.DataFrame({'year_month':year_month, 'HTI_Score':HTI_Score})

In [8]:
result_HTI

Unnamed: 0,year_month,HTI_Score
0,2020-11-30,0.000003
1,2020-12-01,0.000003
2,2020-12-02,0.000003
3,2020-12-03,0.000003
4,2020-12-04,0.000003
...,...,...
1003,2023-08-30,0.000003
1004,2023-08-31,0.000003
1005,2023-09-01,0.000004
1006,2023-09-02,0.000006


In [9]:
entropy = list(result_HTI[(result_HTI['year_month'] > '2021-08-31') &
           (result_HTI['year_month'] < '2023-09-01')].reset_index().HTI_Score) # fixed datetime
# Split Data
control_data = pd.DataFrame({'HHI' : entropy[:365],
              'T_d': [0]*len(entropy[:365]),
              'P_t' : [0]*90 + [1]*275})
treated_data = pd.DataFrame({'HHI' : entropy[365:],
              'T_d': [1]*len(entropy[365:]),
              'P_t' : [0]*90 + [1]*275})
df_did = pd.concat([control_data, treated_data], axis = 0).reset_index(drop = True)
# Add date and month feature
df_did['date'] = result_HTI[(result_HTI['year_month'] > '2021-08-31') &
           (result_HTI['year_month'] < '2023-09-01')].reset_index().year_month
df_did['month'] = pd.to_datetime(df_did['date']).dt.month
# Apply log
df_did['ln_y'] = np.log(df_did['HHI'])

In [12]:
sm.ols('ln_y ~ T_d + P_t + T_d * P_t + C(month)', df_did).fit(cov_type='HC3').summary()

0,1,2,3
Dep. Variable:,ln_y,R-squared:,0.088
Model:,OLS,Adj. R-squared:,0.07
Method:,Least Squares,F-statistic:,4.944
Date:,"Mon, 20 May 2024",Prob (F-statistic):,7.84e-09
Time:,13:34:32,Log-Likelihood:,67.552
No. Observations:,730,AIC:,-105.1
Df Residuals:,715,BIC:,-36.21
Df Model:,14,,
Covariance Type:,HC3,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-12.3150,0.136,-90.644,0.000,-12.581,-12.049
C(month)[T.2],-0.0306,0.041,-0.748,0.454,-0.111,0.049
C(month)[T.3],-0.0005,0.039,-0.013,0.990,-0.078,0.077
C(month)[T.4],0.0878,0.041,2.144,0.032,0.008,0.168
C(month)[T.5],0.1103,0.041,2.715,0.007,0.031,0.190
C(month)[T.6],0.1019,0.042,2.436,0.015,0.020,0.184
C(month)[T.7],0.1394,0.043,3.255,0.001,0.055,0.223
C(month)[T.8],0.0982,0.040,2.433,0.015,0.019,0.177
C(month)[T.9],-0.1621,0.138,-1.176,0.239,-0.432,0.108

0,1,2,3
Omnibus:,164.843,Durbin-Watson:,1.108
Prob(Omnibus):,0.0,Jarque-Bera (JB):,108.171
Skew:,0.825,Prob(JB):,3.24e-24
Kurtosis:,2.087,Cond. No.,65.2


### 2) Gini Coefficient

In [18]:
def calculate_gini(shares):
    shares = sorted(shares)
    N = len(shares)
    total_sum = sum(shares)
    abs_diffs = 0
    for i in range(N):
        for j in range(N):
            abs_diffs += abs(shares[i]-shares[j])
    gini_coeff = abs_diffs / (2 * N * total_sum)
    return gini_coeff

In [20]:
# Example usage
market_shares = [0.25, 0.25, 0.25, 0.25]
#market_shares = sorted(market_shares)
gini = calculate_gini(market_shares)
print(f"Gini Coefficient: {gini:.4f}")


Gini Coefficient: 0.0000


In [71]:
arr = all_keys.iloc[:,3]
arr = arr[~np.isnan(arr)]

In [72]:
calculate_gini(arr)

0.679572668108657

In [77]:
sum(arr)

99.99999999999565

In [90]:
all_keys.iloc[:,3].sum()

100.0

In [89]:
all_keys.iloc[:,5].sum()

100.00000000000001

In [91]:
all_keys.iloc[:,7].sum()

100.0

In [93]:
arr = all_keys.iloc[:,7]
arr = arr[~np.isnan(arr)]
calculate_gini(arr)

KeyError: 19

In [68]:
Gini_coeff = []
for i in range(3, all_keys.shape[1], 2):
    arr = all_keys.iloc[:,i]
    arr = arr[~np.isnan(arr)]
    Gini_coeff.append(calculate_gini(arr))
result_coeff = pd.DataFrame({'year_month':year_month, 'Gini_coeff':Gini_coeff})

KeyError: 19

In [None]:
entropy = list(result_coeff[(result_coeff['year_month'] > '2021-08-31') &
           (result_coeff['year_month'] < '2023-09-01')].reset_index().Gini_coeff) # fixed datetime
# Split Data
control_data = pd.DataFrame({'HHI' : entropy[:365],
              'T_d': [0]*len(entropy[:365]),
              'P_t' : [0]*90 + [1]*275})
treated_data = pd.DataFrame({'HHI' : entropy[365:],
              'T_d': [1]*len(entropy[365:]),
              'P_t' : [0]*90 + [1]*275})
df_did = pd.concat([control_data, treated_data], axis = 0).reset_index(drop = True)
# Add date and month feature
df_did['date'] = result_coeff[(result_coeff['year_month'] > '2021-08-31') &
           (result_coeff['year_month'] < '2023-09-01')].reset_index().year_month
df_did['month'] = pd.to_datetime(df_did['date']).dt.month
# Apply log
df_did['ln_y'] = np.log(df_did['HHI'])

In [None]:
sm.ols('ln_y ~ T_d + P_t + T_d * P_t + C(month)', df_did).fit().summary().tables[1]

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-12.3150,0.164,-75.264,0.000,-12.636,-11.994
C(month)[T.2],-0.0306,0.041,-0.744,0.457,-0.111,0.050
C(month)[T.3],-0.0005,0.040,-0.013,0.990,-0.079,0.078
C(month)[T.4],0.0878,0.040,2.175,0.030,0.009,0.167
C(month)[T.5],0.1103,0.040,2.755,0.006,0.032,0.189
C(month)[T.6],0.1019,0.040,2.525,0.012,0.023,0.181
C(month)[T.7],0.1394,0.040,3.481,0.001,0.061,0.218
C(month)[T.8],0.0982,0.040,2.453,0.014,0.020,0.177
C(month)[T.9],-0.1621,0.165,-0.980,0.327,-0.487,0.162
