# Diversity Measure using HHI Score (Arqade)

In [25]:
# Import Modules
import pandas as pd
import numpy as np
import sqlite3
import pandas as pd
from nltk import FreqDist
import pickle
import math
import matplotlib.pyplot as plt
import statsmodels.formula.api as sm

# Import Dataset
conn = sqlite3.connect('/data1/StackOverflow/_Robustness/English/stack.db')
query = '''
SELECT creation_date, tags
FROM questions
WHERE creation_date > '2020-11-30';
'''
df = pd.read_sql_query(query, conn)
conn.close()

In [26]:
df

Unnamed: 0,creation_date,tags
0,2020-11-30 02:46:43.953,"[""|phonology|phonetics|""]"
1,2020-11-30 02:55:08.383,"[""|single-word-requests|american-english|vocab..."
2,2020-11-30 04:53:01.393,"[""|category|""]"
3,2020-11-30 07:16:46.067,"[""|phrases|idioms|quotes|proverbs|""]"
4,2020-11-30 08:17:44.307,"[""|grammar|word-usage|vocabulary|""]"
...,...,...
14228,2024-03-31 11:45:02.430,"[""|meaning|ambiguity|""]"
14229,2024-03-31 13:19:22.587,"[""|meaning|""]"
14230,2024-03-31 20:36:14.580,"[""|conjunctions|""]"
14231,2024-03-31 22:44:12.233,"[""|single-word-requests|phrase-requests|""]"


In [27]:
# erase the first '|'
df['tags'] = df['tags'].apply(lambda x: x[:2] + x[3:] if len(x) > 1 else x)
# erase the last '|'
df['tags'] = df['tags'].apply(
    lambda x: x[:-3] + x[-2:] if len(x) > 1 else x)
# convert the rest '|' with ","
df['tags'] = df['tags'].apply(lambda x: x.replace('|', '", '))

In [4]:
df

Unnamed: 0,creation_date,tags
0,2020-11-30 02:46:43.953,"[""phonology"", phonetics""]"
1,2020-11-30 02:55:08.383,"[""single-word-requests"", american-english"", vo..."
2,2020-11-30 04:53:01.393,"[""category""]"
3,2020-11-30 07:16:46.067,"[""phrases"", idioms"", quotes"", proverbs""]"
4,2020-11-30 08:17:44.307,"[""grammar"", word-usage"", vocabulary""]"
...,...,...
14228,2024-03-31 11:45:02.430,"[""meaning"", ambiguity""]"
14229,2024-03-31 13:19:22.587,"[""meaning""]"
14230,2024-03-31 20:36:14.580,"[""conjunctions""]"
14231,2024-03-31 22:44:12.233,"[""single-word-requests"", phrase-requests""]"


### 1) Preprocessing

In [28]:
def wc(text):
    """
    Cleaning function to be used with our first wordcloud
    """
    
    if text:
        tags = text.replace('><',' ')
        tags = tags.replace('-','')
        tags = tags.replace('.','DOT')
        tags = tags.replace('c++','Cpp')
        tags = tags.replace('c#','Csharp')
        tags = tags.replace('>','')
        return tags.replace('<','')
    else:
        return 'None'
    
def clean_tags(text):
    """
    Cleaning function for tags
    """
    
    if text:
        tags = text.replace('><',' ')
        tags = tags.replace('>','')
        return tags.replace('<','')
    else:
        return 'None'
    
def tag_freq(data):
    tags = data['tags'].str.replace('[\["\]]', '', regex=True)
    tags = [tag for i in tags.apply(lambda x: wc(x)) for tag in i.split(', ')]
    result = FreqDist(tags)
    return result

df['creation_date'] = pd.to_datetime(df['creation_date'])
df['year_month'] = df['creation_date'].dt.to_period('D')
df['year_month'] = df['year_month'].astype(str)
year_month = df.year_month.unique()

In [29]:
# Extract keys througout the whole data
all_keys = tag_freq(df)
all_keys = pd.DataFrame(all_keys, index = ['tag']).transpose().reset_index()

# compute tagShare on each month
for i in range(len(year_month)):
    data = df[df['year_month'] == year_month[i]]
    tags = tag_freq(data)
    tagCount = pd.DataFrame(tags, index = ['tag']).transpose().reset_index()
    tagShare = []
    for j in range(len(tagCount)):
        tagShare.append((tagCount['tag'][j] / tagCount['tag'].sum())*100)
    tagCount['tagShare'] = tagShare
    varName = year_month[i].replace('-', '_')
    tagCount = tagCount.rename(columns = {'tag':f'tag_{varName}','tagShare':f'tagShare_{varName}'})
    # merge here.
    all_keys = pd.merge(all_keys, tagCount, on = 'index', how = 'left')

### 2) Calculate HHI Index

In [7]:
# Define square_sum function
def square_sum(numbers):
    result = sum(x**2 for x in numbers if not math.isnan(x))
    return result
# Measure score
HHI_Score = []
# Calculate HHI for each monthly tag share column.
for i in range(3, all_keys.shape[1], 2):
    HHI_Score.append(square_sum(all_keys.iloc[:, i]))
result_HHI = pd.DataFrame({'year_month':year_month, 'HHI_Score':HHI_Score})

In [8]:
result_HHI

Unnamed: 0,year_month,HHI_Score
0,2020-11-30,495.537088
1,2020-12-01,238.751148
2,2020-12-02,287.603306
3,2020-12-03,366.942149
4,2020-12-04,312.213039
...,...,...
1213,2024-03-27,416.171225
1214,2024-03-28,781.250000
1215,2024-03-29,590.277778
1216,2024-03-30,1468.144044


In [9]:
result_HHI[(result_HHI['year_month'] > '2021-08-31') &
           (result_HHI['year_month'] < '2023-09-01')]

Unnamed: 0,year_month,HHI_Score
275,2021-09-01,373.961219
276,2021-09-02,665.224446
277,2021-09-03,560.000000
278,2021-09-04,859.375000
279,2021-09-05,623.818526
...,...,...
1000,2023-08-27,1200.000000
1001,2023-08-28,329.908058
1002,2023-08-29,702.947846
1003,2023-08-30,692.520776


In [16]:
all_keys['tag_2021_09_01']

0      NaN
1      NaN
2      1.0
3      NaN
4      NaN
      ... 
841    NaN
842    NaN
843    NaN
844    NaN
845    NaN
Name: tag_2021_09_01, Length: 846, dtype: float64

In [18]:
sum(all_keys['tag_2021_09_01'].isna())

815

### 3) Diff-in-Diff

In [10]:
hhi = list(result_HHI[(result_HHI['year_month'] > '2021-08-31') &
           (result_HHI['year_month'] < '2023-09-01')].reset_index().HHI_Score) # fixed datetime
# Split Data
control_data = pd.DataFrame({'HHI' : hhi[:365],
              'T_d': [0]*len(hhi[:365]),
              'P_t' : [0]*90 + [1]*275})
treated_data = pd.DataFrame({'HHI' : hhi[365:],
              'T_d': [1]*len(hhi[365:]),
              'P_t' : [0]*90 + [1]*275})
df_did = pd.concat([control_data, treated_data], axis = 0).reset_index(drop = True)
# Add date and month feature
df_did['date'] = result_HHI[(result_HHI['year_month'] > '2021-08-31') &
           (result_HHI['year_month'] < '2023-09-01')].reset_index().year_month
df_did['month'] = pd.to_datetime(df_did['date']).dt.month
# Apply log
df_did['ln_y'] = np.log(df_did['HHI'])

### 4) Model Fitting

In [11]:
sm.ols('ln_y ~ T_d + P_t + T_d * P_t + C(month)', df_did).fit().summary().tables[1]

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,6.3273,0.275,22.970,0.000,5.786,6.868
C(month)[T.2],0.0525,0.069,0.759,0.448,-0.083,0.188
C(month)[T.3],0.1146,0.067,1.701,0.089,-0.018,0.247
C(month)[T.4],0.0870,0.068,1.280,0.201,-0.046,0.220
C(month)[T.5],0.1429,0.067,2.120,0.034,0.011,0.275
C(month)[T.6],0.1611,0.068,2.372,0.018,0.028,0.295
C(month)[T.7],0.2509,0.067,3.722,0.000,0.119,0.383
C(month)[T.8],0.2442,0.067,3.624,0.000,0.112,0.377
C(month)[T.9],-0.0099,0.278,-0.036,0.972,-0.556,0.536


In [12]:
sm.ols('ln_y ~ T_d + P_t + T_d * P_t + C(month)', df_did).fit().summary()

0,1,2,3
Dep. Variable:,ln_y,R-squared:,0.057
Model:,OLS,Adj. R-squared:,0.039
Method:,Least Squares,F-statistic:,3.099
Date:,"Wed, 01 May 2024",Prob (F-statistic):,0.000104
Time:,12:59:23,Log-Likelihood:,-312.68
No. Observations:,730,AIC:,655.4
Df Residuals:,715,BIC:,724.3
Df Model:,14,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,6.3273,0.275,22.970,0.000,5.786,6.868
C(month)[T.2],0.0525,0.069,0.759,0.448,-0.083,0.188
C(month)[T.3],0.1146,0.067,1.701,0.089,-0.018,0.247
C(month)[T.4],0.0870,0.068,1.280,0.201,-0.046,0.220
C(month)[T.5],0.1429,0.067,2.120,0.034,0.011,0.275
C(month)[T.6],0.1611,0.068,2.372,0.018,0.028,0.295
C(month)[T.7],0.2509,0.067,3.722,0.000,0.119,0.383
C(month)[T.8],0.2442,0.067,3.624,0.000,0.112,0.377
C(month)[T.9],-0.0099,0.278,-0.036,0.972,-0.556,0.536

0,1,2,3
Omnibus:,103.379,Durbin-Watson:,1.98
Prob(Omnibus):,0.0,Jarque-Bera (JB):,179.529
Skew:,0.881,Prob(JB):,1.04e-39
Kurtosis:,4.672,Cond. No.,65.2
