# Diversity Measure using HHI Score (Arqade)

In [34]:
# Import Modules
import pandas as pd
import numpy as np
import sqlite3
import pandas as pd
from nltk import FreqDist
import pickle
import math
import matplotlib.pyplot as plt
import statsmodels.formula.api as sm

# Import Dataset
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
conn = sqlite3.connect('/data1/StackOverflow/_Robustness/Arqade/stack.db')
# The date filter is a plain comparison against the literal '2020-11-30';
# rows stamped on 2020-11-30 itself show up in the result (see the preview below).
query = '''
SELECT creation_date, tags
FROM questions
WHERE creation_date > '2020-11-30';
'''
try:
    df = pd.read_sql_query(query, conn)
finally:
    # Always release the database handle, even when the query fails.
    conn.close()

In [35]:
# Preview the frame returned by the query (columns: creation_date, tags).
df

Unnamed: 0,creation_date,tags
0,2020-11-30 03:33:24.340,"[""|pc|controllers|emulation|windows|""]"
1,2020-11-30 10:13:28.503,"[""|pokemon-sword-shield|nintendo-switch-online..."
2,2020-11-30 14:46:09.063,"[""|ps4|""]"
3,2020-11-30 16:47:24.630,"[""|minecraft-java-edition|minecraft-commands|""]"
4,2020-11-30 17:11:16.553,"[""|cookie-clicker|""]"
...,...,...
8345,2024-03-31 15:59:13.490,"[""|emulation|pokemon-fire-red-leaf-green|""]"
8346,2024-03-31 17:15:05.707,"[""|pikmin|""]"
8347,2024-03-31 17:48:58.123,"[""|pikmin|""]"
8348,2024-03-31 19:44:50.793,"[""|world-of-warcraft|""]"


In [36]:
def _pipes_to_list_text(raw):
    """
    Turn '["|a|b|c|"]' into '["a", b", c"]'.

    The leading '|' (position 2) and the trailing '|' (position -3) are
    dropped, and every remaining '|' becomes '", '.  Each deletion is only
    performed when the character at that position really is a '|', so
    re-running the cell on already-cleaned text leaves it untouched instead
    of deleting a character of the first/last tag.
    """
    if len(raw) > 1:
        # erase the first '|'
        if raw[2:3] == '|':
            raw = raw[:2] + raw[3:]
        # erase the last '|'
        if raw[-3:-2] == '|':
            raw = raw[:-3] + raw[-2:]
    # convert the remaining '|' separators to '", '
    return raw.replace('|', '", ')


df['tags'] = df['tags'].apply(_pipes_to_list_text)

In [37]:
# Re-display df to inspect the tag strings after the '|' clean-up.
df

Unnamed: 0,creation_date,tags
0,2020-11-30 03:33:24.340,"[""pc"", controllers"", emulation"", windows""]"
1,2020-11-30 10:13:28.503,"[""pokemon-sword-shield"", nintendo-switch-onlin..."
2,2020-11-30 14:46:09.063,"[""ps4""]"
3,2020-11-30 16:47:24.630,"[""minecraft-java-edition"", minecraft-commands""]"
4,2020-11-30 17:11:16.553,"[""cookie-clicker""]"
...,...,...
8345,2024-03-31 15:59:13.490,"[""emulation"", pokemon-fire-red-leaf-green""]"
8346,2024-03-31 17:15:05.707,"[""pikmin""]"
8347,2024-03-31 17:48:58.123,"[""pikmin""]"
8348,2024-03-31 19:44:50.793,"[""world-of-warcraft""]"


### 1) Preprocessing

In [38]:
def wc(text):
    """
    Normalise a tag string for the word cloud.

    Applies a fixed sequence of literal substitutions ('><' -> space,
    '-' removed, '.' -> 'DOT', 'c++' -> 'Cpp', 'c#' -> 'Csharp', then any
    remaining '>' / '<' removed).  Falsy input (empty string, None) yields
    the string 'None'.
    """
    if not text:
        return 'None'

    # Applied in order: '><' has to be handled before the lone '>' and '<'
    # are stripped, otherwise adjacent tags would be glued together.
    substitutions = (
        ('><', ' '),
        ('-', ''),
        ('.', 'DOT'),
        ('c++', 'Cpp'),
        ('c#', 'Csharp'),
        ('>', ''),
        ('<', ''),
    )
    cleaned = text
    for old, new in substitutions:
        cleaned = cleaned.replace(old, new)
    return cleaned
    
def clean_tags(text):
    """
    Strip angle-bracket tag markup from a tag string.

    '><' between adjacent tags becomes a single space and any remaining
    '>' / '<' characters are removed.  Falsy input (empty string, None)
    yields the string 'None'.
    """
    if not text:
        return 'None'
    # '><' first, so neighbouring tags stay separated by a space.
    return text.replace('><', ' ').replace('>', '').replace('<', '')
    
def tag_freq(data):
    """
    Count tag occurrences in ``data['tags']``.

    Every cell has its '[', ']' and '"' characters removed, is normalised
    with ``wc`` and is then split on ', ' into individual tags.

    Parameters
    ----------
    data : DataFrame with a string column named 'tags'.

    Returns
    -------
    FreqDist mapping each normalised tag to the number of times it occurs.
    """
    # Raw string: '\[' and '\]' are regex escapes, not Python string escapes
    # (the non-raw spelling triggers an invalid-escape warning on newer Pythons).
    tags = data['tags'].str.replace(r'[\["\]]', '', regex=True)
    tags = [tag for cell in tags.apply(wc) for tag in cell.split(', ')]
    return FreqDist(tags)

# Parse the timestamps, then label every question with its calendar day.
# Despite its name, `year_month` holds daily labels: to_period('D') followed by
# astype(str) produces one string per day.
df['creation_date'] = pd.to_datetime(df['creation_date'])
df['year_month'] = df['creation_date'].dt.to_period('D').astype(str)
# Distinct day labels, in order of first appearance in df.
year_month = df['year_month'].unique()

In [39]:
# Extract keys throughout the whole data: one row per tag seen anywhere in df.
# Columns after reset_index(): 'index' (the tag) and 'tag' (its overall count).
all_keys = tag_freq(df)
all_keys = pd.DataFrame(all_keys, index = ['tag']).transpose().reset_index()

# Compute tag counts and percentage shares for each day label in `year_month`
# (the labels are calendar days, see how `year_month` is built).
daily_frames = []
for day in year_month:
    day_counts = pd.DataFrame(
        tag_freq(df[df['year_month'] == day]), index = ['tag']
    ).transpose()
    # Share of each tag among all tag occurrences of that day, in percent.
    day_counts['tagShare'] = (day_counts['tag'] / day_counts['tag'].sum()) * 100
    varName = day.replace('-', '_')
    day_counts = day_counts.rename(
        columns = {'tag': f'tag_{varName}', 'tagShare': f'tagShare_{varName}'})
    # Align on the global tag list; tags absent on this day become NaN,
    # exactly as a left merge on the tag would produce.
    daily_frames.append(day_counts.reindex(all_keys['index']))

# Glue everything together once instead of merging inside the loop.
# Resulting layout: index, tag, tag_<day1>, tagShare_<day1>, tag_<day2>, ...
all_keys = pd.concat(
    [all_keys.set_index('index')] + daily_frames, axis = 1
).reset_index()

### 2) Calculate HHI Index

In [118]:
# Define square_sum function
def square_sum(numbers):
    """Return the sum of the squares of ``numbers``, ignoring NaN entries."""
    squares = [value ** 2 for value in numbers if not math.isnan(value)]
    return sum(squares)
# Measure score
# Column 3 and every second column after it hold one day's tag shares
# (layout: index, tag, tag_<day>, tagShare_<day>, ...), so the HHI of a day
# is the sum of squares of that column, NaN (tag absent that day) skipped.
share_positions = range(3, all_keys.shape[1], 2)
HHI_Score = [square_sum(all_keys.iloc[:, position]) for position in share_positions]
result_HHI = pd.DataFrame({'year_month': year_month, 'HHI_Score': HHI_Score})

In [119]:
# One row per distinct `year_month` label with its HHI score.
result_HHI

Unnamed: 0,year_month,HHI_Score
0,2020-11-30,769.230769
1,2020-12-01,987.654321
2,2020-12-02,750.000000
3,2020-12-03,1597.633136
4,2020-12-04,586.419753
...,...,...
1206,2024-03-27,1428.571429
1207,2024-03-28,2000.000000
1208,2024-03-29,3125.000000
1209,2024-03-30,5000.000000


In [120]:
# Rows strictly between '2021-08-31' and '2023-09-01'. `year_month` holds
# strings, so these are lexicographic string comparisons.
result_HHI[(result_HHI['year_month'] > '2021-08-31') &
           (result_HHI['year_month'] < '2023-09-01')]

Unnamed: 0,year_month,HHI_Score
275,2021-09-01,1800.000000
276,2021-09-02,1005.917160
277,2021-09-03,1111.111111
278,2021-09-04,1200.000000
279,2021-09-05,1428.571429
...,...,...
995,2023-08-27,5000.000000
996,2023-08-28,1428.571429
997,2023-08-29,1600.000000
998,2023-08-30,2000.000000


In [121]:
# Create the full date range
start_date = '2020-11-30'
end_date = '2023-09-03'
full_date_range = pd.date_range(start=start_date, end=end_date, freq='D')

# Identify missing dates
# `year_month` holds strings, so parse them explicitly: the set difference
# then compares datetimes with datetimes instead of relying on implicit
# string coercion inside DatetimeIndex.difference.
observed_dates = pd.DatetimeIndex(pd.to_datetime(result_HHI['year_month']))
missing_dates = full_date_range.difference(observed_dates)

In [122]:
# Days of full_date_range that have no row in result_HHI.
missing_dates

DatetimeIndex(['2022-05-12', '2022-06-25', '2023-04-20', '2023-05-27',
               '2023-07-16'],
              dtype='datetime64[ns]', freq=None)

In [123]:
# HHI score of 2022-05-11, the day before the first missing date (2022-05-12):
# np.where(...)[0][0] is the position of the first matching row.
result_HHI.iloc[np.where(result_HHI['year_month'] == '2022-05-11')[0][0]]['HHI_Score']


3333.3333333333326

In [124]:
# Missing Value Imputation
# Every day without a score inherits the score of the closest earlier day
# (for an isolated gap that is simply the previous day). The gap days are
# taken from `missing_dates` rather than typed in by hand.
score_by_day = result_HHI.set_index('year_month')['HHI_Score']

# 'YYYY-MM-DD' labels of the gaps; days that already have a score are dropped
# so the frame never duplicates an existing row.
gap_days = pd.Index(
    pd.DatetimeIndex(missing_dates).strftime('%Y-%m-%d')
).difference(score_by_day.index)

# Forward-fill along the sorted timeline (ISO date strings sort chronologically).
timeline = score_by_day.index.union(gap_days).sort_values()
filled_scores = score_by_day.reindex(timeline).ffill()

df_impute = pd.DataFrame({
    'year_month': list(gap_days),
    'HHI_Score': filled_scores.loc[gap_days].to_numpy(),
})

In [125]:
# Append the imputed days and restore chronological order.
# drop_duplicates keeps this cell idempotent: re-running it would otherwise
# append the imputed rows a second time. On the first run there is nothing to
# drop, so the result (including the original row labels) is unchanged.
result_HHI = (
    pd.concat([result_HHI, df_impute])
    .drop_duplicates(subset='year_month', keep='first')
    .sort_values('year_month')
)

In [126]:
# result_HHI after appending the imputed rows and sorting by year_month
# (row labels are not reset by that step).
result_HHI

Unnamed: 0,year_month,HHI_Score
0,2020-11-30,769.230769
1,2020-12-01,987.654321
2,2020-12-02,750.000000
3,2020-12-03,1597.633136
4,2020-12-04,586.419753
...,...,...
1206,2024-03-27,1428.571429
1207,2024-03-28,2000.000000
1208,2024-03-29,3125.000000
1209,2024-03-30,5000.000000


### 3) Diff-in-Diff

In [127]:
# Study window: labels strictly between '2021-08-31' and '2023-09-01'.
in_window = ((result_HHI['year_month'] > '2021-08-31') &
             (result_HHI['year_month'] < '2023-09-01'))
window = result_HHI[in_window].reset_index()
hhi = window['HHI_Score'].tolist() # fixed datetime

# Split Data
# First 365 observations form the control group (T_d = 0), the remainder the
# treated group (T_d = 1). Inside each group the first 90 rows are flagged
# P_t = 0 and the following 275 rows P_t = 1.
first_block, second_block = hhi[:365], hhi[365:]
period_flags = [0]*90 + [1]*275
control_data = pd.DataFrame({'HHI' : first_block,
              'T_d': [0]*len(first_block),
              'P_t' : period_flags})
treated_data = pd.DataFrame({'HHI' : second_block,
              'T_d': [1]*len(second_block),
              'P_t' : period_flags})
df_did = pd.concat([control_data, treated_data], axis = 0).reset_index(drop = True)

# Add date and month feature (aligned on the fresh 0..n-1 row labels)
df_did['date'] = window['year_month']
df_did['month'] = pd.to_datetime(df_did['date']).dt.month

# Apply log
df_did['ln_y'] = np.log(df_did['HHI'])

### 4) Model Fitting

In [128]:
# Fit the difference-in-differences OLS on log HHI with month dummies and show
# only the coefficient table (tables[1] of the summary).
# `sm` is statsmodels.formula.api here (see the import cell).
# NOTE(review): in formula syntax `T_d * P_t` should already expand to both main
# effects plus the interaction, making the explicit `T_d + P_t` redundant — confirm.
sm.ols('ln_y ~ T_d + P_t + T_d * P_t + C(month)', df_did).fit().summary().tables[1]

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,6.9843,0.391,17.874,0.000,6.217,7.751
C(month)[T.2],0.1693,0.098,1.726,0.085,-0.023,0.362
C(month)[T.3],0.1432,0.096,1.498,0.135,-0.044,0.331
C(month)[T.4],0.1153,0.096,1.196,0.232,-0.074,0.305
C(month)[T.5],0.3396,0.096,3.552,0.000,0.152,0.527
C(month)[T.6],0.3823,0.096,3.966,0.000,0.193,0.571
C(month)[T.7],0.2488,0.096,2.603,0.009,0.061,0.436
C(month)[T.8],0.3331,0.096,3.484,0.001,0.145,0.521
C(month)[T.9],0.1638,0.395,0.415,0.678,-0.611,0.939


In [129]:
# Same model specification as the previous cell, refitted to display the full
# summary (fit statistics, coefficients, residual diagnostics).
sm.ols('ln_y ~ T_d + P_t + T_d * P_t + C(month)', df_did).fit().summary()

0,1,2,3
Dep. Variable:,ln_y,R-squared:,0.123
Model:,OLS,Adj. R-squared:,0.106
Method:,Least Squares,F-statistic:,7.164
Date:,"Wed, 01 May 2024",Prob (F-statistic):,5.03e-14
Time:,12:55:58,Log-Likelihood:,-567.9
No. Observations:,730,AIC:,1166.0
Df Residuals:,715,BIC:,1235.0
Df Model:,14,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,6.9843,0.391,17.874,0.000,6.217,7.751
C(month)[T.2],0.1693,0.098,1.726,0.085,-0.023,0.362
C(month)[T.3],0.1432,0.096,1.498,0.135,-0.044,0.331
C(month)[T.4],0.1153,0.096,1.196,0.232,-0.074,0.305
C(month)[T.5],0.3396,0.096,3.552,0.000,0.152,0.527
C(month)[T.6],0.3823,0.096,3.966,0.000,0.193,0.571
C(month)[T.7],0.2488,0.096,2.603,0.009,0.061,0.436
C(month)[T.8],0.3331,0.096,3.484,0.001,0.145,0.521
C(month)[T.9],0.1638,0.395,0.415,0.678,-0.611,0.939

0,1,2,3
Omnibus:,53.181,Durbin-Watson:,2.099
Prob(Omnibus):,0.0,Jarque-Bera (JB):,63.111
Skew:,0.685,Prob(JB):,1.98e-14
Kurtosis:,3.443,Cond. No.,65.2
