In [1]:
# Import Modules
import pandas as pd
import numpy as np
import sqlite3
from nltk import FreqDist
import pickle
import math
import matplotlib.pyplot as plt
import statsmodels.formula.api as sm
# Load the pre-built tag table used by the preprocessing step below.
# NOTE: pickle.load can execute arbitrary code embedded in the file, so only
# load pickles produced by a trusted pipeline.
with open('/data1/StackOverflow/Tag_Analysis/df_tags_2023.pickle', 'rb') as fr:
    df_tags = pickle.load(fr)
def wc(text):
    """
    Normalise a raw tag string for the first wordcloud.

    Falsy input (``None``, empty string) yields the literal string 'None'.
    Otherwise the substitutions below are applied one after another, in
    order, and the resulting string is returned.
    """
    if not text:
        return 'None'

    # Order matters: '><' must become a space before the lone angle brackets
    # are stripped, and 'c++' / 'c#' are rewritten after '-' and '.' handling.
    substitutions = (
        ('><', ' '),
        ('-', ''),
        ('.', 'DOT'),
        ('c++', 'Cpp'),
        ('c#', 'Csharp'),
        ('>', ''),
        ('<', ''),
    )
    cleaned = text
    for old, new in substitutions:
        cleaned = cleaned.replace(old, new)
    return cleaned
    
def clean_tags(text):
    """
    Turn an angle-bracket tag string into space-separated tag names.

    Adjacent tags ('><') are separated by a single space and any remaining
    '<' / '>' characters are removed. Falsy input returns the string 'None'.
    """
    if not text:
        return 'None'
    return text.replace('><', ' ').replace('>', '').replace('<', '')
    
def tag_freq(data):
    """
    Build a frequency distribution of cleaned tags from ``data['tags']``.

    Each entry of the 'tags' column has square brackets and double quotes
    stripped, is normalised with ``wc``, and is then split on ', ' into
    individual tags. (Presumably the column holds list-like strings such as
    '["a", "b"]' -- confirm against the data source.)

    Parameters
    ----------
    data : mapping / DataFrame exposing a string-valued 'tags' column.

    Returns
    -------
    nltk.FreqDist counting every tag token.
    """
    # Raw string: '\[' and '\]' are regex escapes, not Python string escapes.
    # The non-raw spelling triggers an invalid-escape SyntaxWarning on
    # Python 3.12+ while producing the same pattern.
    stripped = data['tags'].str.replace(r'[\["\]]', '', regex=True)
    tags = [tag for cleaned in stripped.apply(wc) for tag in cleaned.split(', ')]
    return FreqDist(tags)
# Preprocessing: parse the timestamps and derive a per-day string label.
df_tags['creation_date'] = pd.to_datetime(df_tags['creation_date'])
# Despite the column name, the period frequency is daily ('D'), so each label
# is a calendar day rendered as a string.
df_tags['year_month'] = df_tags['creation_date'].dt.to_period('D').astype(str)
# Distinct day labels in order of first appearance in df_tags.
year_month = df_tags['year_month'].unique()

In [2]:
# Load the pre-computed daily tag-share table (a DataFrame, despite the name).
# NOTE: pickle.load can execute arbitrary code embedded in the file, so only
# load pickles produced by a trusted pipeline.
with open('/data1/StackOverflow/diff_in_diff/daily_tagShare_modified.pickle', 'rb') as fr:
    all_keys = pickle.load(fr)

In [3]:
# Inspect the loaded table. Per the rendered output: one row per tag, two
# leading columns ('index', 'tag'), then a tag_<date> / tagShare_<date>
# column pair per day, with NaN where a tag has no value for that day.
all_keys

Unnamed: 0,index,tag,tag_2020_11_30,tagShare_2020_11_30,tag_2020_12_01,tagShare_2020_12_01,tag_2020_12_02,tagShare_2020_12_02,tag_2020_12_03,tagShare_2020_12_03,...,tag_2023_08_30,tagShare_2023_08_30,tag_2023_08_31,tagShare_2023_08_31,tag_2023_09_01,tagShare_2023_09_01,tag_2023_09_02,tagShare_2023_09_02,tag_2023_09_03,tagShare_2023_09_03
0,amazonwebservices,52132,47.0,0.315352,59.0,0.368819,57.0,0.352505,86.0,0.493657,...,47.0,0.409087,57.0,0.503890,38.0,0.389744,25.0,0.41625,13.0,0.689655
1,forloop,16939,19.0,0.127483,21.0,0.131275,27.0,0.166976,32.0,0.183686,...,11.0,0.095744,4.0,0.035361,4.0,0.041026,3.0,0.04995,3.0,0.159151
2,foreach,3614,9.0,0.060386,5.0,0.031256,3.0,0.018553,6.0,0.034441,...,3.0,0.026112,2.0,0.017680,1.0,0.010256,1.0,0.01665,,
3,terraform,11491,13.0,0.087225,12.0,0.075014,10.0,0.061843,8.0,0.045922,...,13.0,0.113152,17.0,0.150283,13.0,0.133333,2.0,0.03330,,
4,awsglue,2203,3.0,0.020129,5.0,0.031256,2.0,0.012369,1.0,0.005740,...,2.0,0.017408,,,2.0,0.020513,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51948,applespeech,1,,,,,,,,,...,,,,,,,,,1.0,0.053050
51949,jsonloader,1,,,,,,,,,...,,,,,,,,,1.0,0.053050
51950,livescript,1,,,,,,,,,...,,,,,,,,,1.0,0.053050
51951,nxhtml,1,,,,,,,,,...,,,,,,,,,1.0,0.053050


### Gini Coefficient

In [4]:
def calculate_gini(shares):
    """
    Gini coefficient of a collection of values.

    Computes  sum_i sum_j |x_i - x_j| / (2 * n * sum(x))  using the
    sorted-rank identity

        sum_i sum_j |x_i - x_j| = 2 * sum_i (2*i - n - 1) * x_(i)

    (x_(i) the i-th smallest value, i starting at 1), so the cost is
    O(n log n) for the sort instead of the O(n^2) pairwise double loop.

    Parameters
    ----------
    shares : iterable of numbers (NaNs should be removed by the caller).

    Returns
    -------
    The Gini coefficient, or NaN when the input is empty or sums to zero
    (the coefficient is undefined there; previously this divided by zero).
    """
    ordered = sorted(shares)
    size = len(ordered)
    total_sum = sum(ordered)
    if size == 0 or total_sum == 0:
        return float('nan')
    # Rank-weighted sum equals half of the pairwise absolute-difference total.
    weighted = sum((2 * rank - size - 1) * value
                   for rank, value in enumerate(ordered, start=1))
    return weighted / (size * total_sum)

In [17]:
# Daily Gini coefficient of the tag shares.
# Select the share columns by name rather than by a positional stride, and
# take each day's label from its own column name ('tagShare_YYYY_MM_DD' ->
# 'YYYY-MM-DD'). Labels then cannot drift out of alignment with the values,
# which could happen when they were taken positionally from a different
# frame (df_tags) whose ordering/coverage is not guaranteed to match.
share_prefix = 'tagShare_'
share_cols = [col for col in all_keys.columns if str(col).startswith(share_prefix)]
# Tags with no value for a given day are NaN and are excluded for that day.
Gini_coeff = [calculate_gini(all_keys[col].dropna()) for col in share_cols]
share_dates = [str(col)[len(share_prefix):].replace('_', '-') for col in share_cols]
# Column name 'year_month' is kept for downstream cells; values are daily labels.
result_coeff = pd.DataFrame({'year_month': share_dates, 'Gini_coeff': Gini_coeff})

In [27]:
#with open('gini_coeff.pkl', "wb") as file:
#        pickle.dump(result_coeff, file)

In [18]:
# Restrict the daily series to the study window (fixed dates). The labels are
# strings, so these are lexicographic comparisons on the date text.
in_window = ((result_coeff['year_month'] > '2021-08-31') &
             (result_coeff['year_month'] < '2023-09-01'))
window = result_coeff[in_window].reset_index()
# NOTE: despite the names `entropy` and 'HHI', the values are Gini coefficients.
entropy = list(window.Gini_coeff)

# Split Data: the first 365 entries form the control group (T_d = 0), the
# remainder the treated group (T_d = 1). Within each group P_t is 0 for the
# first 90 entries and 1 for the following 275.
first_block = entropy[:365]
second_block = entropy[365:]
period_flags = [0]*90 + [1]*275
control_data = pd.DataFrame({'HHI': first_block,
                             'T_d': [0]*len(first_block),
                             'P_t': period_flags})
treated_data = pd.DataFrame({'HHI': second_block,
                             'T_d': [1]*len(second_block),
                             'P_t': period_flags})
df_did = pd.concat([control_data, treated_data], ignore_index=True)

# Add date and month feature (aligned on the fresh 0..n-1 index of both frames)
df_did['date'] = window.year_month
df_did['month'] = pd.to_datetime(df_did['date']).dt.month

# Apply log
df_did['ln_y'] = np.log(df_did['HHI'])

In [25]:
# Difference-in-differences regression of ln_y on T_d, P_t, their interaction
# and month dummies (C(month)), fitted by OLS with HC3 covariance; only the
# coefficient table (tables[1]) of the summary is displayed.
# Note: in the formula, `T_d * P_t` already expands to T_d + P_t + T_d:P_t,
# so the explicit main-effect terms are redundant but harmless.
sm.ols('ln_y ~ T_d + P_t + T_d * P_t + C(month)', df_did).fit(cov_type='HC3').summary().tables[1]

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-0.4484,0.054,-8.296,0.000,-0.554,-0.342
C(month)[T.2],-0.0024,0.006,-0.383,0.702,-0.015,0.010
C(month)[T.3],-0.0108,0.006,-1.813,0.070,-0.022,0.001
C(month)[T.4],-0.0298,0.006,-4.779,0.000,-0.042,-0.018
C(month)[T.5],-0.0390,0.007,-5.988,0.000,-0.052,-0.026
C(month)[T.6],-0.0428,0.007,-6.348,0.000,-0.056,-0.030
C(month)[T.7],-0.0471,0.007,-6.905,0.000,-0.060,-0.034
C(month)[T.8],-0.0428,0.007,-6.546,0.000,-0.056,-0.030
C(month)[T.9],0.0325,0.054,0.601,0.548,-0.074,0.139


In [22]:
# Fit the difference-in-differences OLS model (ln_y on T_d, P_t, their
# interaction and month dummies, HC3 covariance) and display the full summary,
# including fit statistics and residual diagnostics.
# NOTE(review): the summary below reports Durbin-Watson of about 0.85; HC3 only
# addresses heteroskedasticity -- consider whether serial correlation in the
# daily series calls for HAC standard errors. TODO confirm.
sm.ols('ln_y ~ T_d + P_t + T_d * P_t + C(month)', df_did).fit(cov_type='HC3').summary()

0,1,2,3
Dep. Variable:,ln_y,R-squared:,0.609
Model:,OLS,Adj. R-squared:,0.601
Method:,Least Squares,F-statistic:,66.73
Date:,"Mon, 20 May 2024",Prob (F-statistic):,1.7599999999999998e-119
Time:,14:14:04,Log-Likelihood:,1446.0
No. Observations:,730,AIC:,-2862.0
Df Residuals:,715,BIC:,-2793.0
Df Model:,14,,
Covariance Type:,HC3,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-0.4484,0.054,-8.296,0.000,-0.554,-0.342
C(month)[T.2],-0.0024,0.006,-0.383,0.702,-0.015,0.010
C(month)[T.3],-0.0108,0.006,-1.813,0.070,-0.022,0.001
C(month)[T.4],-0.0298,0.006,-4.779,0.000,-0.042,-0.018
C(month)[T.5],-0.0390,0.007,-5.988,0.000,-0.052,-0.026
C(month)[T.6],-0.0428,0.007,-6.348,0.000,-0.056,-0.030
C(month)[T.7],-0.0471,0.007,-6.905,0.000,-0.060,-0.034
C(month)[T.8],-0.0428,0.007,-6.546,0.000,-0.056,-0.030
C(month)[T.9],0.0325,0.054,0.601,0.548,-0.074,0.139

0,1,2,3
Omnibus:,47.719,Durbin-Watson:,0.848
Prob(Omnibus):,0.0,Jarque-Bera (JB):,56.097
Skew:,-0.676,Prob(JB):,6.59e-13
Kurtosis:,2.865,Cond. No.,65.2
