### Number of Posts (Questions and Answers)
- Reference : diff_in_diff/numPosts.ipynb

In [2]:
# import modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.formula.api as sm
import sqlite3
# Import Dataset
# Pull every question and answer created on or after 2021-09-01 from the local
# StackOverflow SQLite database. The upper bound of the study window is applied
# later, after the daily aggregation.
DB_PATH = '/data1/StackOverflow/stackexchange-to-sqlite/stack.db'

query = '''
SELECT id, creation_date, views
FROM questions
WHERE creation_date >= '2021-09-01';
'''
query2 = '''
SELECT id, creation_date
FROM answers
WHERE creation_date >= '2021-09-01';
'''

conn = sqlite3.connect(DB_PATH)
try:
    questions = pd.read_sql_query(query, conn)
    answers = pd.read_sql_query(query2, conn)
finally:
    # Always release the database handle, even when one of the reads fails;
    # otherwise a failed cell leaves the connection open in the kernel.
    conn.close()

In [5]:
# Convert types -- the same normalisation is applied to questions and answers:
#   id             -> string
#   creation_date  -> datetime
#   year_month_day -> daily period of creation_date
#   year_month     -> monthly period of creation_date
for posts in (questions, answers):
    posts['id'] = posts['id'].astype(str)
    posts['creation_date'] = pd.to_datetime(posts['creation_date'])
    created = posts['creation_date']
    posts['year_month_day'] = created.dt.to_period('D')  # daily
    posts['year_month'] = created.dt.to_period('M')  # monthly

In [6]:
# Daily Aggregation: number of questions / answers created on each calendar day
daily_q = (
    questions
    .groupby('year_month_day')
    .size()
    .rename('count_q')
    .reset_index()
)
daily_a = (
    answers
    .groupby('year_month_day')
    .size()
    .rename('count_a')
    .reset_index()
)
# Merge (default inner join: only days present in both tables are kept)
df_merge = daily_q.merge(daily_a, on='year_month_day')

In [10]:
# Inspect the merged daily counts (one row per day: count_q questions, count_a answers)
df_merge

Unnamed: 0,year_month_day,count_q,count_a
0,2021-09-01,4760,6285
1,2021-09-02,4859,6423
2,2021-09-03,4145,5524
3,2021-09-04,2456,3257
4,2021-09-05,2446,3380
...,...,...,...
728,2023-08-30,3773,3193
729,2023-08-31,3674,3135
730,2023-09-01,3133,2805
731,2023-09-02,1942,1671


### Difference-in-Differences (DiD) Setting

In [11]:
# Only use data before Sep 1, 2023, and renumber the rows 0..n-1 so the frame
# lines up row-for-row with the indicator frame built below.
df_merge = (
    df_merge[df_merge['year_month_day'] < pd.Period('2023-09-01', freq='D')]
    .reset_index(drop=True)
)

# Design, expressed in days elapsed since 2021-09-01:
#   T_d = 0 for the first 365 days (control year), 1 for the next 365 (treated year)
#   P_t = 0 for the first 90 days of each year (pre), 1 for the remaining 275 (post)
# The flags are derived from each row's calendar date instead of its position,
# so a day missing from df_merge neither breaks the construction nor shifts the
# labels of the rows that follow it.
DAYS_PER_YEAR = 365
PRE_PERIOD_DAYS = 90
day_offset = (
    df_merge['year_month_day'].dt.start_time - pd.Timestamp('2021-09-01')
).dt.days

# Fail loudly if any row falls outside the two 365-day windows.
assert day_offset.between(0, 2 * DAYS_PER_YEAR - 1).all(), 'dates outside the study window'

df_did = pd.DataFrame({
    'T_d': (day_offset >= DAYS_PER_YEAR).astype(int),
    'P_t': ((day_offset % DAYS_PER_YEAR) >= PRE_PERIOD_DAYS).astype(int),
})
# Per-group views, kept under their original names.
control_data = df_did[df_did['T_d'] == 0].reset_index(drop=True)
treated_data = df_did[df_did['T_d'] == 1].reset_index(drop=True)

In [13]:
# Attach the DiD indicators to the daily counts. Both frames hold one row per
# day in the same order, so they are joined by position: resetting the indexes
# first keeps pd.concat(axis=1) from aligning on mismatched labels and silently
# padding with NaN.
assert len(df_merge) == len(df_did), 'df_merge and df_did must have the same number of rows'
DiD = pd.concat(
    [df_merge.reset_index(drop=True), df_did.reset_index(drop=True)],
    axis=1,
)
# Log-transform the daily counts; these are the outcomes of the regressions below.
DiD['ln_q'] = np.log(DiD['count_q'])
DiD['ln_a'] = np.log(DiD['count_a'])
DiD

Unnamed: 0,year_month_day,count_q,count_a,T_d,P_t,ln_q,ln_a
0,2021-09-01,4760,6285,0,0,8.468003,8.745921
1,2021-09-02,4859,6423,0,0,8.488588,8.767641
2,2021-09-03,4145,5524,0,0,8.329658,8.616858
3,2021-09-04,2456,3257,0,0,7.806289,8.088562
4,2021-09-05,2446,3380,0,0,7.802209,8.125631
...,...,...,...,...,...,...,...
725,2023-08-27,1877,1602,1,1,7.537430,7.379008
726,2023-08-28,3283,2905,1,1,8.096513,7.974189
727,2023-08-29,3556,3052,1,1,8.176392,8.023552
728,2023-08-30,3773,3193,1,1,8.235626,8.068716


### Model Fitting

In [14]:
# Questions: OLS on log daily question counts with HC3 robust standard errors.
# Only the coefficient table is shown; the T_d:P_t row is the interaction term.
questions_model = sm.ols(formula='ln_q ~ T_d + P_t + T_d * P_t', data=DiD)
questions_fit = questions_model.fit(cov_type='HC3')
questions_fit.summary().tables[1]

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,8.2528,0.028,297.286,0.000,8.198,8.307
T_d,0.0990,0.040,2.462,0.014,0.020,0.178
P_t,-0.0343,0.032,-1.067,0.286,-0.097,0.029
T_d:P_t,-0.2642,0.048,-5.533,0.000,-0.358,-0.171


In [15]:
# Full regression report for the questions model (fit statistics and residual
# diagnostics in addition to the coefficient table).
questions_model = sm.ols(formula='ln_q ~ T_d + P_t + T_d * P_t', data=DiD)
questions_fit = questions_model.fit(cov_type='HC3')
questions_fit.summary()

0,1,2,3
Dep. Variable:,ln_q,R-squared:,0.112
Model:,OLS,Adj. R-squared:,0.109
Method:,Least Squares,F-statistic:,28.41
Date:,"Mon, 15 Apr 2024",Prob (F-statistic):,2.25e-17
Time:,12:39:32,Log-Likelihood:,-140.11
No. Observations:,730,AIC:,288.2
Df Residuals:,726,BIC:,306.6
Df Model:,3,,
Covariance Type:,HC3,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,8.2528,0.028,297.286,0.000,8.198,8.307
T_d,0.0990,0.040,2.462,0.014,0.020,0.178
P_t,-0.0343,0.032,-1.067,0.286,-0.097,0.029
T_d:P_t,-0.2642,0.048,-5.533,0.000,-0.358,-0.171

0,1,2,3
Omnibus:,81.997,Durbin-Watson:,0.978
Prob(Omnibus):,0.0,Jarque-Bera (JB):,76.015
Skew:,-0.719,Prob(JB):,3.12e-17
Kurtosis:,2.343,Cond. No.,9.96


In [16]:
# Answers: OLS on log daily answer counts with HC3 robust standard errors.
# Only the coefficient table is shown; the T_d:P_t row is the interaction term.
answers_model = sm.ols(formula='ln_a ~ T_d + P_t + T_d * P_t', data=DiD)
answers_fit = answers_model.fit(cov_type='HC3')
answers_fit.summary().tables[1]

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,8.5096,0.027,319.084,0.000,8.457,8.562
T_d,-0.1077,0.038,-2.827,0.005,-0.182,-0.033
P_t,-0.0485,0.031,-1.574,0.115,-0.109,0.012
T_d:P_t,-0.2979,0.046,-6.513,0.000,-0.388,-0.208


In [17]:
# Full regression report for the answers model (fit statistics and residual
# diagnostics in addition to the coefficient table).
answers_model = sm.ols(formula='ln_a ~ T_d + P_t + T_d * P_t', data=DiD)
answers_fit = answers_model.fit(cov_type='HC3')
answers_fit.summary()

0,1,2,3
Dep. Variable:,ln_a,R-squared:,0.323
Model:,OLS,Adj. R-squared:,0.32
Method:,Least Squares,F-statistic:,102.2
Date:,"Mon, 15 Apr 2024",Prob (F-statistic):,3.55e-55
Time:,12:39:59,Log-Likelihood:,-121.27
No. Observations:,730,AIC:,250.5
Df Residuals:,726,BIC:,268.9
Df Model:,3,,
Covariance Type:,HC3,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,8.5096,0.027,319.084,0.000,8.457,8.562
T_d,-0.1077,0.038,-2.827,0.005,-0.182,-0.033
P_t,-0.0485,0.031,-1.574,0.115,-0.109,0.012
T_d:P_t,-0.2979,0.046,-6.513,0.000,-0.388,-0.208

0,1,2,3
Omnibus:,64.059,Durbin-Watson:,0.934
Prob(Omnibus):,0.0,Jarque-Bera (JB):,64.609
Skew:,-0.676,Prob(JB):,9.34e-15
Kurtosis:,2.454,Cond. No.,9.96


In [None]:
# Fixed Effect (optional robustness check, currently disabled): add month-of-year dummies
#DiD['month'] = DiD['year_month_day'].dt.month
#sm.ols('ln_q ~ T_d + P_t + T_d * P_t + C(month)', DiD).fit(cov_type = 'HC3').summary()
#sm.ols('ln_a ~ T_d + P_t + T_d * P_t + C(month)', DiD).fit(cov_type = 'HC3').summary()