# Compute Techiness without Outliers
- Outliers: ln_entropy 계산 간 발생. (too small volume of counts in some cluster)

### 1) Load Data

In [1]:
import pandas as pd
import pickle

# Load Data (preChatGPT)
with open(file = 'ques_df_pre.pickle', mode = 'rb') as file:
    ques_df = pickle.load(file)
with open(file = 'ans_df_pre.pickle', mode = 'rb') as file:
    ans_df = pickle.load(file)

In [24]:
ques_df['creation_date'] = pd.to_datetime(ques_df['creation_date'])
ques_df['year_month_day'] = ques_df['creation_date'].dt.to_period('D') # daily
daily_q = ques_df.groupby(['year_month_day', 'community']).size().reset_index(name = 'count_q')

ans_df['creation_date'] = pd.to_datetime(ans_df['creation_date'])
ans_df['year_month_day'] = ans_df['creation_date'].dt.to_period('D') # daily
daily_a = ans_df.groupby(['year_month_day', 'community']).size().reset_index(name = 'count_a')

df_merge = pd.merge(daily_q, daily_a, on = ['year_month_day', 'community'])
df_merge['year_month_day'] = df_merge['year_month_day'].astype(str)

In [25]:
df_merge

Unnamed: 0,year_month_day,community,count_q,count_a
0,2021-09-01,0,1213,1029
1,2021-09-01,1,401,225
2,2021-09-01,3,394,250
3,2021-09-01,4,368,305
4,2021-09-01,5,378,258
...,...,...,...,...
10746,2023-08-31,11,18,12
10747,2023-08-31,13,29,18
10748,2023-08-31,14,8,1
10749,2023-08-31,15,5,4


In [30]:
df_merge[df_merge['year_month_day'] == '2021-09-01'] # there are some missing communities as they are zero.

Unnamed: 0,year_month_day,community,count_q,count_a
0,2021-09-01,0,1213,1029
1,2021-09-01,1,401,225
2,2021-09-01,3,394,250
3,2021-09-01,4,368,305
4,2021-09-01,5,378,258
5,2021-09-01,6,470,405
6,2021-09-01,7,231,126
7,2021-09-01,8,103,79
8,2021-09-01,9,35,16
9,2021-09-01,10,5,2


In [26]:
# 1) Calculate Code Ratio (code_body / full_body)
ques_df['techiness'] = ques_df['code_body'].apply(len) / ques_df['full_body'].apply(len)
# 2) Community Aggregation
agg_q_monthly = ques_df.groupby('community')['techiness'].mean().reset_index()
# Copy the data vertically 730 times (copy for each day)
agg_q_monthly = pd.concat([agg_q_monthly] * 730, ignore_index=True) # this should be 13140
agg_q_monthly

Unnamed: 0,community,techiness
0,0,0.525040
1,1,0.412791
2,2,0.127386
3,3,0.490224
4,4,0.439161
...,...,...
13135,13,0.421145
13136,14,0.403302
13137,15,0.307775
13138,16,0.337905


In [27]:
# Load entropy data
with open(file = '/data1/StackOverflow/_Robustness/TagCluster/ent_Score.pickle', mode = 'rb') as file:
    ent_Score = pickle.load(file)
# Merge Data
agg_q_monthly['entropy'] = ent_Score
agg_q_monthly

Unnamed: 0,community,techiness,entropy
0,0,0.525040,65.454193
1,1,0.412791,75.610966
2,2,0.127386,0.000000
3,3,0.490224,58.728172
4,4,0.439161,106.606754
...,...,...,...
13135,13,0.421145,-111.576775
13136,14,0.403302,-329.915781
13137,15,0.307775,-369.615341
13138,16,0.337905,0.000000


### 2) Load Pre Computed DiD Tables

In [45]:
# Load pre-computed did tables
post = pd.read_csv("/data1/StackOverflow/diff_in_diff/numPosts_did.csv")
entropy = pd.read_csv('/data1/StackOverflow/_Robustness/entropy_did.csv')
lsm = pd.read_csv('/data1/StackOverflow/diff_in_diff/lsm_did2_modified.csv')

post = post.drop(['count_q', 'count_a', 'ln_q', 'ln_a'], axis = 1)
# Copy code 19 times (19 clusters)
extend_post = pd.concat([post.loc[post.index.repeat(18)].reset_index(drop=True)])
df_final = pd.concat([extend_post, agg_q_monthly], axis=1)
df_final

Unnamed: 0,year_month_day,T_d,P_t,month,community,techiness,entropy
0,2021-09-01,0,0,9,0,0.525040,65.454193
1,2021-09-01,0,0,9,1,0.412791,75.610966
2,2021-09-01,0,0,9,2,0.127386,0.000000
3,2021-09-01,0,0,9,3,0.490224,58.728172
4,2021-09-01,0,0,9,4,0.439161,106.606754
...,...,...,...,...,...,...,...
13135,2023-08-31,1,1,8,13,0.421145,-111.576775
13136,2023-08-31,1,1,8,14,0.403302,-329.915781
13137,2023-08-31,1,1,8,15,0.307775,-369.615341
13138,2023-08-31,1,1,8,16,0.337905,0.000000


In [47]:
# Left merge and convert na values to 0.
df_final = pd.merge(df_final, df_merge, on = ['year_month_day', 'community'], how = 'left')
df_final['count_q'] = df_final['count_q'].fillna(0)
df_final['count_a'] = df_final['count_a'].fillna(0)
df_final['ln_q'] = np.log(df_final['count_q'])
df_final['ln_a'] = np.log(df_final['count_a'])
df_final

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


Unnamed: 0,year_month_day,T_d,P_t,month,community,techiness,entropy,count_q,count_a,ln_q,ln_a
0,2021-09-01,0,0,9,0,0.525040,65.454193,1213.0,1029.0,7.100852,6.936343
1,2021-09-01,0,0,9,1,0.412791,75.610966,401.0,225.0,5.993961,5.416100
2,2021-09-01,0,0,9,2,0.127386,0.000000,0.0,0.0,-inf,-inf
3,2021-09-01,0,0,9,3,0.490224,58.728172,394.0,250.0,5.976351,5.521461
4,2021-09-01,0,0,9,4,0.439161,106.606754,368.0,305.0,5.908083,5.720312
...,...,...,...,...,...,...,...,...,...,...,...
13135,2023-08-31,1,1,8,13,0.421145,-111.576775,29.0,18.0,3.367296,2.890372
13136,2023-08-31,1,1,8,14,0.403302,-329.915781,8.0,1.0,2.079442,0.000000
13137,2023-08-31,1,1,8,15,0.307775,-369.615341,5.0,4.0,1.609438,1.386294
13138,2023-08-31,1,1,8,16,0.337905,0.000000,0.0,0.0,-inf,-inf


### Outlier Control (clusters with all NA values or few values)

In [48]:
# Alternative Method
import numpy as np
df_final['ln_entropy'] = np.log(df_final['entropy'])
df_final.replace(-np.inf, np.nan, inplace=True)
df_final = df_final.dropna()

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


In [49]:
df_final

Unnamed: 0,year_month_day,T_d,P_t,month,community,techiness,entropy,count_q,count_a,ln_q,ln_a,ln_entropy
0,2021-09-01,0,0,9,0,0.525040,65.454193,1213.0,1029.0,7.100852,6.936343,4.181351
1,2021-09-01,0,0,9,1,0.412791,75.610966,401.0,225.0,5.993961,5.416100,4.325601
3,2021-09-01,0,0,9,3,0.490224,58.728172,394.0,250.0,5.976351,5.521461,4.072920
4,2021-09-01,0,0,9,4,0.439161,106.606754,368.0,305.0,5.908083,5.720312,4.669147
5,2021-09-01,0,0,9,5,0.461350,39.936141,378.0,258.0,5.934894,5.552960,3.687282
...,...,...,...,...,...,...,...,...,...,...,...,...
13126,2023-08-31,1,1,8,4,0.439161,136.151153,320.0,237.0,5.768321,5.468060,4.913766
13127,2023-08-31,1,1,8,5,0.461350,58.643092,309.0,220.0,5.733341,5.393628,4.071470
13128,2023-08-31,1,1,8,6,0.414720,85.398315,337.0,215.0,5.820083,5.370638,4.447326
13129,2023-08-31,1,1,8,7,0.338053,75.784015,220.0,133.0,5.393628,4.890349,4.327887


In [52]:
# Final Setup
df_final['year_month_day'] = pd.to_datetime(df_final['year_month_day'])
df_final['year_month'] = df_final['year_month_day'].dt.to_period('M') # monthly
df_final['year_month'] = df_final['year_month'].astype(str)
df_final['year_month_day'] = df_final['year_month_day'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final['year_month_day'] = pd.to_datetime(df_final['year_month_day'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final['year_month'] = df_final['year_month_day'].dt.to_period('M') # monthly
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final['year_month'] = df_final['year_month'].astyp

In [53]:
df_final

Unnamed: 0,year_month_day,T_d,P_t,month,community,techiness,entropy,count_q,count_a,ln_q,ln_a,ln_entropy,year_month
0,2021-09-01,0,0,9,0,0.525040,65.454193,1213.0,1029.0,7.100852,6.936343,4.181351,2021-09
1,2021-09-01,0,0,9,1,0.412791,75.610966,401.0,225.0,5.993961,5.416100,4.325601,2021-09
3,2021-09-01,0,0,9,3,0.490224,58.728172,394.0,250.0,5.976351,5.521461,4.072920,2021-09
4,2021-09-01,0,0,9,4,0.439161,106.606754,368.0,305.0,5.908083,5.720312,4.669147,2021-09
5,2021-09-01,0,0,9,5,0.461350,39.936141,378.0,258.0,5.934894,5.552960,3.687282,2021-09
...,...,...,...,...,...,...,...,...,...,...,...,...,...
13126,2023-08-31,1,1,8,4,0.439161,136.151153,320.0,237.0,5.768321,5.468060,4.913766,2023-08
13127,2023-08-31,1,1,8,5,0.461350,58.643092,309.0,220.0,5.733341,5.393628,4.071470,2023-08
13128,2023-08-31,1,1,8,6,0.414720,85.398315,337.0,215.0,5.820083,5.370638,4.447326,2023-08
13129,2023-08-31,1,1,8,7,0.338053,75.784015,220.0,133.0,5.393628,4.890349,4.327887,2023-08


In [56]:
df_final.describe()

Unnamed: 0,T_d,P_t,month,community,techiness,entropy,count_q,count_a,ln_q,ln_a,ln_entropy
count,5267.0,5267.0,5267.0,5267.0,5267.0,5267.0,5267.0,5267.0,5267.0,5267.0,5267.0
mean,0.518891,0.750522,6.519271,5.445035,0.447694,69.403838,472.20505,441.103664,5.975807,5.863166,4.028846
std,0.49969,0.432752,3.472063,5.140355,0.054036,33.985322,305.337382,318.822804,0.592675,0.663436,0.843314
min,0.0,0.0,1.0,0.0,0.338053,0.00397,70.0,34.0,4.248495,3.526361,-5.528965
25%,0.0,1.0,3.0,1.0,0.41472,46.286351,269.0,221.0,5.594711,5.398163,3.834847
50%,1.0,1.0,7.0,4.0,0.46135,69.5555,366.0,334.0,5.902633,5.811141,4.242125
75%,1.0,1.0,10.0,7.0,0.490224,90.755735,592.5,539.0,6.384351,6.289716,4.508172
max,1.0,1.0,12.0,17.0,0.52504,165.691735,1437.0,1364.0,7.270313,7.218177,5.110129


In [54]:
# Save Data
df_final.to_csv('df_final_pre2.csv', index=False)