In [1]:
# Import Modules
import pandas as pd
import numpy as np
import sqlite3
from nltk import FreqDist
import pickle
import math

In [2]:
# Import Dataset I (posts: questions and answers, with creation_date and tags)
conn = sqlite3.connect('/data1/StackOverflow/stackexchange-to-sqlite/stack.db')
query = '''
SELECT id, post_type, creation_date, owner_user_id, tags, body
FROM posts
WHERE creation_date > '2021-09-01'
AND creation_date < '2023-09-01';
'''
try:
    df = pd.read_sql_query(query, conn)
finally:
    # Release the database handle even when the query raises.
    conn.close()

In [3]:
# Preprocess: drop posts without an author, normalise the key/date columns,
# and derive a day-level string key (YYYY-MM-DD) used for all daily aggregations.
df = (
    df.dropna(subset=['owner_user_id'])
    .assign(
        # Cast through int before str (presumably so ids do not keep a float suffix; confirm).
        owner_user_id=lambda d: d['owner_user_id'].astype(int).astype(str),
        creation_date=lambda d: pd.to_datetime(d['creation_date']),
        # Evaluated after creation_date above, so it sees the parsed datetimes.
        year_month_day=lambda d: d['creation_date'].dt.to_period('D').astype(str),
    )
)

In [4]:
# Display the preprocessed posts frame (pandas truncates the rendered rows).
df

Unnamed: 0,id,post_type,creation_date,owner_user_id,tags,body,year_month_day
0,69006420,answer,2021-09-01 00:00:18.070,7332046,,"<p>The short approach, remove python3.9 from y...",2021-09-01
1,69006421,answer,2021-09-01 00:00:28.823,14471093,,"<p>open the terminal, run as administrator.</p...",2021-09-01
2,69006422,answer,2021-09-01 00:00:35.537,523612,,"<p>Conceptually, <code>Base</code> has no reas...",2021-09-01
3,69006423,question,2021-09-01 00:00:35.237,14087917,"[""node.js"", ""reactjs"", ""express"", ""axios"", ""re...",<p>Using interceptors for the first time to re...,2021-09-01
4,69006426,question,2021-09-01 00:00:55.583,13091928,"[""r"", ""shiny""]","<p>The APP below uses <code>navbarPage</code>,...",2021-09-01
...,...,...,...,...,...,...,...
5844592,77019853,answer,2023-08-31 23:56:47.647,5103949,,"<p>I had the same error, I fixed it by followi...",2023-08-31
5844593,77019854,question,2023-08-31 23:57:28.633,2532775,"[""angular"", ""typescript"", ""progressive-web-app...",<p>I utilized the guidelines presented in <a h...,2023-08-31
5844594,77019855,answer,2023-08-31 23:57:36.797,16844882,,"<p>In your Razor component, you can inject the...",2023-08-31
5844595,77019856,question,2023-08-31 23:58:56.693,2604570,"[""next.js""]","<p>I have the following structure, i am using ...",2023-08-31


In [5]:
# Load user data with user_type information
user_df = pd.read_csv("split_power_casual.csv")
# Both merge keys must be strings. df['owner_user_id'] was already cast to str in the
# preprocessing cell above, so only user_df needs converting here.
user_df['owner_user_id'] = user_df['owner_user_id'].astype(str)
# Left join keeps every post; authors missing from user_df get user_type = NaN.
# validate= makes the merge fail loudly if user_df has duplicate owner_user_id rows,
# which would otherwise silently duplicate posts.
df_merge = pd.merge(
    df,
    user_df[['owner_user_id', 'user_type']],
    on='owner_user_id',
    how='left',
    validate='many_to_one',
)
df_merge

Unnamed: 0,id,post_type,creation_date,owner_user_id,tags,body,year_month_day,user_type
0,69006420,answer,2021-09-01 00:00:18.070,7332046,,"<p>The short approach, remove python3.9 from y...",2021-09-01,casual
1,69006421,answer,2021-09-01 00:00:28.823,14471093,,"<p>open the terminal, run as administrator.</p...",2021-09-01,casual
2,69006422,answer,2021-09-01 00:00:35.537,523612,,"<p>Conceptually, <code>Base</code> has no reas...",2021-09-01,top
3,69006423,question,2021-09-01 00:00:35.237,14087917,"[""node.js"", ""reactjs"", ""express"", ""axios"", ""re...",<p>Using interceptors for the first time to re...,2021-09-01,casual
4,69006426,question,2021-09-01 00:00:55.583,13091928,"[""r"", ""shiny""]","<p>The APP below uses <code>navbarPage</code>,...",2021-09-01,intensive
...,...,...,...,...,...,...,...,...
5790172,77019853,answer,2023-08-31 23:56:47.647,5103949,,"<p>I had the same error, I fixed it by followi...",2023-08-31,casual
5790173,77019854,question,2023-08-31 23:57:28.633,2532775,"[""angular"", ""typescript"", ""progressive-web-app...",<p>I utilized the guidelines presented in <a h...,2023-08-31,casual
5790174,77019855,answer,2023-08-31 23:57:36.797,16844882,,"<p>In your Razor component, you can inject the...",2023-08-31,intensive
5790175,77019856,question,2023-08-31 23:58:56.693,2604570,"[""next.js""]","<p>I have the following structure, i am using ...",2023-08-31,casual


In [6]:
# Count missing values per column of the merged posts/user frame.
df_merge.isna().sum()

id                      0
post_type               0
creation_date           0
owner_user_id           0
tags              3106869
body                    0
year_month_day          0
user_type            1844
dtype: int64

In [7]:
# List the distinct user_type labels (NaN shows up when some posts have no user_type).
df_merge['user_type'].unique()

array(['casual', 'top', 'intensive', nan], dtype=object)

In [24]:
# To keep this reusable, prepare the questions and answers data as split samples
# and feed them in (e.g. old_user_q_df.csv vs. old_user_a_df.csv).
#   Questions data -> for # Questions and Entropy measures
#   Answers data   -> for # Answers and LSM measures
is_question = df['post_type'].eq('question')
is_answer = df['post_type'].eq('answer')
questions = df.loc[is_question]
answers = df.loc[is_answer]

### 1. Volume (Q, A, Comments, Answer per Question, Comments per Post)

In [62]:
# Load Baseline (year_month_day, T_d, P_t, month)
baseline_columns = ['year_month_day', 'T_d', 'P_t', 'month']
baseline = pd.read_csv('/data1/StackOverflow/_Final/lsm_new.csv')[baseline_columns]

# Daily number of questions (q) and answers (a)
df_q = questions.groupby('year_month_day').size().reset_index(name='q')
df_a = answers.groupby('year_month_day').size().reset_index(name='a')

# Outer join keeps days that appear on only one side; their missing count becomes 0.
# NOTE: np.log of a zero count yields -inf, so such a day would need handling downstream.
df_final = (
    df_q.merge(df_a, how='outer', on='year_month_day')
    .fillna(0)
    .astype({'q': int, 'a': int})
    .assign(
        ln_q=lambda d: np.log(d['q']),
        ln_a=lambda d: np.log(d['a']),
    )
)
df_final

Unnamed: 0,year_month_day,q,a,ln_q,ln_a
0,2021-09-01,4677,6237,8.450412,8.738255
1,2021-09-02,4765,6372,8.469053,8.759669
2,2021-09-03,4077,5476,8.313117,8.608130
3,2021-09-04,2394,3233,7.780721,8.081166
4,2021-09-05,2385,3345,7.776954,8.115222
...,...,...,...,...,...
725,2023-08-27,1876,1601,7.536897,7.378384
726,2023-08-28,3283,2905,8.096513,7.974189
727,2023-08-29,3552,3050,8.175266,8.022897
728,2023-08-30,3773,3193,8.235626,8.068716


In [65]:
# Attach the daily volume measures to the baseline frame (left join on the day key).
# NOTE: df_final is overwritten here, so re-running this cell alone merges baseline onto
# an already-merged frame; re-run the volume cell first.
df_final = baseline.merge(df_final, how='left', on='year_month_day')
df_final

Unnamed: 0,year_month_day,T_d,P_t,month,q,a,ln_q,ln_a
0,2021-09-01,0,0,9,4677,6237,8.450412,8.738255
1,2021-09-02,0,0,9,4765,6372,8.469053,8.759669
2,2021-09-03,0,0,9,4077,5476,8.313117,8.608130
3,2021-09-04,0,0,9,2394,3233,7.780721,8.081166
4,2021-09-05,0,0,9,2385,3345,7.776954,8.115222
...,...,...,...,...,...,...,...,...
725,2023-08-27,1,1,8,1876,1601,7.536897,7.378384
726,2023-08-28,1,1,8,3283,2905,8.096513,7.974189
727,2023-08-29,1,1,8,3552,3050,8.175266,8.022897
728,2023-08-30,1,1,8,3773,3193,8.235626,8.068716


### 2. Entropy

In [None]:
# _Robustness/entropy_aom.ipynb
# _Robustness/H4/ent_old_vr2.py
# _Robustness/H4/vr2.ipynb

In [51]:
def wc(text):
    """
    Cleaning function to be used with our first wordcloud.

    Turns a raw tag string into word-cloud friendly tokens: '><' separators
    become spaces, '-' is dropped, '.' becomes 'DOT', 'c++' becomes 'Cpp',
    'c#' becomes 'Csharp', and any remaining '<' / '>' are removed.

    Parameters
    ----------
    text : str or missing value
        Raw tag string. None, '' and NaN are all treated as "no tags".

    Returns
    -------
    str
        The cleaned tag string, or the literal 'None' for missing input.
    """
    # NaN is truthy, so a plain `if text:` lets it through and `.replace` then fails.
    # `text != text` is only true for NaN; pandas `.str` methods produce NaN for null tags.
    if not text or text != text:
        return 'None'

    replacements = (
        ('><', ' '),
        ('-', ''),
        ('.', 'DOT'),
        ('c++', 'Cpp'),
        ('c#', 'Csharp'),
        ('>', ''),
        ('<', ''),
    )
    tags = text
    # Order matters: '><' must be handled before the lone '>' and '<' are stripped.
    for old, new in replacements:
        tags = tags.replace(old, new)
    return tags
    
def clean_tags(text):
    """
    Cleaning function for tags.

    Converts a '<tag1><tag2>' style string into 'tag1 tag2' by turning '><'
    into a space and removing the remaining angle brackets.

    Parameters
    ----------
    text : str or missing value
        Raw tag string. None, '' and NaN are all treated as "no tags".

    Returns
    -------
    str
        The cleaned tag string, or the literal 'None' for missing input.
    """
    # NaN is truthy, so a plain `if text:` lets it through and `.replace` then fails.
    # `text != text` is only true for NaN.
    if not text or text != text:
        return 'None'

    tags = text.replace('><', ' ')
    tags = tags.replace('>', '')
    return tags.replace('<', '')
    
def tag_freq(data):
    """
    Count how often each tag occurs in ``data['tags']``.

    Each entry of the ``tags`` column is a string such as '["r", "shiny"]'.
    The brackets and double quotes are stripped, the remainder is normalised
    with ``wc`` and split on ', ' into individual tags.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a string column named 'tags'.

    Returns
    -------
    nltk.FreqDist
        Mapping of cleaned tag -> number of occurrences across all rows.
    """
    # Raw string: '\[' and '\]' are regex escapes, not valid Python string escapes.
    tags = data['tags'].str.replace(r'[\["\]]', '', regex=True)
    tags = [tag for cleaned in tags.apply(wc) for tag in cleaned.split(', ')]
    return FreqDist(tags)

def calculate_entropy(probabilities):
    """Return the Shannon entropy (in bits) of an iterable of probabilities.

    Entries that are not strictly positive are skipped; this includes zeros and
    NaN, since ``nan > 0`` is False.
    """
    weighted_log_sum = 0
    for prob in probabilities:
        if not (prob > 0):
            continue
        weighted_log_sum += prob * math.log(prob, 2)
    return -weighted_log_sum

In [None]:
# Days to process, in chronological order. ISO 'YYYY-MM-DD' strings sort correctly
# as text, and downstream cells rely on the per-day columns being in date order.
year_month_day = np.sort(df.year_month_day.unique())
# Extract keys throughout the whole data: one row per tag with its overall count
all_keys = tag_freq(questions)
all_keys = pd.DataFrame(all_keys, index = ['tag']).transpose().reset_index()
# Compute each tag's count and share (in percent) for every day
for day in year_month_day:
    data = questions[questions['year_month_day'] == day]
    tags = tag_freq(data)
    tagCount = pd.DataFrame(tags, index = ['tag']).transpose().reset_index()
    # Vectorized share: the day's total is computed once instead of once per tag.
    tagCount['tagShare'] = tagCount['tag'] / tagCount['tag'].sum() * 100
    varName = day.replace('-', '_')
    tagCount = tagCount.rename(columns = {'tag':f'tag_{varName}','tagShare':f'tagShare_{varName}'})
    # Left join on the tag name: tags absent on this day get NaN in both new columns.
    all_keys = pd.merge(all_keys, tagCount, on = 'index', how = 'left')
# Save Data
with open(file = 'tagShare_old_vr2.pickle', mode = 'wb') as fh:
    pickle.dump(all_keys, fh)

In [49]:
# Load the precomputed per-day tag counts/shares.
# NOTE: pickle.load executes arbitrary code; only load files produced by this project.
tag_share_path = '/data1/StackOverflow/_Robustness/H4/tagShare_new_vr2.pickle'
with open(tag_share_path, 'rb') as fh:
    all_keys = pickle.load(fh)
all_keys

Unnamed: 0,index,tag,tag_2021_09_01,tagShare_2021_09_01,tag_2021_09_02,tagShare_2021_09_02,tag_2021_09_03,tagShare_2021_09_03,tag_2021_09_04,tagShare_2021_09_04,...,tag_2023_08_27,tagShare_2023_08_27,tag_2023_08_28,tagShare_2023_08_28,tag_2023_08_29,tagShare_2023_08_29,tag_2023_08_30,tagShare_2023_08_30,tag_2023_08_31,tagShare_2023_08_31
0,swift,13371,25.0,0.472500,18.0,0.316456,13.0,0.283533,15.0,0.443262,...,11.0,0.323054,20.0,0.355619,20.0,0.344116,21.0,0.332963,20.0,0.327118
1,python,246799,335.0,6.331506,398.0,6.997187,289.0,6.303162,219.0,6.471631,...,174.0,5.110132,287.0,5.103129,275.0,4.731590,294.0,4.661487,273.0,4.465162
2,discordDOTpy,4566,12.0,0.226800,9.0,0.158228,9.0,0.196292,2.0,0.059102,...,2.0,0.058737,6.0,0.106686,5.0,0.086029,3.0,0.047566,6.0,0.098135
3,texttospeech,284,2.0,0.037800,1.0,0.017581,,,,,...,,,,,1.0,0.017206,,,2.0,0.032712
4,r,47444,58.0,1.096201,81.0,1.424051,61.0,1.330425,32.0,0.945626,...,19.0,0.558003,26.0,0.462304,42.0,0.722643,55.0,0.872047,46.0,0.752372
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41185,plasmo,1,,,,,,,,,...,,,,,,,,,1.0,0.016356
41186,mscapi,1,,,,,,,,,...,,,,,,,,,1.0,0.016356
41187,nbitcoin,1,,,,,,,,,...,,,,,,,,,1.0,0.016356
41188,useswr,1,,,,,,,,,...,,,,,,,,,1.0,0.016356


In [52]:
# Measure score
# Calculate the entropy of the tag distribution for each daily tag share column.
# Columns are selected by their 'tagShare_' prefix rather than by position, so the
# result does not depend on the exact column layout of all_keys.
share_columns = [col for col in all_keys.columns if str(col).startswith('tagShare_')]
entropy_Score = [
    # Shares are stored as percentages; rescale to probabilities first.
    # NaN entries (tag absent on that day) are skipped by calculate_entropy.
    calculate_entropy((all_keys[col] / 100).tolist())
    for col in share_columns
]

In [66]:
# Attach the daily entropy scores and their natural log.
# entropy_Score is a plain list, so values are matched to rows by position, not by date.
df_final = df_final.assign(
    entropy=entropy_Score,
    ln_entropy=lambda d: np.log(d['entropy']),
)
df_final

Unnamed: 0,year_month_day,T_d,P_t,month,q,a,ln_q,ln_a,entropy,ln_entropy
0,2021-09-01,0,0,9,4677,6237,8.450412,8.738255,8.940674,2.190611
1,2021-09-02,0,0,9,4765,6372,8.469053,8.759669,8.933225,2.189777
2,2021-09-03,0,0,9,4077,5476,8.313117,8.608130,8.921938,2.188513
3,2021-09-04,0,0,9,2394,3233,7.780721,8.081166,8.578207,2.149225
4,2021-09-05,0,0,9,2385,3345,7.776954,8.115222,8.441587,2.133170
...,...,...,...,...,...,...,...,...,...,...
725,2023-08-27,1,1,8,1876,1601,7.536897,7.378384,9.108446,2.209202
726,2023-08-28,1,1,8,3283,2905,8.096513,7.974189,9.634171,2.265316
727,2023-08-29,1,1,8,3552,3050,8.175266,8.022897,9.638980,2.265815
728,2023-08-30,1,1,8,3773,3193,8.235626,8.068716,9.702986,2.272434


### 3. LSM

In [67]:
# Display the answers subset that the LSM measures are based on.
answers

Unnamed: 0,id,post_type,creation_date,owner_user_id,tags,body,year_month_day
0,69006420,answer,2021-09-01 00:00:18.070,7332046,,"<p>The short approach, remove python3.9 from y...",2021-09-01
1,69006421,answer,2021-09-01 00:00:28.823,14471093,,"<p>open the terminal, run as administrator.</p...",2021-09-01
2,69006422,answer,2021-09-01 00:00:35.537,523612,,"<p>Conceptually, <code>Base</code> has no reas...",2021-09-01
5,69006429,answer,2021-09-01 00:01:09.140,11269158,,<p><code>calldata</code> is a special data loc...,2021-09-01
6,69006430,answer,2021-09-01 00:01:17.920,2096113,,<p>You could do something like this if the str...,2021-09-01
...,...,...,...,...,...,...,...
5844589,77019850,answer,2023-08-31 23:56:07.187,313768,,<p>Make an inner solution routine that accepts...,2023-08-31
5844590,77019851,answer,2023-08-31 23:56:19.123,13296852,,<p>When you pass <code>stack[100]</code> as an...,2023-08-31
5844592,77019853,answer,2023-08-31 23:56:47.647,5103949,,"<p>I had the same error, I fixed it by followi...",2023-08-31
5844594,77019855,answer,2023-08-31 23:57:36.797,16844882,,"<p>In your Razor component, you can inject the...",2023-08-31


In [1]:
# Load Pickle
import pickle
import pandas as pd
import numpy as np

# NOTE: pickle.load executes arbitrary code; only load files produced by this project.
lsm_result_path = '/data1/StackOverflow/_Final/LSM_new_result.pickle'
with open(lsm_result_path, 'rb') as fh:
    lsm_og = pickle.load(fh)

In [3]:
# Display the loaded LSM results.
lsm_og

Unnamed: 0,owner_user_id,year_month,body,lsm_score
0,10000229,2021-09-01,<p>One thing that I've noticed in your code is...,0.157493
1,10002945,2021-09-01,<p>I am sure the above answer is very good and...,0.162539
2,10008173,2021-09-01,<p>Since you already know the location on the ...,0.341333
3,10008643,2021-09-01,<p>Replace &quot;justify-content-center&quot; ...,0.047047
4,10009777,2021-09-01,<p>(might need to adjust syntax according to y...,0.063511
...,...,...,...,...
2240706,9973516,2023-08-31,<p>Your <code>bf</code> struct contains only t...,0.203579
2240707,997358,2023-08-31,<p>A simple solution is readily obtained if th...,0.094842
2240708,997378,2023-08-31,<p>Use <code>!curl ipinfo.io</code> to check w...,0.079271
2240709,9977815,2023-08-31,<p>I finally figured it out.\nI had to use</p>...,0.068769


In [2]:
# Load user data with user_type information.
# The path is relative, so the file is resolved against the notebook's working directory.
user_df = pd.read_csv("split_power_casual.csv")
user_df

Unnamed: 0,owner_user_id,total_counts,user_type
0,100,2.000000,intensive
1,10000015,1.000000,casual
2,10000035,1.000000,casual
3,10000042,1.000000,casual
4,10000051,2.333333,intensive
...,...,...,...
1564962,9999918,1.000000,casual
1564963,9999935,4.000000,intensive
1564964,9999957,1.000000,casual
1564965,9999964,1.000000,casual


In [3]:
# Cast the join key to str so it can be matched against lsm_og's owner_user_id
# (presumably stored as strings there; confirm).
user_df['owner_user_id'] = user_df['owner_user_id'].astype(str)

In [4]:
# Left join keeps every LSM row and adds the user's total_counts and user_type.
df_merge = lsm_og.merge(user_df, how='left', on='owner_user_id')
df_merge

Unnamed: 0,owner_user_id,year_month,body,lsm_score,total_counts,user_type
0,10000229,2021-09-01,<p>One thing that I've noticed in your code is...,0.157493,3.100000,intensive
1,10002945,2021-09-01,<p>I am sure the above answer is very good and...,0.162539,1.500000,casual
2,10008173,2021-09-01,<p>Since you already know the location on the ...,0.341333,14.135088,top
3,10008643,2021-09-01,<p>Replace &quot;justify-content-center&quot; ...,0.047047,3.000000,intensive
4,10009777,2021-09-01,<p>(might need to adjust syntax according to y...,0.063511,1.875000,casual
...,...,...,...,...,...,...
2240706,9973516,2023-08-31,<p>Your <code>bf</code> struct contains only t...,0.203579,4.727273,intensive
2240707,997358,2023-08-31,<p>A simple solution is readily obtained if th...,0.094842,2.457237,intensive
2240708,997378,2023-08-31,<p>Use <code>!curl ipinfo.io</code> to check w...,0.079271,3.071429,intensive
2240709,9977815,2023-08-31,<p>I finally figured it out.\nI had to use</p>...,0.068769,1.583333,casual


In [9]:
# Number of LSM rows per user type (groupby drops rows whose user_type is NaN).
rows_per_user_type = df_merge.groupby('user_type').size()
rows_per_user_type.reset_index(name='val')

Unnamed: 0,user_type,val
0,casual,1183928
1,intensive,848236
2,top,208547


In [8]:
# Rename the date key to 'year_month_day' (the name used by the posts frame) and
# store it as plain strings.
# Renaming first keeps the cell re-runnable: rename() ignores a missing 'year_month'
# column, whereas indexing lsm_og['year_month'] on a second run raised KeyError.
lsm_og = (
    lsm_og
    .rename(columns={'year_month': 'year_month_day'})
    .astype({'year_month_day': str})
)
lsm_og

Unnamed: 0,owner_user_id,year_month_day,body,lsm_score
0,10000229,2021-09-01,<p>One thing that I've noticed in your code is...,0.157493
1,10002945,2021-09-01,<p>I am sure the above answer is very good and...,0.162539
2,10008173,2021-09-01,<p>Since you already know the location on the ...,0.341333
3,10008643,2021-09-01,<p>Replace &quot;justify-content-center&quot; ...,0.047047
4,10009777,2021-09-01,<p>(might need to adjust syntax according to y...,0.063511
...,...,...,...,...
2240706,9973516,2023-08-31,<p>Your <code>bf</code> struct contains only t...,0.203579
2240707,997358,2023-08-31,<p>A simple solution is readily obtained if th...,0.094842
2240708,997378,2023-08-31,<p>Use <code>!curl ipinfo.io</code> to check w...,0.079271
2240709,9977815,2023-08-31,<p>I finally figured it out.\nI had to use</p>...,0.068769
