In [1]:
# Post Data
import pandas as pd
import numpy as np
import sqlite3
import pickle
import math
import matplotlib.pyplot as plt
import statsmodels.formula.api as sm

# Import Dataset
# Pull the post columns used below for posts whose creation_date lies strictly
# between the two bounds (the recorded output spans 2021-09-01 .. 2023-08-31).
query = '''
SELECT id, post_type, creation_date, owner_user_id, tags, body
FROM posts
WHERE creation_date > '2021-09-01'
AND creation_date < '2023-09-01';
'''
conn = sqlite3.connect('/data1/StackOverflow/stackexchange-to-sqlite/stack.db')
try:
    df = pd.read_sql_query(query, conn)
finally:
    # Release the database handle even when the query raises.
    conn.close()

In [2]:
# Load user data with user_type information
# One row per owner_user_id with a total_counts value and a user_type label;
# the recorded output shows the labels 'casual' and 'intensive', and the merge
# output further down also shows 'top'.
# NOTE(review): split_power_casual.csv is not produced by any visible cell — confirm its provenance.
user_df = pd.read_csv("split_power_casual.csv")
user_df

Unnamed: 0,owner_user_id,total_counts,user_type
0,100,2.000000,intensive
1,10000015,1.000000,casual
2,10000035,1.000000,casual
3,10000042,1.000000,casual
4,10000051,2.333333,intensive
...,...,...,...
1564962,9999918,1.000000,casual
1564963,9999935,4.000000,intensive
1564964,9999957,1.000000,casual
1564965,9999964,1.000000,casual


In [3]:
# Preprocess
# Drop posts without an owner, normalise the join key to a string and derive a
# calendar-day bucket. Everything is built in one assign() chain so the new
# columns are written to a fresh frame instead of onto the dropna() result,
# which can raise SettingWithCopyWarning on some pandas versions.
df = (
    df.dropna(subset=['owner_user_id'])
    .assign(
        # int first, then str, so the key is rendered as a plain integer string.
        owner_user_id=lambda d: d['owner_user_id'].astype(int).astype(str),
        creation_date=lambda d: pd.to_datetime(d['creation_date']),
        # Later keyword arguments see the already-converted creation_date.
        # The Period('D') string form is 'YYYY-MM-DD'.
        year_month_day=lambda d: d['creation_date'].dt.to_period('D').astype(str),
    )
)

In [4]:
# Inspect the preprocessed posts; the recorded output includes the derived year_month_day column.
df

Unnamed: 0,id,post_type,creation_date,owner_user_id,tags,body,year_month_day
0,69006420,answer,2021-09-01 00:00:18.070,7332046,,"<p>The short approach, remove python3.9 from y...",2021-09-01
1,69006421,answer,2021-09-01 00:00:28.823,14471093,,"<p>open the terminal, run as administrator.</p...",2021-09-01
2,69006422,answer,2021-09-01 00:00:35.537,523612,,"<p>Conceptually, <code>Base</code> has no reas...",2021-09-01
3,69006423,question,2021-09-01 00:00:35.237,14087917,"[""node.js"", ""reactjs"", ""express"", ""axios"", ""re...",<p>Using interceptors for the first time to re...,2021-09-01
4,69006426,question,2021-09-01 00:00:55.583,13091928,"[""r"", ""shiny""]","<p>The APP below uses <code>navbarPage</code>,...",2021-09-01
...,...,...,...,...,...,...,...
5844592,77019853,answer,2023-08-31 23:56:47.647,5103949,,"<p>I had the same error, I fixed it by followi...",2023-08-31
5844593,77019854,question,2023-08-31 23:57:28.633,2532775,"[""angular"", ""typescript"", ""progressive-web-app...",<p>I utilized the guidelines presented in <a h...,2023-08-31
5844594,77019855,answer,2023-08-31 23:57:36.797,16844882,,"<p>In your Razor component, you can inject the...",2023-08-31
5844595,77019856,question,2023-08-31 23:58:56.693,2604570,"[""next.js""]","<p>I have the following structure, i am using ...",2023-08-31


In [5]:
# Cast the user key to str so it has the same type as df['owner_user_id'] for the merge.
user_df['owner_user_id'] = user_df['owner_user_id'].astype(str)

In [6]:
# Attach each post owner's user_type.
# how='left' keeps every post; owners missing from user_df get NaN user_type.
# validate='many_to_one' makes pandas raise a MergeError if user_df ever holds
# the same owner_user_id twice, instead of silently duplicating posts.
df_merge = pd.merge(
    df,
    user_df[['owner_user_id', 'user_type']],
    on = 'owner_user_id',
    how = 'left',
    validate = 'many_to_one',
)
df_merge

Unnamed: 0,id,post_type,creation_date,owner_user_id,tags,body,year_month_day,user_type
0,69006420,answer,2021-09-01 00:00:18.070,7332046,,"<p>The short approach, remove python3.9 from y...",2021-09-01,casual
1,69006421,answer,2021-09-01 00:00:28.823,14471093,,"<p>open the terminal, run as administrator.</p...",2021-09-01,casual
2,69006422,answer,2021-09-01 00:00:35.537,523612,,"<p>Conceptually, <code>Base</code> has no reas...",2021-09-01,top
3,69006423,question,2021-09-01 00:00:35.237,14087917,"[""node.js"", ""reactjs"", ""express"", ""axios"", ""re...",<p>Using interceptors for the first time to re...,2021-09-01,casual
4,69006426,question,2021-09-01 00:00:55.583,13091928,"[""r"", ""shiny""]","<p>The APP below uses <code>navbarPage</code>,...",2021-09-01,intensive
...,...,...,...,...,...,...,...,...
5790172,77019853,answer,2023-08-31 23:56:47.647,5103949,,"<p>I had the same error, I fixed it by followi...",2023-08-31,casual
5790173,77019854,question,2023-08-31 23:57:28.633,2532775,"[""angular"", ""typescript"", ""progressive-web-app...",<p>I utilized the guidelines presented in <a h...,2023-08-31,casual
5790174,77019855,answer,2023-08-31 23:57:36.797,16844882,,"<p>In your Razor component, you can inject the...",2023-08-31,intensive
5790175,77019856,question,2023-08-31 23:58:56.693,2604570,"[""next.js""]","<p>I have the following structure, i am using ...",2023-08-31,casual


In [7]:
# Partition the merged posts into one frame per user segment.
df_casual, df_intensive, df_top = (
    df_merge[df_merge['user_type'] == segment]
    for segment in ('casual', 'intensive', 'top')
)

In [28]:
# Keep only the question posts written by 'top' users.
is_question = df_top['post_type'].eq('question')
questions = df_top[is_question]
questions

Unnamed: 0,id,post_type,creation_date,owner_user_id,tags,body,year_month_day,user_type
11,69006437,question,2021-09-01 00:03:18.750,2745485,"[""firebase"", ""nuxt.js"", ""firebase-hosting""]","<p>As you know, all firebase hosting is provid...",2021-09-01,top
76,69006520,question,2021-09-01 00:19:28.923,7360568,"[""slack"", ""slack-api""]","<p>According to <a href=""https://api.slack.com...",2021-09-01,top
104,69006556,question,2021-09-01 00:28:08.433,16800531,"[""html"", ""css"", ""css-grid""]",<p>I am attempting to use CSS grid (orderForm....,2021-09-01,top
153,69006622,question,2021-09-01 00:42:05.683,7778016,"[""python-3.x"", ""selenium""]",<p>I have few links with URLs and text inside ...,2021-09-01,top
404,69006970,question,2021-09-01 01:52:25.597,11267888,"[""javascript"", ""cart""]",<p>I am trying to have a discount applied to t...,2021-09-01,top
...,...,...,...,...,...,...,...,...
5789303,77018882,question,2023-08-31 19:53:55.183,12547996,"[""r""]","<p>Based on the data below, when I bring in a ...",2023-08-31,top
5789463,77019067,question,2023-08-31 20:24:58.437,22480098,"[""c++""]",<p>I cant put <code>cin&gt;&gt;x</code> in the...,2023-08-31,top
5789566,77019177,question,2023-08-31 20:45:22.110,15389244,"[""r"", ""dplyr""]",<p>I have data in R that looks like this:</p>\...,2023-08-31,top
5789638,77019255,question,2023-08-31 21:04:27.977,9998081,"[""php"", ""wordpress"", ""woocommerce"", ""discount""...",<p>I am adding a WooCommerce cart fee like so ...,2023-08-31,top


In [29]:
# Number of questions per calendar day: one row per day, count in column 'q'.
daily_question_counts = questions.groupby('year_month_day').size()
df_q = daily_question_counts.rename('q').reset_index()
df_q

Unnamed: 0,year_month_day,q
0,2021-09-01,85
1,2021-09-02,77
2,2021-09-03,60
3,2021-09-04,43
4,2021-09-05,45
...,...,...
725,2023-08-27,13
726,2023-08-28,35
727,2023-08-29,31
728,2023-08-30,45


In [31]:
# Distinct days that have at least one question, in chronological order.
# unique() alone returns first-appearance order, which is chronological only if
# the rows happen to be stored that way. 'YYYY-MM-DD' strings sort
# lexicographically in date order, so sorting guarantees the same day order as
# the groupby result in df_q.
year_month_day = np.sort(questions['year_month_day'].unique())
year_month_day

array(['2021-09-01', '2021-09-02', '2021-09-03', '2021-09-04',
       '2021-09-05', '2021-09-06', '2021-09-07', '2021-09-08',
       '2021-09-09', '2021-09-10', '2021-09-11', '2021-09-12',
       '2021-09-13', '2021-09-14', '2021-09-15', '2021-09-16',
       '2021-09-17', '2021-09-18', '2021-09-19', '2021-09-20',
       '2021-09-21', '2021-09-22', '2021-09-23', '2021-09-24',
       '2021-09-25', '2021-09-26', '2021-09-27', '2021-09-28',
       '2021-09-29', '2021-09-30', '2021-10-01', '2021-10-02',
       '2021-10-03', '2021-10-04', '2021-10-05', '2021-10-06',
       '2021-10-07', '2021-10-08', '2021-10-09', '2021-10-10',
       '2021-10-11', '2021-10-12', '2021-10-13', '2021-10-14',
       '2021-10-15', '2021-10-16', '2021-10-17', '2021-10-18',
       '2021-10-19', '2021-10-20', '2021-10-21', '2021-10-22',
       '2021-10-23', '2021-10-24', '2021-10-25', '2021-10-26',
       '2021-10-27', '2021-10-28', '2021-10-29', '2021-10-30',
       '2021-10-31', '2021-11-01', '2021-11-02', '2021-

In [35]:
# Overall tag totals across all selected questions, reshaped to one row per tag
# with the tag name in column 'index' and its total in column 'tag'.
# NOTE(review): tag_freq is not defined in the visible cells — presumably it
# returns a tag -> count mapping; confirm against its definition.
all_keys = tag_freq(questions)
all_keys = pd.DataFrame(all_keys, index = ['tag']).T
all_keys = all_keys.reset_index()
all_keys

Unnamed: 0,index,tag
0,firebase,278
1,nuxtDOTjs,33
2,firebasehosting,3
3,slack,4
4,slackapi,2
...,...,...
8821,visualstudioextensions,1
8822,nothrow,1
8823,mpesa,1
8824,firebird1DOT5,1


In [36]:
# Build a wide table holding, for every day, each tag's count (tag_<day>) and
# its share of that day's tag occurrences in percent (tagShare_<day>).
# Always start from the two base columns so that re-running this cell does not
# merge a second copy of every daily column (suffixed _x/_y) into all_keys.
base_keys = all_keys[['index', 'tag']]
daily_frames = []
for day in year_month_day:
    target_data = questions[questions['year_month_day'] == day]
    tags = tag_freq(target_data)
    tagCount = pd.DataFrame(tags, index = ['tag']).transpose().reset_index()
    # Vectorised share: the day's total is computed once, not once per tag.
    tagCount['tagShare'] = (tagCount['tag'] / tagCount['tag'].sum()) * 100
    varName = day.replace('-', '_')
    tagCount = tagCount.rename(columns = {'tag':f'tag_{varName}','tagShare':f'tagShare_{varName}'})
    daily_frames.append(tagCount.set_index('index'))

if daily_frames:
    # One concat plus one left join replaces one merge per day, which copied the
    # ever-growing frame on every iteration. Row order follows base_keys, and
    # tags absent on a given day stay NaN, exactly as with the per-day left merge.
    all_keys = base_keys.join(pd.concat(daily_frames, axis = 1), on = 'index')
else:
    all_keys = base_keys

In [37]:
# Inspect the wide per-day table; tags that did not occur on a day show NaN in that day's columns.
all_keys

Unnamed: 0,index,tag,tag_2021_09_01,tagShare_2021_09_01,tag_2021_09_02,tagShare_2021_09_02,tag_2021_09_03,tagShare_2021_09_03,tag_2021_09_04,tagShare_2021_09_04,...,tag_2023_08_27,tagShare_2023_08_27,tag_2023_08_28,tagShare_2023_08_28,tag_2023_08_29,tagShare_2023_08_29,tag_2023_08_30,tagShare_2023_08_30,tag_2023_08_31,tagShare_2023_08_31
0,firebase,278,1.0,0.390625,2.0,0.826446,1.0,0.534759,,,...,1.0,2.222222,,,,,,,1.0,0.900901
1,nuxtDOTjs,33,1.0,0.390625,,,,,,,...,,,,,,,,,,
2,firebasehosting,3,1.0,0.390625,,,,,,,...,,,,,,,,,,
3,slack,4,1.0,0.390625,,,,,,,...,,,,,,,,,,
4,slackapi,2,1.0,0.390625,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8821,visualstudioextensions,1,,,,,,,,,...,,,,,,,1.0,0.70922,,
8822,nothrow,1,,,,,,,,,...,,,,,,,1.0,0.70922,,
8823,mpesa,1,,,,,,,,,...,,,,,,,,,1.0,0.900901
8824,firebird1DOT5,1,,,,,,,,,...,,,,,,,,,1.0,0.900901


In [39]:
# Measure score
# Calculate the entropy of each *daily* tag-share column: one score per day, in
# column order. Columns are selected by their 'tagShare_' prefix rather than by
# position (every second column starting at 3), so the selection keeps working
# if the column layout of all_keys changes.
# NOTE(review): calculate_entropy is not defined in the visible cells; it
# receives a plain list of fractions that still contains NaN for tags absent on
# that day — confirm it handles NaN as intended.
share_columns = [col for col in all_keys.columns if str(col).startswith('tagShare_')]
entropy_Score = [
    # Shares are stored in percent; rescale to fractions before scoring.
    calculate_entropy((all_keys[col] / 100).tolist())
    for col in share_columns
]

In [44]:
# Number of entropy scores computed; the recorded run shows 730 (one per tagShare column).
len(entropy_Score)

730

In [None]:
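# pg.27: 첫 번째 스텝이 분해하는 것인데, 분해가 잘 되었다고는 어떻게 알 수 있나요?
# (p. 27: The first step is the decomposition — how can we tell that the decomposition worked well?)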

In [45]:
# Load the daily panel for casual users (one row per year_month_day).
# NOTE(review): df_split_casual.csv is not written by any visible cell — presumably
# saved by an earlier run of this analysis; confirm its provenance.
casual = pd.read_csv('df_split_casual.csv')
casual

Unnamed: 0,year_month_day,T_d,P_t,month,q,a,ln_q,ln_a,entropy,ln_entropy,lsm,ln_lsm
0,2021-09-01,0,0,9,2652,0,7.883069,-inf,9.672691,2.269307,0.085186,-2.462922
1,2021-09-02,0,0,9,2675,0,7.891705,-inf,9.617068,2.263539,0.098764,-2.315024
2,2021-09-03,0,0,9,2338,0,7.757051,-inf,9.672048,2.269240,0.093233,-2.372651
3,2021-09-04,0,0,9,1313,0,7.180070,-inf,9.194534,2.218609,0.109986,-2.207402
4,2021-09-05,0,0,9,1343,0,7.202661,-inf,9.141328,2.212806,0.108032,-2.225332
...,...,...,...,...,...,...,...,...,...,...,...,...
725,2023-08-27,1,1,8,1308,0,7.176255,-inf,9.447730,2.245774,0.121113,-2.111031
726,2023-08-28,1,1,8,2266,0,7.725771,-inf,9.962066,2.298784,0.115723,-2.156557
727,2023-08-29,1,1,8,2418,0,7.790696,-inf,10.001658,2.302751,0.104343,-2.260069
728,2023-08-30,1,1,8,2532,0,7.836765,-inf,10.039628,2.306540,0.140746,-1.960796


In [46]:
# Load the daily panel for intensive users (same columns as the casual panel).
# NOTE(review): df_split_intensive.csv is not written by any visible cell — confirm its provenance.
intensive = pd.read_csv('df_split_intensive.csv')
intensive

Unnamed: 0,year_month_day,T_d,P_t,month,q,a,ln_q,ln_a,entropy,ln_entropy,lsm,ln_lsm
0,2021-09-01,0,0,9,1940,0,7.570443,-inf,8.966081,2.193449,0.110529,-2.202481
1,2021-09-02,0,0,9,2013,0,7.607381,-inf,8.991894,2.196324,0.122963,-2.095871
2,2021-09-03,0,0,9,1679,0,7.425954,-inf,9.002499,2.197502,0.116329,-2.151331
3,2021-09-04,0,0,9,1038,0,6.945051,-inf,8.479622,2.137666,0.132860,-2.018461
4,2021-09-05,0,0,9,997,0,6.904751,-inf,8.569897,2.148256,0.135432,-1.999285
...,...,...,...,...,...,...,...,...,...,...,...,...
725,2023-08-27,1,1,8,555,0,6.318968,-inf,8.699824,2.163303,0.146551,-1.920383
726,2023-08-28,1,1,8,982,0,6.889591,-inf,9.162927,2.215166,0.142119,-1.951092
727,2023-08-29,1,1,8,1103,0,7.005789,-inf,9.251995,2.224839,0.130770,-2.034318
728,2023-08-30,1,1,8,1196,0,7.086738,-inf,9.252655,2.224911,0.167037,-1.789539


In [47]:
# Load the daily panel for top users (same columns as the casual panel).
# NOTE(review): df_split_top.csv is not written by any visible cell — confirm its provenance.
top = pd.read_csv('df_split_top.csv')
top

Unnamed: 0,year_month_day,T_d,P_t,month,q,a,ln_q,ln_a,entropy,ln_entropy,lsm,ln_lsm
0,2021-09-01,0,0,9,85,0,4.442651,-inf,6.969697,1.941572,0.160669,-1.828408
1,2021-09-02,0,0,9,77,0,4.343805,-inf,6.981069,1.943202,0.182673,-1.700059
2,2021-09-03,0,0,9,60,0,4.094345,-inf,6.542010,1.878244,0.169282,-1.776189
3,2021-09-04,0,0,9,43,0,3.761200,-inf,6.250302,1.832630,0.183321,-1.696518
4,2021-09-05,0,0,9,45,0,3.806662,-inf,6.287710,1.838597,0.190067,-1.660378
...,...,...,...,...,...,...,...,...,...,...,...,...
725,2023-08-27,1,1,8,13,0,2.564949,-inf,5.102747,1.629779,0.180791,-1.710412
726,2023-08-28,1,1,8,35,0,3.555348,-inf,6.231867,1.829676,0.180781,-1.710467
727,2023-08-29,1,1,8,31,0,3.433987,-inf,5.705714,1.741468,0.163347,-1.811879
728,2023-08-30,1,1,8,45,0,3.806662,-inf,6.455705,1.864964,0.211216,-1.554874


In [58]:
# Sanity check: list days whose entropy is exactly 0; the recorded output is empty.
intensive[intensive['entropy'] == 0]

Unnamed: 0,year_month_day,T_d,P_t,month,q,a,ln_q,ln_a,entropy,ln_entropy,lsm,ln_lsm


In [59]:
# Missing-value count per column (recorded output: 0 everywhere).
# isna() does not flag the -inf values visible in ln_a, so they are not counted here.
intensive.isna().sum()

year_month_day    0
T_d               0
P_t               0
month             0
q                 0
a                 0
ln_q              0
ln_a              0
entropy           0
ln_entropy        0
lsm               0
ln_lsm            0
dtype: int64

In [60]:
# Missing-value count per column (recorded output: 0 everywhere).
# isna() does not flag the -inf values visible in ln_a, so they are not counted here.
top.isna().sum()

year_month_day    0
T_d               0
P_t               0
month             0
q                 0
a                 0
ln_q              0
ln_a              0
entropy           0
ln_entropy        0
lsm               0
ln_lsm            0
dtype: int64

In [61]:
# Summary statistics for the casual-user panel.
# NOTE(review): the recorded output shows a == 0 and ln_a == -inf throughout, with NaN
# std/quantiles for ln_a — presumably the source of the warning lines printed above
# the table; confirm whether the answer counts were meant to be populated.
casual.describe()

  sqr = _ensure_numeric((avg - values) ** 2)
  diff_b_a = subtract(b, a)


Unnamed: 0,T_d,P_t,month,q,a,ln_q,ln_a,entropy,ln_entropy,lsm,ln_lsm
count,730.0,730.0,730.0,730.0,730.0,730.0,730.0,730.0,730.0,730.0,730.0
mean,0.5,0.753425,6.526027,2261.423288,0.0,7.679317,-inf,9.656151,2.267201,0.104483,-2.266637
std,0.500343,0.431313,3.450215,630.954132,0.0,0.309726,,0.270293,0.028172,0.013354,0.125144
min,0.0,0.0,1.0,1009.0,0.0,6.916715,-inf,8.99122,2.196249,0.065848,-2.720403
25%,0.0,1.0,4.0,1635.0,0.0,7.399398,,9.48666,2.249887,0.095156,-2.352234
50%,0.5,1.0,7.0,2378.0,0.0,7.774015,,9.671182,2.269151,0.10277,-2.275264
75%,1.0,1.0,10.0,2666.5,0.0,7.888522,,9.907074,2.293249,0.112246,-2.187067
max,1.0,1.0,12.0,3701.0,0.0,8.216358,-inf,10.101883,2.312722,0.154568,-1.86712


In [62]:
# Summary statistics for the top-user panel.
# NOTE(review): the recorded output shows a == 0 and ln_a == -inf throughout, with NaN
# std/quantiles for ln_a — presumably the source of the warning lines printed above
# the table; confirm whether the answer counts were meant to be populated.
top.describe()

  sqr = _ensure_numeric((avg - values) ** 2)
  diff_b_a = subtract(b, a)


Unnamed: 0,T_d,P_t,month,q,a,ln_q,ln_a,entropy,ln_entropy,lsm,ln_lsm
count,730.0,730.0,730.0,730.0,730.0,730.0,730.0,730.0,730.0,730.0,730.0
mean,0.5,0.753425,6.526027,41.80411,0.0,3.642444,-inf,6.116709,1.807756,0.171856,-1.763665
std,0.500343,0.431313,3.450215,16.277725,0.0,0.455714,,0.476742,0.08254,0.012451,0.071421
min,0.0,0.0,1.0,5.0,0.0,1.609438,-inf,4.0,1.386294,0.138616,-1.976046
25%,0.0,1.0,4.0,29.0,0.0,3.367296,,5.899646,1.774892,0.163045,-1.81373
50%,0.5,1.0,7.0,42.0,0.0,3.73767,,6.20112,1.82473,0.170473,-1.76918
75%,1.0,1.0,10.0,53.0,0.0,3.970292,,6.438208,1.86225,0.179489,-1.717641
max,1.0,1.0,12.0,93.0,0.0,4.532599,-inf,6.999526,1.945842,0.226038,-1.487052


In [63]:
# Summary statistics for the intensive-user panel.
# NOTE(review): the recorded output shows a == 0 and ln_a == -inf throughout, with NaN
# std/quantiles for ln_a — presumably the source of the warning lines printed above
# the table; confirm whether the answer counts were meant to be populated.
intensive.describe()

  sqr = _ensure_numeric((avg - values) ** 2)
  diff_b_a = subtract(b, a)


Unnamed: 0,T_d,P_t,month,q,a,ln_q,ln_a,entropy,ln_entropy,lsm,ln_lsm
count,730.0,730.0,730.0,730.0,730.0,730.0,730.0,730.0,730.0,730.0,730.0
mean,0.5,0.753425,6.526027,1372.005479,0.0,7.168352,-inf,8.971294,2.193778,0.128957,-2.053776
std,0.500343,0.431313,3.450215,423.105687,0.0,0.349728,,0.200708,0.022534,0.013747,0.104246
min,0.0,0.0,1.0,497.0,0.0,6.20859,-inf,8.474723,2.137088,0.091784,-2.388315
25%,0.0,1.0,4.0,1015.5,0.0,6.923136,,8.8051,2.175331,0.119164,-2.127251
50%,0.5,1.0,7.0,1445.0,0.0,7.275864,,9.026382,2.200152,0.12688,-2.064517
75%,1.0,1.0,10.0,1760.25,0.0,7.473211,,9.114223,2.209836,0.136544,-1.991112
max,1.0,1.0,12.0,2072.0,0.0,7.63627,-inf,9.314384,2.23156,0.183544,-1.695302
