# Add 3 response(target) variables
- ln(Number of Comments)
- ln(Number of Answers per Question)
- ln(Comment per Post)

In [10]:
# Import Modules
import pandas as pd
import numpy as np
import sqlite3
import pickle
import math
import matplotlib.pyplot as plt
import statsmodels.formula.api as sm

# Import Dataset
conn = sqlite3.connect('/data1/StackOverflow/stackexchange-to-sqlite/stack.db')
query = '''
SELECT id, creation_date, answers, comments, owner_user_id
FROM questions
WHERE creation_date > '2021-09-01'
AND creation_date < '2023-09-01';
'''
df = pd.read_sql_query(query, conn)
conn.close()

In [11]:
df

Unnamed: 0,id,creation_date,answers,comments,owner_user_id
0,69006423,2021-09-01 00:00:35.237,1,0,14087917.0
1,69006426,2021-09-01 00:00:55.583,1,0,13091928.0
2,69006431,2021-09-01 00:01:23.670,1,2,8110267.0
3,69006437,2021-09-01 00:03:18.750,1,0,2745485.0
4,69006439,2021-09-01 00:03:36.930,1,2,12139975.0
...,...,...,...,...,...
2715204,77019848,2023-08-31 23:54:30.057,1,2,1006272.0
2715205,77019849,2023-08-31 23:55:21.660,0,0,15216800.0
2715206,77019852,2023-08-31 23:56:42.653,1,2,433202.0
2715207,77019854,2023-08-31 23:57:28.633,0,2,2532775.0


In [6]:
df.describe()

Unnamed: 0,id,answers,comments,owner_user_id
count,2715209.0,2715209.0,2715209.0,2682920.0
mean,73223870.0,0.9339351,1.844893,12543790.0
std,2333213.0,0.8636178,2.554058,6686994.0
min,69006420.0,0.0,0.0,29.0
25%,71202000.0,0.0,0.0,6691575.0
50%,73388740.0,1.0,1.0,13973840.0
75%,75255790.0,1.0,3.0,18269460.0
max,77019860.0,60.0,62.0,22480650.0


### 1) Number of Comments

In [7]:
# Load Data (preChatGPT)
with open(file = 'ques_df_pre.pickle', mode = 'rb') as file:
    ques_df = pickle.load(file)

In [12]:
df_merge = pd.merge(ques_df, df, on = ['id', 'creation_date'], how = 'left')
df_merge

Unnamed: 0,id,creation_date,body,tags,community,full_body,code_body,answers,comments,owner_user_id
0,69006423,2021-09-01 00:00:35.237,<p>Using interceptors for the first time to re...,"[nodeDOTjs, reactjs, express, axios, refreshto...",0,Using interceptors for the first time to refre...,,1,0,14087917.0
1,69006426,2021-09-01 00:00:55.583,"<p>The APP below uses <code>navbarPage</code>,...","[r, shiny]",17,"The APP below uses navbarPage, however I will ...",dashboardPage\nnavbarPage\nlibrary(shiny)\nlib...,1,0,13091928.0
2,69006431,2021-09-01 00:01:23.670,<p>I am developing a java project and every th...,"[java, installation, compilation, project, ope...",3,I am developing a java project and every thing...,,1,2,8110267.0
3,69006437,2021-09-01 00:03:18.750,"<p>As you know, all firebase hosting is provid...","[firebase, nuxtDOTjs, firebasehosting]",5,"As you know, all firebase hosting is provided ...","[\n '@nuxtjs/firebase',\n {\n ...",1,0,2745485.0
4,69006439,2021-09-01 00:03:36.930,<p>I am trying to split the string as below</p...,"[r, dataDOTtable]",17,I am trying to split the string as below\nx <-...,"x <- data.table(VAROLD=c('DBA','ADB'))\nx[, pa...",1,2,12139975.0
...,...,...,...,...,...,...,...,...,...,...
2713807,77019848,2023-08-31 23:54:30.057,<p>I need help solving a CORS issue.</p>\n<p>I...,"[docker, dockercompose, owaspdependencycheck, ...",7,I need help solving a CORS issue.\nI am trying...,- API_BASE_URL=http://XXX.XXX.XX.XX:8081\nAcce...,1,2,1006272.0
2713808,77019849,2023-08-31 23:55:21.660,<p>I am currently attempting to search a Share...,"[microsoftgraphapi, sharepointonline]",1,I am currently attempting to search a SharePoi...,"{\n ""requests"": [\n {\n ...",0,0,15216800.0
2713809,77019852,2023-08-31 23:56:42.653,"<p>Similar to <a href=""https://stackoverflow.c...","[python, numpy, floatingpoint]",17,Similar to Numpy astype rounding to wrong valu...,In [2]: import numpy as np\n\n...\n\nIn [49]: ...,1,2,433202.0
2713810,77019854,2023-08-31 23:57:28.633,<p>I utilized the guidelines presented in <a h...,"[angular, typescript, progressivewebapps, ngsw...",0,I utilized the guidelines presented in this ar...,import {BrowserModule} from '@angular/platform...,0,2,2532775.0


In [14]:
#create daily timeseries data
df_merge['year_month_day'] = pd.to_datetime(df_merge['creation_date']).dt.to_period('D') # daily
df_merge['year_month_day'] = df_merge['year_month_day'].astype(str)

In [17]:
numComments = df_merge.groupby(['year_month_day', 'community'])['comments'].sum().reset_index()
numComments

Unnamed: 0,year_month_day,community,comments
0,2021-09-01,0,2291
1,2021-09-01,1,938
2,2021-09-01,3,841
3,2021-09-01,4,1365
4,2021-09-01,5,569
...,...,...,...
10891,2023-08-31,11,22
10892,2023-08-31,13,39
10893,2023-08-31,14,1
10894,2023-08-31,15,2


### 2) Avg Number of Answers per Question

In [35]:
# use average
AnsPerQ = df_merge.groupby(['year_month_day', 'community'])['answers'].mean().reset_index(name = 'AnsPerQ')
AnsPerQ

Unnamed: 0,year_month_day,community,AnsPerQ
0,2021-09-01,0,1.164056
1,2021-09-01,1,1.032419
2,2021-09-01,3,1.010152
3,2021-09-01,4,1.141304
4,2021-09-01,5,1.190476
...,...,...,...
10891,2023-08-31,11,0.666667
10892,2023-08-31,13,0.413793
10893,2023-08-31,14,0.000000
10894,2023-08-31,15,0.600000


In [36]:
AnsPerQ.describe()

Unnamed: 0,community,AnsPerQ
count,10896.0,10896.0
mean,8.192823,0.906343
std,5.015349,0.24664
min,0.0,0.0
25%,4.0,0.755102
50%,8.0,0.921348
75%,13.0,1.05
max,17.0,4.0


### 3) Avg Number of Comments per Question

In [34]:
ComPerQ = df_merge.groupby(['year_month_day', 'community'])['comments'].mean().reset_index(name = 'CommentsPerQ')
ComPerQ

Unnamed: 0,year_month_day,community,CommentsPerQ
0,2021-09-01,0,1.888706
1,2021-09-01,1,2.339152
2,2021-09-01,3,2.134518
3,2021-09-01,4,3.709239
4,2021-09-01,5,1.505291
...,...,...,...
10891,2023-08-31,11,1.222222
10892,2023-08-31,13,1.344828
10893,2023-08-31,14,0.125000
10894,2023-08-31,15,0.400000


In [37]:
ComPerQ.describe()

Unnamed: 0,community,CommentsPerQ
count,10896.0,10896.0
mean,8.192823,1.6302
std,5.015349,0.761821
min,0.0,0.0
25%,4.0,1.200718
50%,8.0,1.645863
75%,13.0,2.034106
max,17.0,11.0


### 4) Merge all three data

In [40]:
df_final = pd.merge(numComments, AnsPerQ, on = ['year_month_day', 'community'])
df_final = pd.merge(df_final, ComPerQ, on = ['year_month_day', 'community'])
df_final

Unnamed: 0,year_month_day,community,comments,AnsPerQ,CommentsPerQ
0,2021-09-01,0,2291,1.164056,1.888706
1,2021-09-01,1,938,1.032419,2.339152
2,2021-09-01,3,841,1.010152,2.134518
3,2021-09-01,4,1365,1.141304,3.709239
4,2021-09-01,5,569,1.190476,1.505291
...,...,...,...,...,...
10891,2023-08-31,11,22,0.666667,1.222222
10892,2023-08-31,13,39,0.413793,1.344828
10893,2023-08-31,14,1,0.000000,0.125000
10894,2023-08-31,15,2,0.600000,0.400000


In [64]:
df_final_pre4 = pd.read_csv("df_final_pre4.csv")
df_final_pre5 = pd.merge(df_final_pre4, df_final, on = ['year_month_day', 'community'], how = 'left')
df_final_pre5

Unnamed: 0,year_month_day,T_d,P_t,month,community,techiness,entropy,count_q,count_a,ln_q,ln_a,ln_entropy,year_month,numUser,ln_numUser,comments,AnsPerQ,CommentsPerQ
0,2021-09-01,0,0,9,0,0.525040,7.298398,1213.0,1029.0,7.100852,6.936343,1.987655,2021-09,22317,10.013104,2291,1.164056,1.888706
1,2021-09-01,0,0,9,1,0.412791,7.399966,401.0,225.0,5.993961,5.416100,2.001475,2021-09,7820,8.964440,938,1.032419,2.339152
2,2021-09-01,0,0,9,3,0.490224,7.231138,394.0,250.0,5.976351,5.521461,1.978396,2021-09,7714,8.950792,841,1.010152,2.134518
3,2021-09-01,0,0,9,4,0.439161,7.709924,368.0,305.0,5.908083,5.720312,2.042508,2021-09,7720,8.951570,1365,1.141304,3.709239
4,2021-09-01,0,0,9,5,0.461350,7.043218,378.0,258.0,5.934894,5.552960,1.952065,2021-09,7509,8.923858,569,1.190476,1.505291
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10707,2023-08-31,1,1,8,11,0.476285,4.175869,18.0,12.0,2.890372,2.484907,1.429322,2023-08,343,5.837730,22,0.666667,1.222222
10708,2023-08-31,1,1,8,13,0.421145,5.528088,29.0,18.0,3.367296,2.890372,1.709842,2023-08,630,6.445720,39,0.413793,1.344828
10709,2023-08-31,1,1,8,14,0.403302,3.344698,8.0,1.0,2.079442,0.000000,1.207377,2023-08,278,5.627621,1,0.000000,0.125000
10710,2023-08-31,1,1,8,15,0.307775,2.947703,5.0,4.0,1.609438,1.386294,1.081026,2023-08,122,4.804021,2,0.600000,0.400000


### Check 0s in Comments, AnsPerQ, and CommentsPerQ

In [65]:
print("Number of 0s in Comments: ", len(df_final_pre5[df_final_pre5['comments'] == 0]))

Number of 0s in Comments:  214


In [66]:
print("Number of 0s in # Ans per Q: ", len(df_final_pre5[df_final_pre5['AnsPerQ'] == 0]))

Number of 0s in # Ans per Q:  33


In [67]:
print("Number of 0s in # Comments per Q: ", len(df_final_pre5[df_final_pre5['CommentsPerQ'] == 0]))

Number of 0s in # Comments per Q:  214


In [68]:
df_final_pre5.groupby('community')['comments'].apply(lambda x: (x != 0).sum()).reset_index()

Unnamed: 0,community,comments
0,0,730
1,1,730
2,3,730
3,4,730
4,5,730
5,6,730
6,7,730
7,8,730
8,9,730
9,10,444


In [59]:
df_final_pre5.groupby('community')['AnsPerQ'].apply(lambda x: (x != 0).sum()).reset_index()

Unnamed: 0,community,AnsPerQ
0,0,730
1,1,730
2,3,730
3,4,730
4,5,730
5,6,730
6,7,730
7,8,730
8,9,730
9,10,534


In [60]:
df_final_pre5.groupby('community')['CommentsPerQ'].apply(lambda x: (x != 0).sum()).reset_index()

Unnamed: 0,community,CommentsPerQ
0,0,730
1,1,730
2,3,730
3,4,730
4,5,730
5,6,730
6,7,730
7,8,730
8,9,730
9,10,444


In [73]:
# Remove rows with 0 value (10,712->10,474)
df_final_pre5 = df_final_pre5[(df_final_pre5['comments'] != 0)&(df_final_pre5['AnsPerQ'] != 0)&(df_final_pre5['CommentsPerQ'] != 0)]
df_final_pre5

Unnamed: 0,year_month_day,T_d,P_t,month,community,techiness,entropy,count_q,count_a,ln_q,ln_a,ln_entropy,year_month,numUser,ln_numUser,comments,AnsPerQ,CommentsPerQ
0,2021-09-01,0,0,9,0,0.525040,7.298398,1213.0,1029.0,7.100852,6.936343,1.987655,2021-09,22317,10.013104,2291,1.164056,1.888706
1,2021-09-01,0,0,9,1,0.412791,7.399966,401.0,225.0,5.993961,5.416100,2.001475,2021-09,7820,8.964440,938,1.032419,2.339152
2,2021-09-01,0,0,9,3,0.490224,7.231138,394.0,250.0,5.976351,5.521461,1.978396,2021-09,7714,8.950792,841,1.010152,2.134518
3,2021-09-01,0,0,9,4,0.439161,7.709924,368.0,305.0,5.908083,5.720312,2.042508,2021-09,7720,8.951570,1365,1.141304,3.709239
4,2021-09-01,0,0,9,5,0.461350,7.043218,378.0,258.0,5.934894,5.552960,1.952065,2021-09,7509,8.923858,569,1.190476,1.505291
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10705,2023-08-31,1,1,8,9,0.541652,4.639900,31.0,16.0,3.433987,2.772589,1.534693,2023-08,510,6.234411,47,0.612903,1.516129
10707,2023-08-31,1,1,8,11,0.476285,4.175869,18.0,12.0,2.890372,2.484907,1.429322,2023-08,343,5.837730,22,0.666667,1.222222
10708,2023-08-31,1,1,8,13,0.421145,5.528088,29.0,18.0,3.367296,2.890372,1.709842,2023-08,630,6.445720,39,0.413793,1.344828
10710,2023-08-31,1,1,8,15,0.307775,2.947703,5.0,4.0,1.609438,1.386294,1.081026,2023-08,122,4.804021,2,0.600000,0.400000


In [81]:
df_final_pre5['ln_comments'] = np.log(df_final_pre5['comments'])
df_final_pre5['ln_AnsPerQ'] = np.log(df_final_pre5['AnsPerQ'])
df_final_pre5['ln_CommentsPerQ'] = np.log(df_final_pre5['CommentsPerQ'])
df_final_pre5

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final_pre5['ln_comments'] = np.log(df_final_pre5['comments'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final_pre5['ln_AnsPerQ'] = np.log(df_final_pre5['AnsPerQ'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final_pre5['ln_CommentsPerQ'] = np.log(df_final_pre5['CommentsPerQ'])


Unnamed: 0,year_month_day,T_d,P_t,month,community,techiness,entropy,count_q,count_a,ln_q,...,ln_entropy,year_month,numUser,ln_numUser,comments,AnsPerQ,CommentsPerQ,ln_comments,ln_AnsPerQ,ln_CommentsPerQ
0,2021-09-01,0,0,9,0,0.525040,7.298398,1213.0,1029.0,7.100852,...,1.987655,2021-09,22317,10.013104,2291,1.164056,1.888706,7.736744,0.151911,0.635892
1,2021-09-01,0,0,9,1,0.412791,7.399966,401.0,225.0,5.993961,...,2.001475,2021-09,7820,8.964440,938,1.032419,2.339152,6.843750,0.031905,0.849789
2,2021-09-01,0,0,9,3,0.490224,7.231138,394.0,250.0,5.976351,...,1.978396,2021-09,7714,8.950792,841,1.010152,2.134518,6.734592,0.010101,0.758241
3,2021-09-01,0,0,9,4,0.439161,7.709924,368.0,305.0,5.908083,...,2.042508,2021-09,7720,8.951570,1365,1.141304,3.709239,7.218910,0.132172,1.310827
4,2021-09-01,0,0,9,5,0.461350,7.043218,378.0,258.0,5.934894,...,1.952065,2021-09,7509,8.923858,569,1.190476,1.505291,6.343880,0.174353,0.408986
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10705,2023-08-31,1,1,8,9,0.541652,4.639900,31.0,16.0,3.433987,...,1.534693,2023-08,510,6.234411,47,0.612903,1.516129,3.850148,-0.489548,0.416160
10707,2023-08-31,1,1,8,11,0.476285,4.175869,18.0,12.0,2.890372,...,1.429322,2023-08,343,5.837730,22,0.666667,1.222222,3.091042,-0.405465,0.200671
10708,2023-08-31,1,1,8,13,0.421145,5.528088,29.0,18.0,3.367296,...,1.709842,2023-08,630,6.445720,39,0.413793,1.344828,3.663562,-0.882389,0.296266
10710,2023-08-31,1,1,8,15,0.307775,2.947703,5.0,4.0,1.609438,...,1.081026,2023-08,122,4.804021,2,0.600000,0.400000,0.693147,-0.510826,-0.916291


In [85]:
df_final_pre5[['ln_comments', 'ln_AnsPerQ', 'ln_CommentsPerQ']].describe()

Unnamed: 0,ln_comments,ln_AnsPerQ,ln_CommentsPerQ
count,10474.0,10474.0,10474.0
mean,4.964032,-0.127464,0.39977
std,2.007829,0.254953,0.547408
min,0.0,-2.197225,-2.890372
25%,3.433987,-0.276538,0.233452
50%,5.631212,-0.08591,0.510826
75%,6.647364,0.051293,0.716883
max,7.824846,1.386294,2.197225


In [86]:
# Save Data
df_final_pre5.to_csv('df_final_pre5.csv', index=False)