# Post Clustering using Pre-computed tag clusters (Jun 28)
- 1) Extract tag information from data (2021-09-01 ~ 2023-08-31)
- 2) Load `louvain_community_pre.pickle` to get the cluster (community) assigned to each tag.
- 3) When a post has multiple tags, assign the post to the cluster that contains the majority of its tags. (Note: the code below currently uses only the first tag of each post to look up the cluster.)

In [1]:
# Import Modules
import pandas as pd
import numpy as np
import sqlite3
from nltk import FreqDist
import pickle
import math
import matplotlib.pyplot as plt
import statsmodels.formula.api as sm

# Import Dataset
# Load the questions created inside the study window. The timestamps are stored
# as text (e.g. '2021-09-01 00:00:35.237'), so the bounds are compared as strings.
conn = sqlite3.connect('/data1/StackOverflow/stackexchange-to-sqlite/stack.db')
query = '''
SELECT id, creation_date, body, tags
FROM questions
WHERE creation_date > '2021-09-01' AND
creation_date < '2023-09-01';
'''
try:
    df_tags = pd.read_sql_query(query, conn)
finally:
    # Release the SQLite handle even if the query fails.
    conn.close()

In [2]:
# Preview the questions returned by the query (pandas truncates the middle rows).
df_tags

Unnamed: 0,id,creation_date,body,tags
0,69006423,2021-09-01 00:00:35.237,<p>Using interceptors for the first time to re...,"[""node.js"", ""reactjs"", ""express"", ""axios"", ""re..."
1,69006426,2021-09-01 00:00:55.583,"<p>The APP below uses <code>navbarPage</code>,...","[""r"", ""shiny""]"
2,69006431,2021-09-01 00:01:23.670,<p>I am developing a java project and every th...,"[""java"", ""installation"", ""compilation"", ""proje..."
3,69006437,2021-09-01 00:03:18.750,"<p>As you know, all firebase hosting is provid...","[""firebase"", ""nuxt.js"", ""firebase-hosting""]"
4,69006439,2021-09-01 00:03:36.930,<p>I am trying to split the string as below</p...,"[""r"", ""data.table""]"
...,...,...,...,...
2715204,77019848,2023-08-31 23:54:30.057,<p>I need help solving a CORS issue.</p>\n<p>I...,"[""docker"", ""docker-compose"", ""owasp-dependency..."
2715205,77019849,2023-08-31 23:55:21.660,<p>I am currently attempting to search a Share...,"[""microsoft-graph-api"", ""sharepoint-online""]"
2715206,77019852,2023-08-31 23:56:42.653,"<p>Similar to <a href=""https://stackoverflow.c...","[""python"", ""numpy"", ""floating-point""]"
2715207,77019854,2023-08-31 23:57:28.633,<p>I utilized the guidelines presented in <a h...,"[""angular"", ""typescript"", ""progressive-web-app..."


In [3]:
# Load the pre-computed tag -> community table (columns: tag, community, tag_count).
# `pickle` is already imported in the first cell.
# NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
with open('louvain_community_pre.pickle', mode = 'rb') as cluster_file:
    df_clusters = pickle.load(cluster_file)

In [4]:
# Preview the tag -> community lookup table loaded from the pickle.
df_clusters

Unnamed: 0,tag,community,tag_count
0,nodeDOTjs,0,82196
1,reactjs,0,155462
2,express,0,18287
3,axios,0,8260
4,refreshtoken,1,369
...,...,...,...
42921,mavenindexer,3,1
42922,irvine16,4,1
42923,aif,1,2
42924,securitystamp,1,2


In [5]:
# Preprocess
def wc(text):
    """
    Cleaning function to be used with our first wordcloud.

    Normalizes a raw tag string: '><' separators become spaces, '-' is
    removed, '.' becomes 'DOT', 'c++' becomes 'Cpp', 'c#' becomes 'Csharp',
    and any remaining angle brackets are stripped.

    Parameters
    ----------
    text : str or None
        Raw tag string. Empty, None, or non-string values (e.g. the float
        NaN pandas uses for missing data) are treated as "no tags".

    Returns
    -------
    str
        The cleaned tag string, or the literal string 'None' when there is
        nothing to clean.
    """
    # NaN is truthy, so a plain `if text:` would let it through and then fail
    # on `.replace`; check the type explicitly.
    if not isinstance(text, str) or not text:
        return 'None'

    tags = text.replace('><',' ')
    tags = tags.replace('-','')
    tags = tags.replace('.','DOT')
    tags = tags.replace('c++','Cpp')
    tags = tags.replace('c#','Csharp')
    tags = tags.replace('>','')
    return tags.replace('<','')
    
def clean_tags(text):
    """
    Strip the angle-bracket markup from a raw tag string.

    '<a><b>' becomes 'a b'. A falsy input (empty string, None) yields the
    literal string 'None'.
    """
    if not text:
        return 'None'

    # Separator first, then any leftover opening/closing brackets.
    return text.replace('><',' ').replace('>','').replace('<','')
    
def tag_freq(data):
    """
    Count how often each cleaned tag occurs in `data['tags']`.

    The brackets and double quotes are stripped from each entry, the entry
    is normalized with `wc`, and the result is split on ', ' into individual
    tags.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a string column named 'tags'.

    Returns
    -------
    nltk.FreqDist
        Frequency distribution over all individual tags.
    """
    # Raw string: '\[' and '\]' are regex escapes, not Python string escapes.
    tags = data['tags'].str.replace(r'[\["\]]', '', regex=True)
    tags = [tag for cleaned in tags.apply(wc) for tag in cleaned.split(', ')]
    return FreqDist(tags)
# Tags Preprocessing
# Turn the JSON-style tag string (e.g. '["node.js", "c++"]') into a list of
# normalized tag names that match the spelling used in df_clusters
# (e.g. 'nodeDOTjs', 'Cpp').
# Strip the brackets and double quotes; this is the only real regex, hence the raw string.
tags_clean = df_tags['tags'].str.replace(r'[\["\]]', '', regex=True)

# The remaining substitutions are literal. `regex=False` is spelled out because
# the default differs between pandas versions, and 'c++' / '.' mean something
# else entirely when interpreted as regular expressions.
literal_replacements = [
    ('c#', 'Csharp'),
    ('c++', 'Cpp'),
    ('.', 'DOT'),
    ('><', ' '),
    ('>', ''),
    ('<', ''),   # kept in step with wc(), which also strips '<'
    ('-', ''),
]
for old, new in literal_replacements:
    tags_clean = tags_clean.str.replace(old, new, regex=False)
df_tags['tags'] = tags_clean

df_tags = df_tags.reset_index(drop = True)
# Commas separate the tags; turn them into whitespace and split into a list.
df_tags['tags'] = df_tags['tags'].str.replace(',', ' ', regex=False).str.split()

In [6]:
# Preview: `tags` now holds a list of normalized tag names per question.
df_tags

Unnamed: 0,id,creation_date,body,tags
0,69006423,2021-09-01 00:00:35.237,<p>Using interceptors for the first time to re...,"[nodeDOTjs, reactjs, express, axios, refreshto..."
1,69006426,2021-09-01 00:00:55.583,"<p>The APP below uses <code>navbarPage</code>,...","[r, shiny]"
2,69006431,2021-09-01 00:01:23.670,<p>I am developing a java project and every th...,"[java, installation, compilation, project, ope..."
3,69006437,2021-09-01 00:03:18.750,"<p>As you know, all firebase hosting is provid...","[firebase, nuxtDOTjs, firebasehosting]"
4,69006439,2021-09-01 00:03:36.930,<p>I am trying to split the string as below</p...,"[r, dataDOTtable]"
...,...,...,...,...
2715204,77019848,2023-08-31 23:54:30.057,<p>I need help solving a CORS issue.</p>\n<p>I...,"[docker, dockercompose, owaspdependencycheck, ..."
2715205,77019849,2023-08-31 23:55:21.660,<p>I am currently attempting to search a Share...,"[microsoftgraphapi, sharepointonline]"
2715206,77019852,2023-08-31 23:56:42.653,"<p>Similar to <a href=""https://stackoverflow.c...","[python, numpy, floatingpoint]"
2715207,77019854,2023-08-31 23:57:28.633,<p>I utilized the guidelines presented in <a h...,"[angular, typescript, progressivewebapps, ngsw..."


In [7]:
# Only leave the first element to compare with the cluster information.
# `.str[0]` takes the first item of each list and yields NaN for an empty or
# missing list instead of raising, so such questions simply stay unmatched.
# NOTE(review): the notebook intro describes a majority vote over all tags;
# this step uses the first tag only - confirm which is intended.
df_tags['tag'] = df_tags['tags'].str[0]

In [8]:
# Preview: the new `tag` column holds the first tag of each question.
df_tags

Unnamed: 0,id,creation_date,body,tags,tag
0,69006423,2021-09-01 00:00:35.237,<p>Using interceptors for the first time to re...,"[nodeDOTjs, reactjs, express, axios, refreshto...",nodeDOTjs
1,69006426,2021-09-01 00:00:55.583,"<p>The APP below uses <code>navbarPage</code>,...","[r, shiny]",r
2,69006431,2021-09-01 00:01:23.670,<p>I am developing a java project and every th...,"[java, installation, compilation, project, ope...",java
3,69006437,2021-09-01 00:03:18.750,"<p>As you know, all firebase hosting is provid...","[firebase, nuxtDOTjs, firebasehosting]",firebase
4,69006439,2021-09-01 00:03:36.930,<p>I am trying to split the string as below</p...,"[r, dataDOTtable]",r
...,...,...,...,...,...
2715204,77019848,2023-08-31 23:54:30.057,<p>I need help solving a CORS issue.</p>\n<p>I...,"[docker, dockercompose, owaspdependencycheck, ...",docker
2715205,77019849,2023-08-31 23:55:21.660,<p>I am currently attempting to search a Share...,"[microsoftgraphapi, sharepointonline]",microsoftgraphapi
2715206,77019852,2023-08-31 23:56:42.653,"<p>Similar to <a href=""https://stackoverflow.c...","[python, numpy, floatingpoint]",python
2715207,77019854,2023-08-31 23:57:28.633,<p>I utilized the guidelines presented in <a h...,"[angular, typescript, progressivewebapps, ngsw...",angular


In [9]:
# Look up the community of each question's first tag.
# Left join: questions whose tag is absent from df_clusters are kept, with NaN.
df_post_cluster = df_tags.merge(df_clusters, how = 'left', on = 'tag')

In [10]:
# Drop the lookup helper columns (tag, tag_count); only `community` is kept.
# The integer cast of `community` happens later, once the NaN rows are removed.
df_post_cluster = df_post_cluster.drop(columns = ['tag', 'tag_count'])

In [11]:
# Preview: `community` is float here because unmatched questions carry NaN.
df_post_cluster

Unnamed: 0,id,creation_date,body,tags,community
0,69006423,2021-09-01 00:00:35.237,<p>Using interceptors for the first time to re...,"[nodeDOTjs, reactjs, express, axios, refreshto...",0.0
1,69006426,2021-09-01 00:00:55.583,"<p>The APP below uses <code>navbarPage</code>,...","[r, shiny]",17.0
2,69006431,2021-09-01 00:01:23.670,<p>I am developing a java project and every th...,"[java, installation, compilation, project, ope...",3.0
3,69006437,2021-09-01 00:03:18.750,"<p>As you know, all firebase hosting is provid...","[firebase, nuxtDOTjs, firebasehosting]",5.0
4,69006439,2021-09-01 00:03:36.930,<p>I am trying to split the string as below</p...,"[r, dataDOTtable]",17.0
...,...,...,...,...,...
2715204,77019848,2023-08-31 23:54:30.057,<p>I need help solving a CORS issue.</p>\n<p>I...,"[docker, dockercompose, owaspdependencycheck, ...",7.0
2715205,77019849,2023-08-31 23:55:21.660,<p>I am currently attempting to search a Share...,"[microsoftgraphapi, sharepointonline]",1.0
2715206,77019852,2023-08-31 23:56:42.653,"<p>Similar to <a href=""https://stackoverflow.c...","[python, numpy, floatingpoint]",17.0
2715207,77019854,2023-08-31 23:57:28.633,<p>I utilized the guidelines presented in <a h...,"[angular, typescript, progressivewebapps, ngsw...",0.0


### 1,397 questions have no cluster: their first tag is missing from the pre-computed tag clusters (e.g., newly introduced tags).

In [12]:
# Number of questions whose first tag found no match in df_clusters (NaN after the left merge).
df_post_cluster['community'].isna().sum()

1397

In [13]:
# Show the rows that contain a NaN in any column.
# NOTE(review): this checks every column, not just `community` - presumably
# only `community` is ever missing here; verify.
df_post_cluster[df_post_cluster.isna().any(axis=1)]

Unnamed: 0,id,creation_date,body,tags,community
59,69006596,2021-09-01 00:36:13.333,<p>Let's say I want to write a code that greet...,[gf],
5851,69026138,2021-09-02 07:46:52.303,<p>I have a JSON formatted input in the below ...,[klipfolio],
6600,69028550,2021-09-02 10:30:33.247,<p>How can I change my code M2DOC code for exp...,[m2doc],
15636,69057679,2021-09-04 17:44:02.773,<p>I'm going through the fstar tutorial using ...,"[fstar, fstarmode]",
20693,69074699,2021-09-06 12:38:06.833,<p>Can we remove the tracking urls parameters ...,[wordlift],
...,...,...,...,...,...
2712147,77013484,2023-08-31 07:00:18.180,<p>I am trying to render component using vites...,[nuxt3],
2712764,77014769,2023-08-31 09:57:56.440,<p>One in golscript can view the stack easily ...,[golfscript],
2712869,77014979,2023-08-31 10:25:35.550,"<p>According to this page <a href=""https://dev...",[googleroutesapi],
2713554,77016381,2023-08-31 13:37:49.923,<p>I'm using vector.dev to send logs to Parsea...,[vectordotdev],


### Data Preparation for Sub-Communities Analysis
- Features needed from the questions data (ID, CreationDate, tags, community)
- Features needed from the answers data (ID, creation_date, owner_user_id, body, community)

In [14]:
# 1) Remove questions that were not assigned to a community.
# `subset` limits the check to `community`, so a NaN in another column
# (e.g. body) does not silently drop a question.
df_post_cluster = df_post_cluster.dropna(subset = ['community']).reset_index(drop = True)
# With the NaN rows gone, the community id can be stored as an integer.
df_post_cluster['community'] = df_post_cluster['community'].astype(int)

In [15]:
# Final version of the questions data: one row per question with an integer community id.
df_post_cluster

Unnamed: 0,id,creation_date,body,tags,community
0,69006423,2021-09-01 00:00:35.237,<p>Using interceptors for the first time to re...,"[nodeDOTjs, reactjs, express, axios, refreshto...",0
1,69006426,2021-09-01 00:00:55.583,"<p>The APP below uses <code>navbarPage</code>,...","[r, shiny]",17
2,69006431,2021-09-01 00:01:23.670,<p>I am developing a java project and every th...,"[java, installation, compilation, project, ope...",3
3,69006437,2021-09-01 00:03:18.750,"<p>As you know, all firebase hosting is provid...","[firebase, nuxtDOTjs, firebasehosting]",5
4,69006439,2021-09-01 00:03:36.930,<p>I am trying to split the string as below</p...,"[r, dataDOTtable]",17
...,...,...,...,...,...
2713807,77019848,2023-08-31 23:54:30.057,<p>I need help solving a CORS issue.</p>\n<p>I...,"[docker, dockercompose, owaspdependencycheck, ...",7
2713808,77019849,2023-08-31 23:55:21.660,<p>I am currently attempting to search a Share...,"[microsoftgraphapi, sharepointonline]",1
2713809,77019852,2023-08-31 23:56:42.653,"<p>Similar to <a href=""https://stackoverflow.c...","[python, numpy, floatingpoint]",17
2713810,77019854,2023-08-31 23:57:28.633,<p>I utilized the guidelines presented in <a h...,"[angular, typescript, progressivewebapps, ngsw...",0


In [16]:
# Save data (currently disabled; uncomment the two lines below to write
# df_post_cluster to 'post_cluster_pre.pickle').
#with open(file = 'post_cluster_pre.pickle', mode = 'wb') as file:
#    pickle.dump(df_post_cluster, file)

In [None]:
# Reload the saved questions/community table. 'post_cluster_pre.pickle' must
# already exist on disk. `pickle` is already imported in the first cell.
# NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
with open('post_cluster_pre.pickle', mode = 'rb') as saved_file:
    data = pickle.load(saved_file)

### For answers data

In [17]:
# Import Answers Dataset
# Load the answers created inside the same window as the questions;
# `parent_id` is the id of the question an answer belongs to.
conn = sqlite3.connect('/data1/StackOverflow/stackexchange-to-sqlite/stack.db')
query = '''
SELECT id, creation_date, parent_id, body
FROM answers
WHERE creation_date > '2021-09-01' AND
creation_date < '2023-09-01';
'''
try:
    df_ans = pd.read_sql_query(query, conn)
finally:
    # Release the SQLite handle even if the query fails.
    conn.close()

In [18]:
# Preview the answers returned by the query.
df_ans

Unnamed: 0,id,creation_date,parent_id,body
0,69006420,2021-09-01 00:00:18.070,69006229,"<p>The short approach, remove python3.9 from y..."
1,69006421,2021-09-01 00:00:28.823,68746577,"<p>open the terminal, run as administrator.</p..."
2,69006422,2021-09-01 00:00:35.537,69006293,"<p>Conceptually, <code>Base</code> has no reas..."
3,69006429,2021-09-01 00:01:09.140,68997666,<p><code>calldata</code> is a special data loc...
4,69006430,2021-09-01 00:01:17.920,69006320,<p>You could do something like this if the str...
...,...,...,...,...
3124928,77019850,2023-08-31 23:56:07.187,77013377,<p>Make an inner solution routine that accepts...
3124929,77019851,2023-08-31 23:56:19.123,77019825,<p>When you pass <code>stack[100]</code> as an...
3124930,77019853,2023-08-31 23:56:47.647,76922631,"<p>I had the same error, I fixed it by followi..."
3124931,77019855,2023-08-31 23:57:36.797,77019829,"<p>In your Razor component, you can inject the..."


In [19]:
# Give every answer the community of its parent question.
# Only the question id and community are joined, so the question bodies and
# tag lists are not copied and no _x/_y column suffixes need cleaning up.
question_communities = (
    df_post_cluster[['id', 'community']]
    .rename(columns = {'id': 'parent_id'})
)
# Left join: answers whose parent question is not in df_post_cluster get NaN.
df_ans_cluster = df_ans.merge(question_communities, how = 'left', on = 'parent_id')
df_ans_cluster

Unnamed: 0,id,creation_date,parent_id,body,community
0,69006420,2021-09-01 00:00:18.070,69006229,"<p>The short approach, remove python3.9 from y...",
1,69006421,2021-09-01 00:00:28.823,68746577,"<p>open the terminal, run as administrator.</p...",
2,69006422,2021-09-01 00:00:35.537,69006293,"<p>Conceptually, <code>Base</code> has no reas...",
3,69006429,2021-09-01 00:01:09.140,68997666,<p><code>calldata</code> is a special data loc...,
4,69006430,2021-09-01 00:01:17.920,69006320,<p>You could do something like this if the str...,
...,...,...,...,...,...
3124928,77019850,2023-08-31 23:56:07.187,77013377,<p>Make an inner solution routine that accepts...,17.0
3124929,77019851,2023-08-31 23:56:19.123,77019825,<p>When you pass <code>stack[100]</code> as an...,17.0
3124930,77019853,2023-08-31 23:56:47.647,76922631,"<p>I had the same error, I fixed it by followi...",5.0
3124931,77019855,2023-08-31 23:57:36.797,77019829,"<p>In your Razor component, you can inject the...",1.0


In [20]:
# Number of answers whose parent question was not found in df_post_cluster (NaN after the left merge).
df_ans_cluster['community'].isna().sum()

591788

In [21]:
# 1) Remove answers whose parent question has no community.
# `subset` limits the check to `community`, so a NaN in another column
# (e.g. body) does not silently drop an answer.
df_ans_cluster = df_ans_cluster.dropna(subset = ['community']).reset_index(drop = True)
# With the NaN rows gone, the community id can be stored as an integer.
df_ans_cluster['community'] = df_ans_cluster['community'].astype(int)

In [22]:
# Final version of the answers data: one row per answer with an integer community id.
df_ans_cluster

Unnamed: 0,id,creation_date,parent_id,body,community
0,69006468,2021-09-01 00:08:25.013,69006439,"<p>Then, we may need some substring shifting w...",17
1,69006492,2021-09-01 00:12:51.630,69006464,<p>You need to convert your YEAR column to str...,17
2,69006496,2021-09-01 00:13:55.123,69006431,<p>This error is most likely caused by a wrong...,3
3,69006533,2021-09-01 00:22:59.640,69006524,<p>You can do this using flags:</p>\n<pre><cod...,6
4,69006546,2021-09-01 00:25:42.577,69006505,<p>if you are using frontend framework either...,0
...,...,...,...,...,...
2533140,77019850,2023-08-31 23:56:07.187,77013377,<p>Make an inner solution routine that accepts...,17
2533141,77019851,2023-08-31 23:56:19.123,77019825,<p>When you pass <code>stack[100]</code> as an...,17
2533142,77019853,2023-08-31 23:56:47.647,76922631,"<p>I had the same error, I fixed it by followi...",5
2533143,77019855,2023-08-31 23:57:36.797,77019829,"<p>In your Razor component, you can inject the...",1


In [23]:
# Save answers data (currently disabled; uncomment the two lines below to
# write df_ans_cluster to 'ans_cluster_pre.pickle').
#with open(file = 'ans_cluster_pre.pickle', mode = 'wb') as file:
#    pickle.dump(df_ans_cluster, file)

In [None]:
# Reload the saved answers/community table. 'ans_cluster_pre.pickle' must
# already exist on disk. `pickle` is already imported in the first cell.
# NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
# NOTE(review): `data` is also the name used when reloading the questions pickle.
with open('ans_cluster_pre.pickle', mode = 'rb') as saved_file:
    data = pickle.load(saved_file)

In [25]:
# Look at the raw HTML body of the first answer (row label 0).
df_ans_cluster.loc[0, 'body']

"<p>Then, we may need some substring shifting when 'D' occurs as the first character</p>\n<pre><code>x[, paste0('VARNEW_', 1:2) := tstrsplit(sub(&quot;^(D)(.*)&quot;, &quot;\\\\2\\\\1&quot;, \n      VAROLD), &quot;D&quot;, fixed = TRUE)][]\n</code></pre>\n<p>-output</p>\n<pre><code>  VAROLD VARNEW_1 VARNEW_2\n1:    DBA       BA     &lt;NA&gt;\n2:    ADB        A        B\n</code></pre>\n<hr />\n<p>Or if it is only the initial 'D', then remove that before doing the split</p>\n<pre><code>x[, paste0('VARNEW_', 1:2) := tstrsplit(sub(&quot;^D+&quot;, &quot;&quot;, \n       VAROLD), &quot;D&quot;, fixed = TRUE)][]\n   VAROLD VARNEW_1 VARNEW_2\n1:    DBA       BA     &lt;NA&gt;\n2:    ADB        A        B\n</code></pre>\n"