# Post Clustering using Pre-computed tag clusters
- 1) Extract tag information from data (2021-09-01 ~ 2023-08-31)
- 2) Load louvain_community.pickle to get cluster information of tags.
- 3) When a post has multiple tags, assign the post to the cluster that contains the majority of its tags. (Note: the code below currently uses only each post's first tag to look up the cluster.)

In [47]:
# Import Modules
import pandas as pd
import numpy as np
import sqlite3
from nltk import FreqDist
import pickle
import math
import matplotlib.pyplot as plt
import statsmodels.formula.api as sm

# Import Dataset
# Pull id, creation timestamp and raw tag list for every question created
# strictly between the two date strings in the WHERE clause.
conn = sqlite3.connect('/data1/StackOverflow/stackexchange-to-sqlite/stack.db')
query = '''
SELECT id, creation_date, tags
FROM questions
WHERE creation_date > '2021-09-01' AND
creation_date < '2023-09-01';
'''
try:
    df_tags = pd.read_sql_query(query, conn)
finally:
    # Release the database handle even when the query raises.
    conn.close()

In [48]:
df_tags

Unnamed: 0,id,creation_date,tags
0,69006423,2021-09-01 00:00:35.237,"[""node.js"", ""reactjs"", ""express"", ""axios"", ""re..."
1,69006426,2021-09-01 00:00:55.583,"[""r"", ""shiny""]"
2,69006431,2021-09-01 00:01:23.670,"[""java"", ""installation"", ""compilation"", ""proje..."
3,69006437,2021-09-01 00:03:18.750,"[""firebase"", ""nuxt.js"", ""firebase-hosting""]"
4,69006439,2021-09-01 00:03:36.930,"[""r"", ""data.table""]"
...,...,...,...
2715204,77019848,2023-08-31 23:54:30.057,"[""docker"", ""docker-compose"", ""owasp-dependency..."
2715205,77019849,2023-08-31 23:55:21.660,"[""microsoft-graph-api"", ""sharepoint-online""]"
2715206,77019852,2023-08-31 23:56:42.653,"[""python"", ""numpy"", ""floating-point""]"
2715207,77019854,2023-08-31 23:57:28.633,"[""angular"", ""typescript"", ""progressive-web-app..."


In [5]:
import pickle
# Load the pre-computed tag -> community table.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
with open('louvain_community.pickle', 'rb') as fh:
    df_clusters = pickle.load(fh)

In [19]:
df_clusters

Unnamed: 0,tag,community,tag_count
0,nodeDOTjs,0,55502
1,reactjs,0,104992
2,express,0,12108
3,axios,0,5682
4,refreshtoken,1,226
...,...,...,...
44054,irvine16,15,1
44055,prologcoroutining,21,1
44056,aif,6,1
44057,securitystamp,11,1


In [49]:
# Preprocess
def wc(text):
    """
    Normalise a '<tag1><tag2>' style tag string for the word cloud.

    Tag boundaries ('><') become single spaces, hyphens are dropped,
    '.' becomes 'DOT', 'c++' becomes 'Cpp', 'c#' becomes 'Csharp', and any
    remaining angle brackets are removed.  A falsy input (empty string or
    None) yields the literal string 'None'.
    """
    if not text:
        return 'None'

    # Applied strictly in this order: '><' must be handled before the lone
    # '>' / '<' removals, otherwise adjacent tags would be glued together.
    substitutions = (
        ('><', ' '),
        ('-', ''),
        ('.', 'DOT'),
        ('c++', 'Cpp'),
        ('c#', 'Csharp'),
        ('>', ''),
        ('<', ''),
    )
    cleaned = text
    for old, new in substitutions:
        cleaned = cleaned.replace(old, new)
    return cleaned
    
def clean_tags(text):
    """
    Turn a '<tag1><tag2>' style tag string into space-separated tags.

    Only the angle-bracket markup is removed; the tag names themselves are
    left untouched.  A falsy input (empty string or None) yields the literal
    string 'None'.
    """
    if not text:
        return 'None'

    # '><' first so neighbouring tags stay separated by a space.
    return text.replace('><', ' ').replace('>', '').replace('<', '')
    
def tag_freq(data):
    """
    Build a frequency distribution of the normalised tags in data['tags'].

    Square brackets and double quotes are stripped from each entry, the
    result is normalised with wc(), and each entry is then split on ', '
    into individual tags.

    Parameters
    ----------
    data : object indexable by 'tags' whose value supports the pandas
        `.str` accessor (presumably a DataFrame of questions -- confirm
        against callers).

    Returns
    -------
    FreqDist mapping each tag to its number of occurrences.
    """
    # Raw string: '\[' and '\]' are not valid escapes in a normal literal.
    stripped = data['tags'].str.replace(r'[\["\]]', '', regex=True)
    tags = [tag for cleaned in stripped.apply(wc) for tag in cleaned.split(', ')]
    return FreqDist(tags)
# Tags Preprocessing
# Strip the list punctuation ([, ], ") from the raw tag strings.  Raw string
# literal because '\[' / '\]' are not valid escapes in a normal literal.
df_tags['tags'] = df_tags['tags'].str.replace(r'[\["\]]', '', regex=True)

# Literal substitutions, applied in this order.  regex=False is explicit
# because the default of `regex` differs between pandas versions, 'c++' is not
# a valid regular expression, and '.' is a regex metacharacter.
_TAG_REPLACEMENTS = (
    ('c#', 'Csharp'),
    ('c++', 'Cpp'),
    ('.', 'DOT'),
    ('><', ' '),
    ('>', ''),
    ('-', ''),
    ('"', ''),
)
for _old, _new in _TAG_REPLACEMENTS:
    df_tags['tags'] = df_tags['tags'].str.replace(_old, _new, regex=False)

df_tags = df_tags.reset_index(drop = True)
# Commas separate the tags; turn them into whitespace and split into a list.
df_tags['tags'] = df_tags['tags'].str.replace(',', ' ', regex=False)
df_tags['tags'] = df_tags['tags'].str.split()

In [50]:
df_tags

Unnamed: 0,id,creation_date,tags
0,69006423,2021-09-01 00:00:35.237,"[nodeDOTjs, reactjs, express, axios, refreshto..."
1,69006426,2021-09-01 00:00:55.583,"[r, shiny]"
2,69006431,2021-09-01 00:01:23.670,"[java, installation, compilation, project, ope..."
3,69006437,2021-09-01 00:03:18.750,"[firebase, nuxtDOTjs, firebasehosting]"
4,69006439,2021-09-01 00:03:36.930,"[r, dataDOTtable]"
...,...,...,...
2715204,77019848,2023-08-31 23:54:30.057,"[docker, dockercompose, owaspdependencycheck, ..."
2715205,77019849,2023-08-31 23:55:21.660,"[microsoftgraphapi, sharepointonline]"
2715206,77019852,2023-08-31 23:56:42.653,"[python, numpy, floatingpoint]"
2715207,77019854,2023-08-31 23:57:28.633,"[angular, typescript, progressivewebapps, ngsw..."


In [51]:
# Only leave the first element to compare with the cluster information.
# NOTE: this uses the first-listed tag only, not a majority vote over all tags.
# `.str[0]` yields NaN for an empty list or a missing value instead of raising,
# so such posts simply end up without a community after the merge.
df_tags['tag'] = df_tags['tags'].str[0]

In [52]:
df_tags

Unnamed: 0,id,creation_date,tags,tag
0,69006423,2021-09-01 00:00:35.237,"[nodeDOTjs, reactjs, express, axios, refreshto...",nodeDOTjs
1,69006426,2021-09-01 00:00:55.583,"[r, shiny]",r
2,69006431,2021-09-01 00:01:23.670,"[java, installation, compilation, project, ope...",java
3,69006437,2021-09-01 00:03:18.750,"[firebase, nuxtDOTjs, firebasehosting]",firebase
4,69006439,2021-09-01 00:03:36.930,"[r, dataDOTtable]",r
...,...,...,...,...
2715204,77019848,2023-08-31 23:54:30.057,"[docker, dockercompose, owaspdependencycheck, ...",docker
2715205,77019849,2023-08-31 23:55:21.660,"[microsoftgraphapi, sharepointonline]",microsoftgraphapi
2715206,77019852,2023-08-31 23:56:42.653,"[python, numpy, floatingpoint]",python
2715207,77019854,2023-08-31 23:57:28.633,"[angular, typescript, progressivewebapps, ngsw...",angular


In [53]:
# Attach cluster information to every post via its 'tag' column.
# Left join: posts whose tag is absent from df_clusters are kept with NaN.
df_post_cluster = df_tags.merge(df_clusters, how = 'left', on = 'tag')

In [56]:
# Preprocess: remove the helper columns 'tag' and 'tag_count'.
# (The community column is not converted to integer in this cell.)
df_post_cluster = df_post_cluster.drop(columns = ['tag', 'tag_count'])

In [57]:
df_post_cluster

Unnamed: 0,id,creation_date,tags,community
0,69006423,2021-09-01 00:00:35.237,"[nodeDOTjs, reactjs, express, axios, refreshto...",0.0
1,69006426,2021-09-01 00:00:55.583,"[r, shiny]",21.0
2,69006431,2021-09-01 00:01:23.670,"[java, installation, compilation, project, ope...",1.0
3,69006437,2021-09-01 00:03:18.750,"[firebase, nuxtDOTjs, firebasehosting]",4.0
4,69006439,2021-09-01 00:03:36.930,"[r, dataDOTtable]",21.0
...,...,...,...,...
2715204,77019848,2023-08-31 23:54:30.057,"[docker, dockercompose, owaspdependencycheck, ...",6.0
2715205,77019849,2023-08-31 23:55:21.660,"[microsoftgraphapi, sharepointonline]",6.0
2715206,77019852,2023-08-31 23:56:42.653,"[python, numpy, floatingpoint]",21.0
2715207,77019854,2023-08-31 23:57:28.633,"[angular, typescript, progressivewebapps, ngsw...",0.0


### There are 1092 rows that do not belong to a cluster: their first tag is missing from the pre-computed cluster table (e.g. newly introduced tags).

In [58]:
df_post_cluster['community'].isna().sum()

1092

In [59]:
df_post_cluster[df_post_cluster.isna().any(axis=1)]

Unnamed: 0,id,creation_date,tags,community
59,69006596,2021-09-01 00:36:13.333,[gf],
5851,69026138,2021-09-02 07:46:52.303,[klipfolio],
20693,69074699,2021-09-06 12:38:06.833,[wordlift],
24975,69088866,2021-09-07 13:11:59.637,[klipfolio],
26543,69094017,2021-09-07 20:07:29.593,[z39DOT50],
...,...,...,...,...
2712147,77013484,2023-08-31 07:00:18.180,[nuxt3],
2712764,77014769,2023-08-31 09:57:56.440,[golfscript],
2712869,77014979,2023-08-31 10:25:35.550,[googleroutesapi],
2713554,77016381,2023-08-31 13:37:49.923,[vectordotdev],


### Data Preparation for Sub-Communities Analysis
- Features needed from the questions data (id, creation_date, tags, community)
- Features needed from the answers data (ID, creation_date, owner_user_id, body, community)

In [64]:
# 1) Erase posts that are not assigned to a community.
# Restrict the NaN check to 'community' so that rows are removed only for the
# stated reason, not because some unrelated column happens to be missing.
df_post_cluster = df_post_cluster.dropna(subset = ['community']).reset_index(drop = True)
# The left merge introduced NaN, which made the column float; with the NaN
# rows gone it can be cast to int.
df_post_cluster['community'] = df_post_cluster['community'].astype(int)

In [65]:
# Final Version of data.
df_post_cluster

Unnamed: 0,id,creation_date,tags,community
0,69006423,2021-09-01 00:00:35.237,"[nodeDOTjs, reactjs, express, axios, refreshto...",0
1,69006426,2021-09-01 00:00:55.583,"[r, shiny]",21
2,69006431,2021-09-01 00:01:23.670,"[java, installation, compilation, project, ope...",1
3,69006437,2021-09-01 00:03:18.750,"[firebase, nuxtDOTjs, firebasehosting]",4
4,69006439,2021-09-01 00:03:36.930,"[r, dataDOTtable]",21
...,...,...,...,...
2714112,77019848,2023-08-31 23:54:30.057,"[docker, dockercompose, owaspdependencycheck, ...",6
2714113,77019849,2023-08-31 23:55:21.660,"[microsoftgraphapi, sharepointonline]",6
2714114,77019852,2023-08-31 23:56:42.653,"[python, numpy, floatingpoint]",21
2714115,77019854,2023-08-31 23:57:28.633,"[angular, typescript, progressivewebapps, ngsw...",0


In [66]:
# Save data
#with open(file = 'post_cluster_og.pickle', mode = 'wb') as file:
#    pickle.dump(df_post_cluster, file)

In [None]:
import pickle
# Reload the saved post -> community table.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
with open('post_cluster_og.pickle', 'rb') as fh:
    data = pickle.load(fh)

In [None]:
# For answers data

### Data Cleansing

In [None]:
# Erase clusters (communities) that have small volume.