In [None]:
# 1) Assign each user into a cluster.
# - use majoirty of tag usage.
# - use the same cluster for analysis. (assumes user's cluster does not change over time.)
# 2) Iterate each cluster and compute LSM.
# - Filter ith cluster.
# - Compute LSM for each user in the cluster.
    # - owner_user_id, year_month_day, body, lsm_score, community
# - Aggregate average LSM for the cluster.
# 3) Make into a single dataframe.
# - Merge to Techiness/df_final_pre6.csv.

# LSM Calculation for each Community

### 1) Assign each user into a cluster

In [1]:
# 1-2) Import Modules
import pandas as pd
import numpy as np
import sqlite3
import pickle
import os
import matplotlib.pyplot as plt
import statsmodels.formula.api as sm
# Import Dataset
conn = sqlite3.connect('/data1/StackOverflow/stackexchange-to-sqlite/stack.db')
query = '''
SELECT id, creation_date, owner_user_id, body, parent_id
FROM answers
WHERE creation_date > '2021-09-01'
AND creation_date < '2023-09-01';
'''
df = pd.read_sql_query(query, conn)
conn.close()

In [2]:
# erase NAs
df = df.dropna(subset=['owner_user_id'])
# Convert types
df['owner_user_id'] = df['owner_user_id'].astype('int').astype('str')
df['parent_id'] = df['parent_id'].astype(int).astype(str)
# Add year_month_day variable
df['creation_date'] = pd.to_datetime(df['creation_date'])
df['year_month'] = df['creation_date'].dt.to_period('D')

In [4]:
# 1-2) Get Questions' tag data
# Import Questions data
conn = sqlite3.connect('/data1/StackOverflow/stackexchange-to-sqlite/stack.db')
query = '''
SELECT id, tags
FROM questions
WHERE creation_date > '2020-09-01'
AND creation_date < '2023-09-01';
'''
df_tags = pd.read_sql_query(query, conn)
conn.close()

In [10]:
# 1-3) Merge two data
df_tags['id'] = df_tags['id'].astype(str)
df_merge = pd.merge(df, df_tags, left_on = 'parent_id', right_on = 'id', how = 'left')
df_merge.drop('id_y', axis = 1, inplace = True)
df_merge = df_merge.rename(columns = {'id_x': 'id'})
df_merge

Unnamed: 0,id,creation_date,owner_user_id,body,parent_id,year_month,tags
0,69006420,2021-09-01 00:00:18.070,7332046,"<p>The short approach, remove python3.9 from y...",69006229,2021-09-01,"[""python"", ""pip""]"
1,69006421,2021-09-01 00:00:28.823,14471093,"<p>open the terminal, run as administrator.</p...",68746577,2021-09-01,"["".net"", ""maui"", ""visual-studio-2022""]"
2,69006422,2021-09-01 00:00:35.537,523612,"<p>Conceptually, <code>Base</code> has no reas...",69006293,2021-09-01,"[""python""]"
3,69006429,2021-09-01 00:01:09.140,11269158,<p><code>calldata</code> is a special data loc...,68997666,2021-09-01,"[""solidity""]"
4,69006430,2021-09-01 00:01:17.920,2096113,<p>You could do something like this if the str...,69006320,2021-09-01,"[""php"", ""arrays""]"
...,...,...,...,...,...,...,...
3102831,77019850,2023-08-31 23:56:07.187,313768,<p>Make an inner solution routine that accepts...,77013377,2023-08-31,"[""python"", ""optimization"", ""pulp""]"
3102832,77019851,2023-08-31 23:56:19.123,13296852,<p>When you pass <code>stack[100]</code> as an...,77019825,2023-08-31,"[""arrays"", ""c"", ""data-structures"", ""stack""]"
3102833,77019853,2023-08-31 23:56:47.647,5103949,"<p>I had the same error, I fixed it by followi...",76922631,2023-08-31,"[""android"", ""android-studio""]"
3102834,77019855,2023-08-31 23:57:36.797,16844882,"<p>In your Razor component, you can inject the...",77019829,2023-08-31,"[""blazor"", ""blazor-webassembly""]"


In [11]:
# 1-4) Aggregate tags of each user and assign to majority.
df_merge['tags'] = df_merge['tags'].str.replace('[\["\]]', '', regex=True)
df_merge['tags'] = df_merge['tags'].str.replace('c#','Csharp')
df_merge['tags'] = df_merge['tags'].str.replace('c++','Cpp')
df_merge['tags'] = df_merge['tags'].str.replace('.','DOT')
df_merge['tags'] = df_merge['tags'].str.replace('><',' ')
df_merge['tags'] = df_merge['tags'].str.replace('>','')
df_merge['tags'] = df_merge['tags'].str.replace('-','')
df_merge['tags'] = df_merge['tags'].str.replace('"', '')

new_merge = df_merge.groupby('owner_user_id')[['tags']].agg(lambda x: ', '.join(map(str,x))).reset_index()
new_merge['majority_tag'] = new_merge['tags'].apply(lambda x: x.split(', ')[0] if len(set(x.split(', '))) == 1 else max(set(x.split(', ')), key = x.split(', ').count))
new_merge

Unnamed: 0,owner_user_id,tags,majority_tag
0,100,"linux, ssh, yocto",linux
1,10000051,"sequelizeDOTjs, javascript, sequelizeDOTjs, no...",sequelizeDOTjs
2,10000066,,
3,10000068,"flutter, dart, bloc, flutterbloc",dart
4,1000008,"nan, nan, nan, nan, nan, nan, nan, nan, nan, n...",
...,...,...,...
718754,9999861,"Csharp, emgucv",Csharp
718755,9999898,"go, privatekey, glidegolang, pkcs",pkcs
718756,9999957,"python, visualstudiocode, package, vscodedebugger",visualstudiocode
718757,9999964,"hcl, nomad",hcl


In [12]:
# 1-5) Get pre-computed cluster info and attach
import pickle
with open(file = '/data1/StackOverflow/_Robustness/TagCluster/louvain_community_pre.pickle', mode = 'rb') as file:
    df_clusters = pickle.load(file)
df_clusters

Unnamed: 0,tag,community,tag_count
0,nodeDOTjs,0,82196
1,reactjs,0,155462
2,express,0,18287
3,axios,0,8260
4,refreshtoken,1,369
...,...,...,...
42921,mavenindexer,3,1
42922,irvine16,4,1
42923,aif,1,2
42924,securitystamp,1,2


In [36]:
# merge with df_clusters
df_user_cluster = pd.merge(new_merge, df_clusters, left_on = 'majority_tag', right_on = 'tag', how = 'left')
df_user_cluster = df_user_cluster.drop(['tag', 'tag_count'], axis = 1)

# erase 1462 tags that were not assigned to a specific community. (like langchain)
df_user_cluster = df_user_cluster[~df_user_cluster['community'].isna()]
df_user_cluster['community'] = df_user_cluster['community'].astype(int)
df_user_cluster

Unnamed: 0,owner_user_id,tags,majority_tag,community
0,100,"linux, ssh, yocto",linux,4
1,10000051,"sequelizeDOTjs, javascript, sequelizeDOTjs, no...",sequelizeDOTjs,0
2,10000066,,,17
3,10000068,"flutter, dart, bloc, flutterbloc",dart,5
4,1000008,"nan, nan, nan, nan, nan, nan, nan, nan, nan, n...",,17
...,...,...,...,...
718754,9999861,"Csharp, emgucv",Csharp,1
718755,9999898,"go, privatekey, glidegolang, pkcs",pkcs,7
718756,9999957,"python, visualstudiocode, package, vscodedebugger",visualstudiocode,4
718757,9999964,"hcl, nomad",hcl,7


In [56]:
df_user_cluster.groupby('community').size() # we are not using cluster 2, 12, 16.

community
0     140906
1      48528
2          2
3      36331
4      39591
5      53555
6      42843
7      33927
8       7755
9       4191
10       583
11      2859
12         2
13      4238
14      3630
15       739
16        23
17    297594
dtype: int64

In [51]:
df_user_cluster

Unnamed: 0,owner_user_id,tags,majority_tag,community
0,100,"linux, ssh, yocto",linux,4
1,10000051,"sequelizeDOTjs, javascript, sequelizeDOTjs, no...",sequelizeDOTjs,0
2,10000066,,,17
3,10000068,"flutter, dart, bloc, flutterbloc",dart,5
4,1000008,"nan, nan, nan, nan, nan, nan, nan, nan, nan, n...",,17
...,...,...,...,...
718754,9999861,"Csharp, emgucv",Csharp,1
718755,9999898,"go, privatekey, glidegolang, pkcs",pkcs,7
718756,9999957,"python, visualstudiocode, package, vscodedebugger",visualstudiocode,4
718757,9999964,"hcl, nomad",hcl,7


In [61]:
df_user_cluster = df_user_cluster[(df_user_cluster['community'] != 2) & 
                (df_user_cluster['community'] != 12) & 
                (df_user_cluster['community'] != 16)]

In [63]:
df_user_cluster.to_csv("df_user_cluster.csv")

### 2) Iterate each cluster and compute LSM.

In [126]:
# Import Modules
import sqlite3
import pickle
import os
# Import Dataset
conn = sqlite3.connect('/data1/StackOverflow/stackexchange-to-sqlite/stack.db')
query = '''
SELECT creation_date, owner_user_id, body
FROM answers
WHERE creation_date >= '2021-09-01' AND creation_date < '2023-09-01';
'''
df = pd.read_sql_query(query, conn)
conn.close()

# erase NAs
df = df.dropna(subset=['owner_user_id'])
df['owner_user_id'] = df['owner_user_id'].astype('int').astype('str')
# Add year_month_day variable
df['creation_date'] = pd.to_datetime(df['creation_date'])
df['year_month'] = df['creation_date'].dt.to_period('D')

In [134]:
df_user_cluster = pd.read_csv("df_user_cluster.csv")
df_user_cluster

Unnamed: 0.1,Unnamed: 0,owner_user_id,tags,majority_tag,community
0,0,100,"linux, ssh, yocto",linux,4
1,1,10000051,"sequelizeDOTjs, javascript, sequelizeDOTjs, no...",sequelizeDOTjs,0
2,2,10000066,,,17
3,3,10000068,"flutter, dart, bloc, flutterbloc",dart,5
4,4,1000008,"nan, nan, nan, nan, nan, nan, nan, nan, nan, n...",,17
...,...,...,...,...,...
717265,718754,9999861,"Csharp, emgucv",Csharp,1
717266,718755,9999898,"go, privatekey, glidegolang, pkcs",pkcs,7
717267,718756,9999957,"python, visualstudiocode, package, vscodedebugger",visualstudiocode,4
717268,718757,9999964,"hcl, nomad",hcl,7


In [128]:
# Aggregate Daily posts by users.
df = df.groupby(['owner_user_id','year_month'])['body'].agg(lambda x: '\n'.join(x)).reset_index()


In [144]:
# Merge with community values
df_user_cluster['owner_user_id'] = df_user_cluster['owner_user_id'].astype(str)
df_merge = pd.merge(df, df_user_cluster[['owner_user_id','community']], on = 'owner_user_id', how = 'left')
df_merge

Unnamed: 0,owner_user_id,year_month,body,community
0,100,2022-02-03,<p>By default Yocto configures SSH servers to ...,4.0
1,10000051,2022-06-24,<p>Try it</p>\n<pre><code>const { literal } = ...,0.0
2,10000051,2022-07-13,<p>Try it</p>\n<pre><code>Product.hasOne(Produ...,0.0
3,10000051,2022-07-14,<p>Try it.</p>\n<pre><code>const { literal } =...,0.0
4,10000066,2022-08-24,<p>If anyone looking to get the region in Node...,17.0
...,...,...,...,...
2240706,9999964,2023-05-11,"<p>One solution would be to use <a href=""https...",7.0
2240707,9999993,2022-10-11,<p><code>{{ __('Dashboard') }}</code> is used ...,0.0
2240708,9999993,2022-11-09,"<p>I had a similar issue, I changed the Method...",0.0
2240709,9999993,2022-11-14,<p>I got the solution with suggestion in the c...,0.0


In [146]:
# erase na values araised while merge
df_merge = df_merge[~df_merge['community'].isna()]
df_merge['community'] = df_merge['community'].astype(int)
df_merge

Unnamed: 0,owner_user_id,year_month,body,community
0,100,2022-02-03,<p>By default Yocto configures SSH servers to ...,4
1,10000051,2022-06-24,<p>Try it</p>\n<pre><code>const { literal } = ...,0
2,10000051,2022-07-13,<p>Try it</p>\n<pre><code>Product.hasOne(Produ...,0
3,10000051,2022-07-14,<p>Try it.</p>\n<pre><code>const { literal } =...,0
4,10000066,2022-08-24,<p>If anyone looking to get the region in Node...,17
...,...,...,...,...
2240706,9999964,2023-05-11,"<p>One solution would be to use <a href=""https...",7
2240707,9999993,2022-10-11,<p><code>{{ __('Dashboard') }}</code> is used ...,0
2240708,9999993,2022-11-09,"<p>I had a similar issue, I changed the Method...",0
2240709,9999993,2022-11-14,<p>I got the solution with suggestion in the c...,0


In [147]:
df_merge = df_merge.sort_values(by = 'year_month')
df_merge

Unnamed: 0,owner_user_id,year_month,body,community
1777899,5961782,2021-09-01,<p>Try the below code maybe to help you. It's ...,6
1601766,4675277,2021-09-01,<p>I was also facing the same issue.</p>\n<p>A...,17
117697,11014221,2021-09-01,<pre><code> &lt;table class=&quot;post-...,9
1374659,3027266,2021-09-01,"<p>Try the <a href=""https://momentjs.com/docs/...",0
1602196,4678336,2021-09-01,<p>i believe you can't do this using CustomAtt...,5
...,...,...,...,...
993281,1919793,2023-08-31,"<p>In addition to <a href=""https://stackoverfl...",17
1098853,20281018,2023-08-31,"<p>To print a message to the console in Java, ...",3
649030,1591921,2023-08-31,<p>A <code>worker_connect</code> event was add...,17
299925,12634230,2023-08-31,"<pre><code>=LET(a, A2:A13,\n b, B2:B13,\n ...",8


In [156]:
dates = df_merge['year_month'].unique().astype(str)

In [171]:
# LSM vr.2 (LSM using Daily Aggregation)
## - no need to consider consistent posting users

# Multiple users Setting
import numpy as np
import re
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

def preprocess_text(text):
    # Remove punctuation and convert text to lowercase
    text = re.sub(r'[^\w\s]', '', text)
    text = text.lower()
    return text

def get_function_word_categories():
    # Define function words categorized by linguistic type
    return dict(
	prepositions = 
		['about',
		'across',
		'against',
		'along ',
		'around',
		'at',
		'behind',
		'beside',
		'besides',
		'by',
		'despite',
		'down',
		'during',
		'for',
		'from',
		'in',
		'inside',
		'into',
		'near',
		'of',
		'off',
		'on',
		'onto',
		'over',
		'through',
		'to',
		'toward',
		'with',
		'within',
		'without'],
	pronouns = 
		['i',
		'y',
		'you',
		'he',
		'me',
		'him',
		'my',
		'mine',
		'her',
		'hers',
		'his',
		'myself',
		'himself',
		'herself',
		'anything',
		'everything',
		'anyone',
		'everyone',
		'ones',
		'such',
		'it',
		'we',
		'they',
		'us',
		'them',
		'our',
		'ours',
		'their',
		'theirs',
		'itself',
		'ourselves',
		'themselves',
		'something',
		'nothing',
		'someone'],
	determiners = 
		['the',
		'some',
		'this',
		'that',
		'every',
		'all',
		'both',
		'one',
		'first',
		'other',
		'next',
		'many',
		'much',
		'more',
		'most',
		'several',
		'a',
		'an',
		'any',
		'each',
		'no',
		'half',
		'twice',
		'two',
		'second',
		'another',
		'last',
		'few',
		'little',
		'less',
		'least',
		'own'],
	conjunctions = 
		['and',
		'but',
		'after',
		'when',
		'as',
		'because',
		'if',
		'what',
		'where',
		'which',
		'how',
		'than',
		'or',
		'so',
		'before',
		'since',
		'while',
		'although',
		'though',
		'who',
		'whose'],
	modal_verbs = 
		['can',
		'may',
		'will',
		'shall',
		'could',
		'might',
		'would',
		'should',
		'must',
        'ought'],
	primary_verbs = 
		['am',
		'is',
        'are',
		'was',
        'were',
		'being',
        'been'
        'be',
		'do',
		'does',
        'did',
		'have',
        'has',
		'had'],
	adverbs = 
		['here',
		'there',
		'today',
		'tomorrow',
		'now',
		'then',
		'always',
		'never',
		'sometimes',
		'usually',
		'often',
		'therefore',
		'however',
		'besides',
		'moreover',
		'though',
		'otherwise',
		'else',
		'instead',
		'anyway',
		'incidentally',
		'meanwhile',
		'hardly'],
)

def calculate_fwu(text, function_words):
    # Preprocess text and calculate total word count
    text = preprocess_text(text)
    total_words = len(text.split())
    
    # Initialize vectorizer and count function words
    vectorizer = CountVectorizer(vocabulary=function_words)
    fwu_counts = vectorizer.fit_transform([text]).toarray().flatten()
    
    # Normalize function word usage by total words
    fwu_normalized = fwu_counts / total_words
    return fwu_normalized

def calculate_community_fwu(texts):
    # Calculate the community-level FWU by averaging FWU across all users' texts
    categories = get_function_word_categories()
    community_fwu = {}
    
    for category, function_words in categories.items():
        category_fwu = np.array([calculate_fwu(text, function_words) for text in texts])
        community_fwu[category] = np.mean(category_fwu, axis=0)  # Average FWU across all texts
    
    return community_fwu

def calculate_lsm_for_all_users(texts):
    # Get categorized function words
    categories = get_function_word_categories()
    
    # Calculate community FWU
    community_fwu = calculate_community_fwu(texts)
    
    # Calculate LSM scores for each user
    lsm_scores_all_users = []
    
    for i, text in enumerate(texts):
        individual_lsm_scores = {}
        for category, function_words in categories.items():
            fwu_individual = calculate_fwu(text, function_words)
            fwu_community = community_fwu[category]
            
            # Avoid division by zero by replacing zeros with a small constant
            fwu_individual = np.where(fwu_individual == 0, 1e-10, fwu_individual)
            fwu_community = np.where(fwu_community == 0, 1e-10, fwu_community)
            
            # Calculate LSM for the current category
            lsm_scores = 1 - (np.abs(fwu_individual - fwu_community) / (fwu_individual + fwu_community))
            individual_lsm_scores[category] = np.mean(lsm_scores)  # Average LSM score for the category
        
        # Calculate the overall LSM score as the mean of all category scores
        overall_lsm_score = np.mean(list(individual_lsm_scores.values()))
        individual_lsm_scores["overall"] = overall_lsm_score
        
        lsm_scores_all_users.append({"user_id": i, **individual_lsm_scores})
    
    return pd.DataFrame(lsm_scores_all_users)

In [None]:
result = pd.DataFrame(columns=['owner_user_id', 'year_month', 'body', 'community', 'lsm_score'])
for date in dates:
    df_filtered = df_merge[df_merge['year_month']==date]
    ls_communities = df_filtered['community'].unique()
    ls_communities.sort()
    for community in ls_communities:
        df_community = df_filtered[df_filtered['community'] == community]
        lsm_scores_df = calculate_lsm_for_all_users(df_community['body'])
        df_community['lsm_score'] = list(lsm_scores_df['overall'])
        result = pd.concat([result, df_community], ignore_index=True)
        print(community)
        print('has been processed')

In [258]:
df_filtered = df_merge[df_merge['year_month']== '2021-09-01']
ls_communities = df_filtered['community'].unique()
ls_communities.sort()
df_community = df_filtered[df_filtered['community'] == 0]
lsm_scores_df = calculate_lsm_for_all_users(df_community['body'])
df_community['lsm_score'] = list(lsm_scores_df['overall'])
df_community

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_community['lsm_score'] = list(lsm_scores_df['overall'])


Unnamed: 0,owner_user_id,year_month,body,community,lsm_score
1374659,3027266,2021-09-01,"<p>Try the <a href=""https://momentjs.com/docs/...",0,0.140352
447643,13981886,2021-09-01,<p>Change to this</p>\n<pre><code>:class=&quot...,0,0.143264
2119294,892629,2021-09-01,<p>This is a rounding error with floating poin...,0,0.347610
1782958,6010118,2021-09-01,<p>I was able to fix this by setting mocha's <...,0,0.177443
1782941,6009931,2021-09-01,<p>Turns out that Burpsuite has a free version...,0,0.136490
...,...,...,...,...,...
152917,11339877,2021-09-01,"<p>For react-natives, you can find this helpfu...",0,0.133524
1460382,3657537,2021-09-01,<p>Because the path <code>url(&quot;./images/i...,0,0.192901
1663189,5122615,2021-09-01,<p><strong>The charts label receives a string ...,0,0.296339
543945,14859456,2021-09-01,<p>I faced the same Issue with NextJS and Mong...,0,0.245706


# Here I have moved to a lsm_community.py file.

In [None]:
# Load Pickle
import pickle
import pandas as pd
import numpy as np

with open(file='LSM_new_community_result.pickle', mode='rb') as f:
    result=pickle.load(f)
average_lsm = result.groupby(['year_month', 'community'])['lsm_score'].mean().reset_index()

In [259]:
# average_lsm = year_month, community, lsm_score
## add to the data below.
pd.read_csv("/data1/StackOverflow/_Robustness/Techiness/df_final_pre6.csv")

Unnamed: 0,year_month_day,T_d,P_t,month,community,techiness,entropy,count_q,count_a,ln_q,...,ln_entropy,year_month,numUser,ln_numUser,comments,CommentsPerP,AnsPerQ,ln_comments,ln_AnsPerQ,ln_CommentsPerP
0,2021-09-01,0,0,9,0,0.525040,7.298398,1213.0,1029.0,7.100852,...,1.987655,2021-09,22317,10.013104,3620,1.614630,1.164056,8.194229,0.151911,0.479106
1,2021-09-01,0,0,9,1,0.412791,7.399966,401.0,225.0,5.993961,...,2.001475,2021-09,7820,8.964440,1297,2.071885,1.032419,7.167809,0.031905,0.728459
2,2021-09-01,0,0,9,3,0.490224,7.231138,394.0,250.0,5.976351,...,1.978396,2021-09,7714,8.950792,1133,1.759317,1.010152,7.032624,0.010101,0.564926
3,2021-09-01,0,0,9,4,0.439161,7.709924,368.0,305.0,5.908083,...,2.042508,2021-09,7720,8.951570,1905,2.830609,1.141304,7.552237,0.132172,1.040492
4,2021-09-01,0,0,9,5,0.461350,7.043218,378.0,258.0,5.934894,...,1.952065,2021-09,7509,8.923858,970,1.525157,1.190476,6.877296,0.174353,0.422098
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10625,2023-08-31,1,1,8,10,0.373225,2.947703,3.0,1.0,1.098612,...,1.081026,2023-08,75,4.317488,1,0.250000,0.333333,0.000000,-1.098612,-1.386294
10626,2023-08-31,1,1,8,11,0.476285,4.175869,18.0,12.0,2.890372,...,1.429322,2023-08,343,5.837730,30,1.000000,0.666667,3.401197,-0.405465,0.000000
10627,2023-08-31,1,1,8,13,0.421145,5.528088,29.0,18.0,3.367296,...,1.709842,2023-08,630,6.445720,49,1.042553,0.413793,3.891820,-0.882389,0.041673
10628,2023-08-31,1,1,8,15,0.307775,2.947703,5.0,4.0,1.609438,...,1.081026,2023-08,122,4.804021,7,0.777778,0.600000,1.945910,-0.510826,-0.251314
