# User-Level Clustering by Majority Rule

### 1) Load Data

In [15]:
# Import Modules
import pandas as pd
import numpy as np
import sqlite3
import pickle
import os
import matplotlib.pyplot as plt
import statsmodels.formula.api as sm
# Load every answer created inside the study window from the local
# Stack Overflow SQLite dump.
conn = sqlite3.connect('/data1/StackOverflow/stackexchange-to-sqlite/stack.db')
query = '''
SELECT id, creation_date, owner_user_id, body, parent_id
FROM answers
WHERE creation_date > '2021-09-01'
AND creation_date < '2023-09-01';
'''
try:
    df = pd.read_sql_query(query, conn)
finally:
    # Release the database handle even when the query raises.
    conn.close()

In [16]:
# Answers without an author cannot be attributed to a user, so drop them.
df = df.dropna(subset=['owner_user_id'])

# Normalise dtypes: ids become strings (via int, which removes any float
# formatting) and creation_date becomes a real timestamp.
df = df.assign(
    owner_user_id=df['owner_user_id'].astype(int).astype(str),
    parent_id=df['parent_id'].astype(int).astype(str),
    creation_date=pd.to_datetime(df['creation_date']),
)

# Monthly period of each answer (e.g. 2021-09).
df['year_month'] = df['creation_date'].dt.to_period('M')

# Number of distinct months present in the data ...
total_months = df['year_month'].nunique()
# ... and number of distinct months in which each user posted.
user_month_counts = df.groupby('owner_user_id')['year_month'].nunique()
# Users who posted at least one answer in every single month.
users_in_every_month = user_month_counts.loc[user_month_counts.eq(total_months)].index.tolist()

# Keep only the answers written by those users.
df_filtered = df.loc[df['owner_user_id'].isin(users_in_every_month)]
df_filtered

Unnamed: 0,id,creation_date,owner_user_id,body,parent_id,year_month
6,69006435,2021-09-01 00:02:37.350,869736,<p>The error is correct. <code>getInputStream...,69006289,2021-09
8,69006438,2021-09-01 00:03:36.527,8690857,<p>You should not use <code>.map</code> for si...,69005880,2021-09
12,69006446,2021-09-01 00:04:33.053,16406,<p>The reason you need line B is called <a hre...,69005820,2021-09
16,69006451,2021-09-01 00:05:09.193,478884,<p>For example you could do it this way:</p>\n...,69006345,2021-09
29,69006475,2021-09-01 00:09:56.233,12957340,<p>Here is another potential solution using ti...,69005901,2021-09
...,...,...,...,...,...,...
3124903,77019800,2023-08-31 23:36:49.467,2156621,<p><code>StreamZip</code> + <code>expand</code...,77005145,2023-08
3124915,77019822,2023-08-31 23:44:48.760,209103,"<p>The <a href=""https://firebase.google.com/do...",77019651,2023-08
3124917,77019826,2023-08-31 23:46:31.700,939860,<p>You don't need a &quot;common key&quot; for...,77019724,2023-08
3124920,77019833,2023-08-31 23:49:08.340,589924,<p>The <code>sprintf</code> operator behaves l...,33897817,2023-08


### 2) Get Questions' tag data

In [17]:
# Load the id and tags of every question created between 2020-09-01 and
# 2023-09-01; questions outside this window are not loaded.
conn = sqlite3.connect('/data1/StackOverflow/stackexchange-to-sqlite/stack.db')
query = '''
SELECT id, tags
FROM questions
WHERE creation_date > '2020-09-01'
AND creation_date < '2023-09-01';
'''
try:
    df_tags = pd.read_sql_query(query, conn)
finally:
    # Release the database handle even when the query raises.
    conn.close()

In [18]:
# Spot check: look up the tags of a single known question id.
df_tags.loc[df_tags['id'].eq(69006289)]

Unnamed: 0,id,tags
1629997,69006289,"[""java"", ""eclipse"", ""memory-leaks"", ""io""]"


In [19]:
# parent_id is a string in df_filtered, so the join key must be a string too.
df_tags['id'] = df_tags['id'].astype(str)

# Attach the parent question's tags to each answer. The left join keeps
# answers whose question is missing from df_tags (their tags become NaN).
df_merge = (
    df_filtered
    .merge(df_tags, how='left', left_on='parent_id', right_on='id')
    .drop(columns='id_y')             # question id duplicates parent_id
    .rename(columns={'id_x': 'id'})   # restore the answer id column name
)
df_merge

Unnamed: 0,id,creation_date,owner_user_id,body,parent_id,year_month,tags
0,69006435,2021-09-01 00:02:37.350,869736,<p>The error is correct. <code>getInputStream...,69006289,2021-09,"[""java"", ""eclipse"", ""memory-leaks"", ""io""]"
1,69006438,2021-09-01 00:03:36.527,8690857,<p>You should not use <code>.map</code> for si...,69005880,2021-09,"[""javascript"", ""arrays"", ""reactjs"", ""map-funct..."
2,69006446,2021-09-01 00:04:33.053,16406,<p>The reason you need line B is called <a hre...,69005820,2021-09,"[""c++"", ""class"", ""pointers"", ""inheritance"", ""r..."
3,69006451,2021-09-01 00:05:09.193,478884,<p>For example you could do it this way:</p>\n...,69006345,2021-09,"[""excel"", ""vba"", ""parsing"", ""offset""]"
4,69006475,2021-09-01 00:09:56.233,12957340,<p>Here is another potential solution using ti...,69005901,2021-09,"[""r""]"
...,...,...,...,...,...,...,...
437024,77019800,2023-08-31 23:36:49.467,2156621,<p><code>StreamZip</code> + <code>expand</code...,77005145,2023-08,"[""dart""]"
437025,77019822,2023-08-31 23:44:48.760,209103,"<p>The <a href=""https://firebase.google.com/do...",77019651,2023-08,"[""javascript"", ""firebase"", ""next.js"", ""server-..."
437026,77019826,2023-08-31 23:46:31.700,939860,<p>You don't need a &quot;common key&quot; for...,77019724,2023-08,"[""sql"", ""postgresql"", ""join""]"
437027,77019833,2023-08-31 23:49:08.340,589924,<p>The <code>sprintf</code> operator behaves l...,33897817,2023-08,


### 3) Aggregate tags of each user.

In [20]:
# Normalise the raw tag strings.
# Step 1 strips the list punctuation ([, ], ") with a regex character class.
# The remaining steps are literal substitutions, so regex=False is passed
# explicitly: patterns such as 'c++' and '.' are regex metacharacters and must
# not depend on the pandas-version-specific default of `regex`.
df_merge['tags'] = (
    df_merge['tags']
    .str.replace(r'[\["\]]', '', regex=True)
    .str.replace('c#', 'Csharp', regex=False)
    .str.replace('c++', 'Cpp', regex=False)
    .str.replace('.', 'DOT', regex=False)
    .str.replace('><', ' ', regex=False)
    .str.replace('>', '', regex=False)
    .str.replace('-', '', regex=False)
    .str.replace('"', '', regex=False)
)

In [21]:
# Collect every tag a user answered under into one ', '-separated string.
# Missing tags (answers whose question was not matched in the merge) are
# skipped; otherwise str(NaN) would inject the literal token 'nan' into the
# list and it would be counted as if it were a real tag.
new_merge = (
    df_merge
    .groupby('owner_user_id')[['tags']]
    .agg(lambda x: ', '.join(x.dropna().astype(str)))
    .reset_index()
)
new_merge

Unnamed: 0,owner_user_id,tags
0,10008173,"docker, filesystems, docker, express, dockerfi..."
1,10112124,"angular, authentication, rxjs, ngrx, ngrxentit..."
2,10138734,"mysql, rubyonrails, ruby, railsmigrations, mys..."
3,10140124,"elasticsearch, javascript, ifstatement, boolea..."
4,10157127,"flutter, flutterlayout, flutterdependencies, f..."
...,...,...
790,9952196,"sqlite, linux, bash, shell, sed, linux, bash, ..."
791,9957710,"arrays, struct, julia, arrays, julia, multithr..."
792,99692,"rust, rust, enums, io, rust, rust, lifetime, o..."
793,9971759,"python, ortools, python, constraints, nonlinea..."


### 4) Find each user's majority tag

In [35]:
from collections import Counter


def _majority_tag(tags_csv):
    """Return the most frequent tag in a ', '-separated tag string.

    Ties are broken in favour of the tag that appears first in the string
    (Counter.most_common keeps first-encountered order for equal counts), so
    the result is reproducible across kernel restarts. Picking the max over a
    set would instead depend on string hash randomisation.
    """
    return Counter(tags_csv.split(', ')).most_common(1)[0][0]


new_merge['majority_tag'] = new_merge['tags'].apply(_majority_tag)
new_merge

Unnamed: 0,owner_user_id,tags,majority_tag
0,10008173,"docker, filesystems, docker, express, dockerfi...",docker
1,10112124,"angular, authentication, rxjs, ngrx, ngrxentit...",ngrx
2,10138734,"mysql, rubyonrails, ruby, railsmigrations, mys...",mysql
3,10140124,"elasticsearch, javascript, ifstatement, boolea...",javascript
4,10157127,"flutter, flutterlayout, flutterdependencies, f...",flutter
...,...,...,...
790,9952196,"sqlite, linux, bash, shell, sed, linux, bash, ...",bash
791,9957710,"arrays, struct, julia, arrays, julia, multithr...",julia
792,99692,"rust, rust, enums, io, rust, rust, lifetime, o...",rust
793,9971759,"python, ortools, python, constraints, nonlinea...",ortools


### 5) Get pre-computed cluster info and attach.

In [38]:
import pickle
# Read the pre-computed tag -> community table from its pickle.
# NOTE(review): pickle.load executes arbitrary code; only load trusted files.
with open('/data1/StackOverflow/_Robustness/TagCluster/louvain_community_pre.pickle', 'rb') as cluster_fh:
    df_clusters = pickle.load(cluster_fh)

In [39]:
# Display the table loaded from the cluster pickle.
df_clusters

Unnamed: 0,tag,community,tag_count
0,nodeDOTjs,0,82196
1,reactjs,0,155462
2,express,0,18287
3,axios,0,8260
4,refreshtoken,1,369
...,...,...,...
42921,mavenindexer,3,1
42922,irvine16,4,1
42923,aif,1,2
42924,securitystamp,1,2


In [41]:
# Look up the community of each user's majority tag. The left join keeps
# users whose majority tag has no row in df_clusters.
df_post_cluster = new_merge.merge(
    df_clusters,
    how='left',
    left_on='majority_tag',
    right_on='tag',
)
df_post_cluster

Unnamed: 0,owner_user_id,tags,majority_tag,tag,community,tag_count
0,10008173,"docker, filesystems, docker, express, dockerfi...",docker,docker,7,35204
1,10112124,"angular, authentication, rxjs, ngrx, ngrxentit...",ngrx,ngrx,0,818
2,10138734,"mysql, rubyonrails, ruby, railsmigrations, mys...",mysql,mysql,6,39190
3,10140124,"elasticsearch, javascript, ifstatement, boolea...",javascript,javascript,0,287722
4,10157127,"flutter, flutterlayout, flutterdependencies, f...",flutter,flutter,5,76904
...,...,...,...,...,...,...
790,9952196,"sqlite, linux, bash, shell, sed, linux, bash, ...",bash,bash,4,17308
791,9957710,"arrays, struct, julia, arrays, julia, multithr...",julia,julia,17,3045
792,99692,"rust, rust, enums, io, rust, rust, lifetime, o...",rust,rust,4,15083
793,9971759,"python, ortools, python, constraints, nonlinea...",ortools,ortools,17,386


In [42]:
# Pre-process: 'tag' duplicates 'majority_tag' after the merge and 'tag_count'
# is not needed, so only 'community' is kept from the cluster table.
df_post_cluster = df_post_cluster.drop(columns=['tag', 'tag_count'])
df_post_cluster

Unnamed: 0,owner_user_id,tags,majority_tag,community
0,10008173,"docker, filesystems, docker, express, dockerfi...",docker,7
1,10112124,"angular, authentication, rxjs, ngrx, ngrxentit...",ngrx,0
2,10138734,"mysql, rubyonrails, ruby, railsmigrations, mys...",mysql,6
3,10140124,"elasticsearch, javascript, ifstatement, boolea...",javascript,0
4,10157127,"flutter, flutterlayout, flutterdependencies, f...",flutter,5
...,...,...,...,...
790,9952196,"sqlite, linux, bash, shell, sed, linux, bash, ...",bash,4
791,9957710,"arrays, struct, julia, arrays, julia, multithr...",julia,17
792,99692,"rust, rust, enums, io, rust, rust, lifetime, o...",rust,4
793,9971759,"python, ortools, python, constraints, nonlinea...",ortools,17


In [43]:
# Number of users assigned to each community.
# Original note: we are not using cluster 2, 12, 16.
df_post_cluster.groupby('community').size()

community
0     105
1      80
3      66
4     131
5      50
6      88
7      42
8      27
9       2
10      3
11      5
13     15
14      2
15      3
16      1
17    175
dtype: int64

### 6) LSM

In [59]:
# One row per (user, month): all answer bodies of that user in that month,
# joined into a single ', '-separated string.
monthly_bodies = (
    df_merge
    .groupby(['owner_user_id', 'year_month'])['body']
    .agg(lambda x: ', '.join(map(str, x)))
    .reset_index()
)

# Attach each user's community and drop the 16th community.
df_analysis = (
    monthly_bodies
    .merge(df_post_cluster.drop(columns=['tags', 'majority_tag']), how='left', on='owner_user_id')
    .loc[lambda frame: frame['community'] != 16]
)
df_analysis

Unnamed: 0,owner_user_id,year_month,body,community
0,10008173,2021-09,<p>Since you already know the location on the ...,7
1,10008173,2021-10,<p>That <code>ENTRYPOINT</code> line doesn't f...,7
2,10008173,2021-11,<p>A Docker container only runs one process. ...,7
3,10008173,2021-12,<p>Connections between containers (over the Do...,7
4,10008173,2022-01,<p>You can use the Docker image name (<code>do...,7
...,...,...,...,...
19075,997358,2023-04,<p>To avoid various inefficiencies in your pro...,4
19076,997358,2023-05,<p>If you want an expression which is guarante...,4
19077,997358,2023-06,<p>How to optimize? Since you don't seem to h...,4
19078,997358,2023-07,"<p>Since the <a href=""/questions/tagged/jq"" cl...",4


In [62]:
# Rows per community = (number of users in the cluster) x 24 months,
# because every retained user has one row per year_month.
df_analysis.groupby('community').size()

community
0     2520
1     1920
3     1584
4     3144
5     1200
6     2112
7     1008
8      648
9       48
10      72
11     120
13     360
14      48
15      72
17    4200
dtype: int64

In [67]:
# save df_analysis pickle
#with open(f'df_analysis_final.pickle', 'wb') as fw:
#    pickle.dump(df_analysis, fw)

In [68]:
# Write each user's monthly answer text to its own markdown file:
#   <data_final>/<year_month>/user_<owner_user_id>.md
year_month = list(df_analysis['year_month'].unique().astype('str')) # get unique year_month
for month_label in year_month:
    output_directory = f'/data1/StackOverflow/_Robustness/UserCluster/data_final/{month_label}'
    os.makedirs(output_directory, exist_ok=True)
    month_rows = df_analysis[df_analysis['year_month'] == month_label]
    for user_id, body in zip(month_rows['owner_user_id'], month_rows['body']):
        user_filename = f"{output_directory}/user_{user_id}.md"
        # Answer bodies are arbitrary HTML and may contain non-ASCII text, so
        # the encoding is fixed instead of relying on the platform default.
        with open(user_filename, 'w', encoding='utf-8') as md_file:
            md_file.write(f"## User {user_id}\n")
            md_file.write(body + '\n\n')
    print(f"Data has been saved to individual md files in the '{output_directory}' directory.")

Data has been saved to individual md files in the '/data1/StackOverflow/_Robustness/UserCluster/data_final/2021-09' directory.
Data has been saved to individual md files in the '/data1/StackOverflow/_Robustness/UserCluster/data_final/2021-10' directory.
Data has been saved to individual md files in the '/data1/StackOverflow/_Robustness/UserCluster/data_final/2021-11' directory.
Data has been saved to individual md files in the '/data1/StackOverflow/_Robustness/UserCluster/data_final/2021-12' directory.
Data has been saved to individual md files in the '/data1/StackOverflow/_Robustness/UserCluster/data_final/2022-01' directory.
Data has been saved to individual md files in the '/data1/StackOverflow/_Robustness/UserCluster/data_final/2022-02' directory.
Data has been saved to individual md files in the '/data1/StackOverflow/_Robustness/UserCluster/data_final/2022-03' directory.
Data has been saved to individual md files in the '/data1/StackOverflow/_Robustness/UserCluster/data_final/2022

### HERE, use separate py file for lsm Computation
- compute.py
- LSM Calculation for each user
- Aggregate the average LSM score for each cluster, by month

In [5]:
# LSM Score Aggregation
import os
import pandas as pd
import pickle
folder_path = '/data1/StackOverflow/_Robustness/UserCluster/result_pickle_final'
# All entries of the result folder, in sorted order
file_list = sorted(os.listdir(folder_path))
# Keep only the pickle files
pickle_files = [name for name in file_list if name.endswith('.pickle')]


In [6]:
# Reload the saved analysis frame and check that it is a complete
# user x month panel (both printed numbers should match).
with open('df_analysis_final.pickle', 'rb') as pickle_in:
    df_analysis = pickle.load(pickle_in)
print("Number of observation: ", len(df_analysis))
print(
    "NumUser x Num year_months: ",
    df_analysis['owner_user_id'].nunique() * df_analysis['year_month'].nunique(),
)

Number of observation:  19056
NumUser x Num year_months:  19056


In [7]:
# LSM Score Aggregation
import os
import pandas as pd
folder_path = '/data1/StackOverflow/_Robustness/UserCluster/result_pickle_final'
# Get a sorted list of all files in the folder
file_list = sorted(os.listdir(folder_path))
# Filter only pickle files
pickle_files = [file for file in file_list if file.endswith('.pickle')]

# Load every monthly result pickle and reduce it to one mean similarity per
# user. The per-month frames are collected in a list and concatenated once:
# growing a DataFrame with pd.concat inside the loop is quadratic, and
# starting from an empty object-dtype frame triggers a pandas FutureWarning
# about result dtypes.
result_columns = ['User1', 'year_month', 'Similarity_toAvg']
monthly_frames = []
for pickle_file in pickle_files:
    file_path = os.path.join(folder_path, pickle_file)
    file_name = os.path.splitext(pickle_file)[0]  # file stem is used as year_month
    # AGGREGATION
    with open(file_path, 'rb') as f:
        loaded_object = pickle.load(f)
    loaded_object['Similarity_toAvg'] = loaded_object['Similarity_toAvg'].astype(float)
    loaded_object = loaded_object.groupby('User1')['Similarity_toAvg'].mean().reset_index()
    loaded_object['year_month'] = file_name # Add year_month
    monthly_frames.append(loaded_object[result_columns])

# Each month keeps its own 0..n-1 index, exactly as the incremental concat did.
if monthly_frames:
    loaded_objects = pd.concat(monthly_frames)
else:
    # No pickle files found: fall back to an empty frame with the same columns.
    loaded_objects = pd.DataFrame(columns=result_columns)

  loaded_objects = pd.concat([loaded_objects, loaded_object])


In [8]:
# Display the aggregated per-user, per-month similarity scores.
loaded_objects

Unnamed: 0,User1,year_month,Similarity_toAvg
0,user_10008173.md,2021-09,0.180213
1,user_10112124.md,2021-09,0.189607
2,user_10138734.md,2021-09,0.175917
3,user_10140124.md,2021-09,0.169897
4,user_10157127.md,2021-09,0.180124
...,...,...,...
788,user_9952196.md,2023-08,0.170478
789,user_9957710.md,2023-08,0.167816
790,user_99692.md,2023-08,0.172995
791,user_9971759.md,2023-08,0.170216


In [57]:
# Sanity check: every user should have 24 observations, one per year_month
# (793 users x 24 year_months = 19032 rows).
loaded_objects.groupby('User1').count().reset_index()

Unnamed: 0,User1,year_month,Similarity_toAvg
0,user_10008173.md,24,24
1,user_10112124.md,24,24
2,user_10138734.md,24,24
3,user_10140124.md,24,24
4,user_10157127.md,24,24
...,...,...,...
788,user_9952196.md,24,24
789,user_9957710.md,24,24
790,user_99692.md,24,24
791,user_9971759.md,24,24


### For LSM OG Version

In [61]:
# Turn the per-user file names (e.g. "user_10008173.md") back into bare ids.
loaded_objects = loaded_objects.rename(columns={"User1": "owner_user_id"})
loaded_objects['owner_user_id'] = loaded_objects['owner_user_id'].map(
    lambda md_name: md_name.replace('.md', '').replace('user_', '')
)

In [64]:
# Display the frame now keyed by owner_user_id.
loaded_objects

Unnamed: 0,owner_user_id,year_month,Similarity_toAvg
0,10008173,2021-09,0.180213
1,10112124,2021-09,0.189607
2,10138734,2021-09,0.175917
3,10140124,2021-09,0.169897
4,10157127,2021-09,0.180124
...,...,...,...
788,9952196,2023-08,0.170478
789,9957710,2023-08,0.167816
790,99692,2023-08,0.172995
791,9971759,2023-08,0.170216


In [None]:
# Add T_d (Treated: 2022-09 ~ 2023-08 (12 months); Control: 2021-09 ~ 2022-08 (12 months))
import numpy as np
# Derive the flag from year_month itself rather than a positional row offset,
# so it stays correct if the number of users or the row order changes.
# 'YYYY-MM' strings compare chronologically.
T_d = (loaded_objects['year_month'].astype(str) >= '2022-09').to_numpy().astype(int)
# Assigned as a plain array (positional), because the frame's index repeats
# for every month.
loaded_objects['T_d'] = T_d

In [None]:
# Calendar month as a string without zero padding ('09' -> '9').
loaded_objects['month'] = loaded_objects['year_month'].str.slice(5).astype(int).astype(str)
# P_t is 0 for months 9, 10 and 11, and 1 for every other month.
loaded_objects['P_t'] = (~loaded_objects['month'].isin(['9', '10', '11'])).astype('int64')

In [110]:
# Display the frame with the T_d, month and P_t columns.
# TODO (original note): change colname (owner_user_id -> user)
loaded_objects

Unnamed: 0,owner_user_id,year_month,Similarity_toAvg,T_d,month,P_t
0,10008173,2021-09,0.180213,0,9,0
1,10112124,2021-09,0.189607,0,9,0
2,10138734,2021-09,0.175917,0,9,0
3,10140124,2021-09,0.169897,0,9,0
4,10157127,2021-09,0.180124,0,9,0
...,...,...,...,...,...,...
788,9952196,2023-08,0.170478,1,8,1
789,9957710,2023-08,0.167816,1,8,1
790,99692,2023-08,0.172995,1,8,1
791,9971759,2023-08,0.170216,1,8,1


In [116]:
# Rename the similarity score to LSM and add its natural logarithm.
loaded_objects = (
    loaded_objects
    .rename(columns={'Similarity_toAvg': 'LSM'})
    .assign(ln_LSM=lambda frame: np.log(frame['LSM']))
)
loaded_objects

Unnamed: 0,owner_user_id,year_month,LSM,T_d,month,P_t,ln_LSM
0,10008173,2021-09,0.180213,0,9,0,-1.713614
1,10112124,2021-09,0.189607,0,9,0,-1.662801
2,10138734,2021-09,0.175917,0,9,0,-1.737744
3,10140124,2021-09,0.169897,0,9,0,-1.772566
4,10157127,2021-09,0.180124,0,9,0,-1.714112
...,...,...,...,...,...,...,...
788,9952196,2023-08,0.170478,1,8,1,-1.769151
789,9957710,2023-08,0.167816,1,8,1,-1.784889
790,99692,2023-08,0.172995,1,8,1,-1.754492
791,9971759,2023-08,0.170216,1,8,1,-1.770688


### For LSM Cluster Version
- add Clusters
- add Techiness

In [134]:
# year_month must be a string on both sides for the join keys to match.
df_analysis['year_month'] = df_analysis['year_month'].astype(str)

# Attach each (user, month) row's community to its LSM score.
df_final = loaded_objects.merge(
    df_analysis[['owner_user_id', 'year_month', 'community']],
    how='left',
    on=['owner_user_id', 'year_month'],
)
df_final

Unnamed: 0,owner_user_id,year_month,LSM,T_d,month,P_t,ln_LSM,community
0,10008173,2021-09,0.180213,0,9,0,-1.713614,7
1,10112124,2021-09,0.189607,0,9,0,-1.662801,0
2,10138734,2021-09,0.175917,0,9,0,-1.737744,6
3,10140124,2021-09,0.169897,0,9,0,-1.772566,0
4,10157127,2021-09,0.180124,0,9,0,-1.714112,5
...,...,...,...,...,...,...,...,...
19027,9952196,2023-08,0.170478,1,8,1,-1.769151,4
19028,9957710,2023-08,0.167816,1,8,1,-1.784889,17
19029,99692,2023-08,0.172995,1,8,1,-1.754492,4
19030,9971759,2023-08,0.170216,1,8,1,-1.770688,17


In [148]:
# Mean LSM per (month, community): 24 months x 15 communities = 360 rows.
df_cluster = df_final.groupby(['year_month', 'community'], as_index=False)['LSM'].mean()
df_cluster

Unnamed: 0,year_month,community,LSM
0,2021-09,0,0.187376
1,2021-09,1,0.185023
2,2021-09,3,0.185464
3,2021-09,4,0.182246
4,2021-09,5,0.187612
...,...,...,...
355,2023-08,11,0.178366
356,2023-08,13,0.195913
357,2023-08,14,0.182867
358,2023-08,15,0.186984


In [159]:
# Add T_d: 1 for the treated year (2022-09 onwards), 0 for the year before.
# Derived from year_month instead of a hard-coded row offset, so it stays
# correct if the number of communities or the row order changes.
# 'YYYY-MM' strings compare chronologically.
T_d = (df_cluster['year_month'].astype(str) >= '2022-09').to_numpy().astype(int)
df_cluster['T_d'] = T_d

In [162]:
# Add P_t
# Calendar month as a string without zero padding ('09' -> '9').
df_cluster['month'] = df_cluster['year_month'].str.slice(5).astype(int).astype(str)
# P_t is 0 for months 9, 10 and 11, and 1 for every other month.
df_cluster['P_t'] = (~df_cluster['month'].isin(['9', '10', '11'])).astype('int64')

In [179]:
# Display the cluster-level frame with the T_d, month and P_t columns.
df_cluster

Unnamed: 0,year_month,community,LSM,T_d,month,P_t
0,2021-09,0,0.187376,0,9,0
1,2021-09,1,0.185023,0,9,0
2,2021-09,3,0.185464,0,9,0
3,2021-09,4,0.182246,0,9,0
4,2021-09,5,0.187612,0,9,0
...,...,...,...,...,...,...
355,2023-08,11,0.178366,1,8,1
356,2023-08,13,0.195913,1,8,1
357,2023-08,14,0.182867,1,8,1
358,2023-08,15,0.186984,1,8,1


In [183]:
# Get techiness: average the 'techiness' column of df_final_pre4.csv per
# community and attach it to every cluster-month row.
df_final_pre4 = pd.read_csv("/data1/StackOverflow/_Robustness/Techiness/df_final_pre4.csv")
techiness = df_final_pre4.groupby('community', as_index=False)['techiness'].mean()
df_cluster = df_cluster.merge(techiness, how='left', on='community')
df_cluster

Unnamed: 0,year_month,community,LSM,T_d,month,P_t,techiness
0,2021-09,0,0.187376,0,9,0,0.525040
1,2021-09,1,0.185023,0,9,0,0.412791
2,2021-09,3,0.185464,0,9,0,0.490224
3,2021-09,4,0.182246,0,9,0,0.439161
4,2021-09,5,0.187612,0,9,0,0.461350
...,...,...,...,...,...,...,...
355,2023-08,11,0.178366,1,8,1,0.476285
356,2023-08,13,0.195913,1,8,1,0.421145
357,2023-08,14,0.182867,1,8,1,0.403302
358,2023-08,15,0.186984,1,8,1,0.307775


In [184]:
# Add ln(LSM) and put the columns into their final order.
df_cluster = (
    df_cluster
    .assign(ln_LSM=lambda frame: np.log(frame['LSM']))
    .loc[:, ['year_month', 'month', 'community', 'techiness', 'T_d', 'P_t', 'LSM', 'ln_LSM']]
)

In [3]:
# Display the final cluster-level frame.
df_cluster

Unnamed: 0,year_month,month,community,techiness,T_d,P_t,LSM,ln_LSM
0,2021-09,9,0,0.525040,0,0,0.187376,-1.674640
1,2021-09,9,1,0.412791,0,0,0.185023,-1.687277
2,2021-09,9,3,0.490224,0,0,0.185464,-1.684893
3,2021-09,9,4,0.439161,0,0,0.182246,-1.702397
4,2021-09,9,5,0.461350,0,0,0.187612,-1.673382
...,...,...,...,...,...,...,...,...
355,2023-08,8,11,0.476285,1,1,0.178366,-1.723919
356,2023-08,8,13,0.421145,1,1,0.195913,-1.630082
357,2023-08,8,14,0.403302,1,1,0.182867,-1.698997
358,2023-08,8,15,0.307775,1,1,0.186984,-1.676732


In [189]:
# Save
    # 1) Original
#loaded_objects.to_csv('lsm_og.csv', index=False)
    # 2) Triple Difference Version
#df_cluster.to_csv('lsm_cluster.csv', index = False)

### Add Number of Unique Users in each month by each cluster

In [10]:
# df_final_pre4.csv carries a 'numUser' column; per the original note it is
# the monthly number of users calculated for each cluster.
import pandas as pd
data = pd.read_csv('/data1/StackOverflow/_Robustness/Techiness/df_final_pre4.csv')
# Display the loaded table.
data

Unnamed: 0,year_month_day,T_d,P_t,month,community,techiness,entropy,count_q,count_a,ln_q,ln_a,ln_entropy,year_month,numUser,ln_numUser
0,2021-09-01,0,0,9,0,0.525040,7.298398,1213.0,1029.0,7.100852,6.936343,1.987655,2021-09,22317,10.013104
1,2021-09-01,0,0,9,1,0.412791,7.399966,401.0,225.0,5.993961,5.416100,2.001475,2021-09,7820,8.964440
2,2021-09-01,0,0,9,3,0.490224,7.231138,394.0,250.0,5.976351,5.521461,1.978396,2021-09,7714,8.950792
3,2021-09-01,0,0,9,4,0.439161,7.709924,368.0,305.0,5.908083,5.720312,2.042508,2021-09,7720,8.951570
4,2021-09-01,0,0,9,5,0.461350,7.043218,378.0,258.0,5.934894,5.552960,1.952065,2021-09,7509,8.923858
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10707,2023-08-31,1,1,8,11,0.476285,4.175869,18.0,12.0,2.890372,2.484907,1.429322,2023-08,343,5.837730
10708,2023-08-31,1,1,8,13,0.421145,5.528088,29.0,18.0,3.367296,2.890372,1.709842,2023-08,630,6.445720
10709,2023-08-31,1,1,8,14,0.403302,3.344698,8.0,1.0,2.079442,0.000000,1.207377,2023-08,278,5.627621
10710,2023-08-31,1,1,8,15,0.307775,2.947703,5.0,4.0,1.609438,1.386294,1.081026,2023-08,122,4.804021


In [24]:
# Collapse the table to one numUser value per (month, community) by taking
# the mean, then cast it to an integer.
numUser = (
    data
    .groupby(['year_month', 'community'], as_index=False)['numUser']
    .mean()
    .astype({'numUser': int})
)
numUser

Unnamed: 0,year_month,community,numUser
0,2021-09,0,22317
1,2021-09,1,7820
2,2021-09,3,7714
3,2021-09,4,7720
4,2021-09,5,7509
...,...,...,...
355,2023-08,11,343
356,2023-08,13,630
357,2023-08,14,278
358,2023-08,15,122


In [31]:
import numpy as np
# Reload the cluster-level LSM table, attach the monthly user count of each
# community, and add its natural logarithm.
lsm_cluster_numUser = (
    pd.read_csv("lsm_cluster.csv")
    .merge(numUser, how='left', on=['year_month', 'community'])
    .assign(ln_numUser=lambda frame: np.log(frame['numUser']))
)
lsm_cluster_numUser

Unnamed: 0,year_month,month,community,techiness,T_d,P_t,LSM,ln_LSM,numUser,ln_numUser
0,2021-09,9,0,0.525040,0,0,0.187376,-1.674640,22317,10.013104
1,2021-09,9,1,0.412791,0,0,0.185023,-1.687277,7820,8.964440
2,2021-09,9,3,0.490224,0,0,0.185464,-1.684893,7714,8.950792
3,2021-09,9,4,0.439161,0,0,0.182246,-1.702397,7720,8.951570
4,2021-09,9,5,0.461350,0,0,0.187612,-1.673382,7509,8.923858
...,...,...,...,...,...,...,...,...,...,...
355,2023-08,8,11,0.476285,1,1,0.178366,-1.723919,343,5.837730
356,2023-08,8,13,0.421145,1,1,0.195913,-1.630082,630,6.445720
357,2023-08,8,14,0.403302,1,1,0.182867,-1.698997,278,5.627621
358,2023-08,8,15,0.307775,1,1,0.186984,-1.676732,122,4.804021


In [33]:
# Save the cluster-level table (with numUser and ln_numUser) to CSV in the
# current working directory, without the index column.
lsm_cluster_numUser.to_csv('lsm_cluster_numUser.csv', index=False)