### Similarity Measure using LSM (English)

In [77]:
# Import Modules
import pandas as pd
import numpy as np
import sqlite3
import pickle
import os
import matplotlib.pyplot as plt
import statsmodels.formula.api as sm
# Import Dataset
conn = sqlite3.connect('/data1/StackOverflow/_Robustness/English/stack.db')
query = '''
SELECT creation_date, owner_user_id, body
FROM answers
WHERE creation_date >= '2021-09-01'
AND creation_date < '2023-10-01';
'''
df_tags = pd.read_sql_query(query, conn)
conn.close()

In [78]:
# erase NAs
df_tags = df_tags.dropna(subset=['owner_user_id'])
# Convert types
df_tags['owner_user_id'] = df_tags['owner_user_id'].astype(int).astype(str)
df_tags['creation_date'] = pd.to_datetime(df_tags['creation_date'])
# add year_month variable
df_tags['year_month'] = df_tags['creation_date'].dt.to_period('M')

# Get the total number of unique months
total_months = df_tags['year_month'].nunique()
# Count the number of unique months for each user
user_month_counts = df_tags.groupby('owner_user_id')['year_month'].nunique()
# Filter users who have written posts in every single month
users_in_every_month = user_month_counts[user_month_counts == total_months].index.tolist()

# Filtered data
df_filtered = df_tags[df_tags['owner_user_id'].isin(users_in_every_month)]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_tags['owner_user_id'] = df_tags['owner_user_id'].astype(int).astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_tags['creation_date'] = pd.to_datetime(df_tags['creation_date'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_tags['year_month'] = df_tags['creation_date'].dt.to_period

In [79]:
df_filtered

Unnamed: 0,creation_date,owner_user_id,body,year_month
4,2021-09-01 08:29:21.340,159691,<p>The origin is still unclear as suggested b...,2021-09
11,2021-09-01 13:50:26.877,90320,<p>Your second sentence is grammatically corre...,2021-09
13,2021-09-01 15:33:17.463,18696,<p><em>Dear</em> here is related to <em>dearth...,2021-09
14,2021-09-01 16:07:02.423,21655,<p>There's a 'backwards particularising' here....,2021-09
20,2021-09-01 20:47:35.080,36232,<p>I consulted a number of print dictionaries ...,2021-09
...,...,...,...,...
16937,2023-09-29 07:43:27.777,371337,<p>This is best described an example of a comp...,2023-09
16957,2023-09-30 06:58:20.417,141248,<blockquote>\n<p><em>The probability</em> [<em...,2023-09
16959,2023-09-30 08:29:18.907,36232,<p><em>Merriam-Webster's Eleventh Collegiate D...,2023-09
16961,2023-09-30 10:54:47.490,90320,<p>There isn't really a <em>verb</em> for doin...,2023-09


In [107]:
len(df_filtered.owner_user_id.unique())

14

In [81]:
# Monthly aggregate for each user
df_grouped = df_filtered.groupby(['owner_user_id','year_month'])['body'].agg(lambda x: '\n'.join(x)).reset_index()
year_month = list(df_grouped['year_month'].unique().astype('str')) # get unique year_month
# Save individual body data into separate md file.
for i in range(len(year_month)):
    output_directory = f'/data1/StackOverflow/_Robustness/English/lsm_md/{year_month[i]}'
    os.makedirs(output_directory, exist_ok=True)
    data = df_grouped[df_grouped['year_month'] == year_month[i]]
    for index, row in data.iterrows():
        user_filename = f"{output_directory}/user_{row['owner_user_id']}.md"
        with open(user_filename, 'w') as md_file:
            md_file.write(f"## User {row['owner_user_id']}\n")
            md_file.write(row['body'] + '\n\n')
    print(f"Data has been saved to individual md files in the '{output_directory}' directory.")

Data has been saved to individual md files in the '/data1/StackOverflow/_Robustness/English/lsm_md/2021-09' directory.
Data has been saved to individual md files in the '/data1/StackOverflow/_Robustness/English/lsm_md/2021-10' directory.
Data has been saved to individual md files in the '/data1/StackOverflow/_Robustness/English/lsm_md/2021-11' directory.
Data has been saved to individual md files in the '/data1/StackOverflow/_Robustness/English/lsm_md/2021-12' directory.
Data has been saved to individual md files in the '/data1/StackOverflow/_Robustness/English/lsm_md/2022-01' directory.
Data has been saved to individual md files in the '/data1/StackOverflow/_Robustness/English/lsm_md/2022-02' directory.
Data has been saved to individual md files in the '/data1/StackOverflow/_Robustness/English/lsm_md/2022-03' directory.
Data has been saved to individual md files in the '/data1/StackOverflow/_Robustness/English/lsm_md/2022-04' directory.
Data has been saved to individual md files in th

### 2) LSM Score Calculation

In [82]:
import itertools
import sys
# Append the directory containing the LSM module to the Python path
sys.path.append('/data1/StackOverflow/language-style-matching-python')
from lib import LSM

In [83]:
# (Bottleneck)
# Load files in the folder.
for i in range(len(year_month)):
	for root, dirs, files in os.walk(f'/data1/StackOverflow/_Robustness/English/lsm_md/{year_month[i]}'): # parameter (change 'Final')
		folks = {}
		for folk in files:
			with open(os.path.join(root, folk), 'r') as f:
				folks[folk] = LSM(f.read()) # load via LSM class.

	combos = itertools.combinations(folks.items(), 2) 
	compares = []
	everybody = sum(folks.values())
	for obj1, obj2 in [combo for combo in combos]:
		compares.append([obj1[0], #obj2[0], 
			#str(obj1[1].compare(obj2[1])), # compare one-to-one.
			str(obj1[1].compare(everybody))]) # compare one-to-avg.


	col = ['User1', 'Similarity_toAvg']
	df = pd.DataFrame(compares, columns = col)
	print(f"{i} out of {len(year_month)} has been processed")

	# save via pickle
	with open(f'/data1/StackOverflow/_Robustness/English/lsm_md/{year_month[i]}.pickle', 'wb') as fw: # parameter (change 'Final_pickle')
		pickle.dump(df, fw)

0 out of 25 has been processed
1 out of 25 has been processed
2 out of 25 has been processed
3 out of 25 has been processed
4 out of 25 has been processed
5 out of 25 has been processed
6 out of 25 has been processed
7 out of 25 has been processed
8 out of 25 has been processed
9 out of 25 has been processed
10 out of 25 has been processed
11 out of 25 has been processed
12 out of 25 has been processed
13 out of 25 has been processed
14 out of 25 has been processed
15 out of 25 has been processed
16 out of 25 has been processed
17 out of 25 has been processed
18 out of 25 has been processed
19 out of 25 has been processed
20 out of 25 has been processed
21 out of 25 has been processed
22 out of 25 has been processed
23 out of 25 has been processed
24 out of 25 has been processed


In [176]:
folder_path = '/data1/StackOverflow/_Robustness/English/lsm_md'

# Get a list of all files in the folder
file_list = os.listdir(folder_path)
file_list = sorted(file_list)

# Filter only pickle files
pickle_files = [file for file in file_list if file.endswith('.pickle')]

# Iterate through each pickle file and load it into a separate object
loaded_objects = []
for pickle_file in pickle_files:
    file_path = os.path.join(folder_path, pickle_file)
    file_name = os.path.splitext(pickle_file)[0]
    
    with open(file_path, 'rb') as f:
        loaded_object = pickle.load(f)
        #selected_indices = list(range(0, len(loaded_object), 14)) # indices of every 167 rows.
        selected_indices = [0, 13, 25, 36, 46, 55, 63, 70, 76, 81, 85, 88, 90]
        loaded_object = loaded_object.iloc[selected_indices]
        loaded_object['year_month'] = file_name # add year_month value.
        loaded_objects.append(loaded_object)

In [149]:
loaded_object # [13, 25, 36, 46, 55, 63, 70, 76, 81, 85, 88, 90, 91]

Unnamed: 0,User1,Similarity_toAvg
0,user_159691.md,0.2031041829186236
1,user_159691.md,0.2031041829186236
2,user_159691.md,0.2031041829186236
3,user_159691.md,0.2031041829186236
4,user_159691.md,0.2031041829186236
...,...,...
86,user_371337.md,0.18529895042259925
87,user_371337.md,0.18529895042259925
88,user_299611.md,0.19923549038342736
89,user_299611.md,0.19923549038342736


In [177]:
loaded_objects[0]

Unnamed: 0,User1,Similarity_toAvg,year_month
0,user_159691.md,0.2031041829186236,2021-09
13,user_141248.md,0.2013699692776784,2021-09
25,user_90320.md,0.1973316587824816,2021-09
36,user_319429.md,0.1902412360925035,2021-09
46,user_21655.md,0.1874134376131074,2021-09
55,user_349876.md,0.1886259838916542,2021-09
63,user_36232.md,0.1840327314192075,2021-09
70,user_18696.md,0.1904267010496144,2021-09
76,user_191178.md,0.1863038996834294,2021-09
81,user_5754.md,0.20751268026246,2021-09


In [179]:
# P_t Effect (Treated Vs. Control Group Identification)
for i in range(len(loaded_objects)):
    if i <= 11:
        loaded_objects[i]['T_d'] = 0
    else:
        loaded_objects[i]['T_d'] = 1
# T_d Effect (Before Vs. After Treatment)
    # 1~4 <- 0 / 5~12 <- 1 / 13~16 <- 0 / 17~ 25 <- 1
for j in range(len(loaded_objects)):
    if j < 4:
        loaded_objects[j]['P_t'] = 0
    elif j < 12:
        loaded_objects[j]['P_t'] = 1
    elif j < 16:
        loaded_objects[j]['P_t'] = 0
    else:
        loaded_objects[j]['P_t'] = 1
# Merge Everything
result_df = pd.concat(loaded_objects, axis=0).reset_index(drop = True)
result_df = result_df.rename(columns = {'User1': 'User'})
result_df['User'] = result_df['User'].apply(lambda x: x.replace('.md', ''))
result_df = result_df[['User', 'year_month', 'Similarity_toAvg', 'T_d', 'P_t']]

# Save via Pickle
#with open(' ', 'wb') as fw:
#    pickle.dump(result_df, fw)

In [180]:
result_df

Unnamed: 0,User,year_month,Similarity_toAvg,T_d,P_t
0,user_159691,2021-09,0.2031041829186236,0,0
1,user_141248,2021-09,0.2013699692776784,0,0
2,user_90320,2021-09,0.19733165878248168,0,0
3,user_319429,2021-09,0.19024123609250354,0,0
4,user_21655,2021-09,0.18741343761310741,0,0
...,...,...,...,...,...
320,user_191178,2023-09,0.18387543878735166,1,1
321,user_5754,2023-09,0.18938630056380576,1,1
322,user_371337,2023-09,0.19470323486431573,1,1
323,user_299611,2023-09,0.19103835390745255,1,1


In [181]:
result_lsm = result_df # Due to code integration
result_lsm = result_lsm[result_lsm['year_month'] < '2023-09'] # Fixed date range
# convert types
result_lsm['Similarity_toAvg'] = pd.to_numeric(result_lsm['Similarity_toAvg'])
result_lsm['ln_y'] = np.log(result_lsm['Similarity_toAvg'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result_lsm['Similarity_toAvg'] = pd.to_numeric(result_lsm['Similarity_toAvg'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result_lsm['ln_y'] = np.log(result_lsm['Similarity_toAvg'])


In [182]:
result_lsm

Unnamed: 0,User,year_month,Similarity_toAvg,T_d,P_t,ln_y
0,user_159691,2021-09,0.203104,0,0,-1.594036
1,user_141248,2021-09,0.201370,0,0,-1.602611
2,user_90320,2021-09,0.197332,0,0,-1.622869
3,user_319429,2021-09,0.190241,0,0,-1.659462
4,user_21655,2021-09,0.187413,0,0,-1.674438
...,...,...,...,...,...,...
307,user_191178,2023-08,0.200107,1,1,-1.608902
308,user_5754,2023-08,0.196390,1,1,-1.627654
309,user_371337,2023-08,0.185666,1,1,-1.683806
310,user_299611,2023-08,0.195024,1,1,-1.634634


In [183]:
# Model fit
sm.ols('ln_y ~ T_d + P_t + T_d * P_t + C(User)', result_lsm).fit(cov_type='HC3').summary()

# Save it to csv for Stata usage
#result_lsm.to_csv('lsm_did2.csv', index=False)

0,1,2,3
Dep. Variable:,ln_y,R-squared:,0.309
Model:,OLS,Adj. R-squared:,0.274
Method:,Least Squares,F-statistic:,7.452
Date:,"Thu, 02 May 2024",Prob (F-statistic):,5.16e-14
Time:,14:15:14,Log-Likelihood:,310.03
No. Observations:,312,AIC:,-588.1
Df Residuals:,296,BIC:,-528.2
Df Model:,15,,
Covariance Type:,HC3,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-1.6176,0.022,-74.255,0.000,-1.660,-1.575
C(User)[T.user_159691],-0.0433,0.028,-1.563,0.118,-0.098,0.011
C(User)[T.user_18696],-0.0428,0.023,-1.865,0.062,-0.088,0.002
C(User)[T.user_191178],-0.0595,0.027,-2.227,0.026,-0.112,-0.007
C(User)[T.user_21655],-0.0816,0.021,-3.955,0.000,-0.122,-0.041
C(User)[T.user_299611],0.1137,0.044,2.580,0.010,0.027,0.200
C(User)[T.user_319429],-0.0645,0.021,-3.072,0.002,-0.106,-0.023
C(User)[T.user_330962],-0.0847,0.021,-4.128,0.000,-0.125,-0.045
C(User)[T.user_349876],-0.0600,0.021,-2.795,0.005,-0.102,-0.018

0,1,2,3
Omnibus:,185.744,Durbin-Watson:,1.982
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1228.478
Skew:,2.48,Prob(JB):,1.74e-267
Kurtosis:,11.361,Cond. No.,19.3
