### Similarity Measure using LSM (English)

In [1]:
# Import Modules
import pandas as pd
import numpy as np
import sqlite3
import pickle
import os
import matplotlib.pyplot as plt
import statsmodels.formula.api as sm
# Import Dataset
conn = sqlite3.connect('/data1/StackOverflow/_Robustness/English/stack.db')
query = '''
SELECT creation_date, owner_user_id, body
FROM answers
WHERE creation_date >= '2021-09-01';
'''
df = pd.read_sql_query(query, conn)
conn.close()

In [2]:
# erase NAs
df_tags = df.dropna(subset=['owner_user_id'])
# Convert types
df_tags['owner_user_id'] = df_tags['owner_user_id'].astype(int).astype(str)
df_tags['creation_date'] = pd.to_datetime(df_tags['creation_date'])
# add year_month variable
df_tags['year_month'] = df_tags['creation_date'].dt.to_period('M')

# Get the total number of unique months
total_months = df_tags['year_month'].nunique()
# Count the number of unique months for each user
user_month_counts = df_tags.groupby('owner_user_id')['year_month'].nunique()
# Filter users who have written posts in every single month
users_in_every_month = user_month_counts[user_month_counts == total_months].index.tolist()

# Filtered data
df_filtered = df_tags[df_tags['owner_user_id'].isin(users_in_every_month)]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_tags['owner_user_id'] = df_tags['owner_user_id'].astype(int).astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_tags['creation_date'] = pd.to_datetime(df_tags['creation_date'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_tags['year_month'] = df_tags['creation_date'].dt.to_period

In [3]:
df_filtered

Unnamed: 0,creation_date,owner_user_id,body,year_month
11,2021-09-01 13:50:26.877,90320,<p>Your second sentence is grammatically corre...,2021-09
13,2021-09-01 15:33:17.463,18696,<p><em>Dear</em> here is related to <em>dearth...,2021-09
14,2021-09-01 16:07:02.423,21655,<p>There's a 'backwards particularising' here....,2021-09
20,2021-09-01 20:47:35.080,36232,<p>I consulted a number of print dictionaries ...,2021-09
34,2021-09-02 10:13:19.073,21655,"<p>No, this does not preserve parallelism, whi...",2021-09
...,...,...,...,...
19636,2024-03-29 10:47:36.220,330962,<p>The relativiser “that” has been omitted in ...,2024-03
19638,2024-03-29 15:14:34.670,21655,"<p>This is inherently ambiguous, but pragmatic...",2024-03
19657,2024-03-31 21:59:08.953,330962,<blockquote>\n<p>Can &quot;albeit&quot; ever b...,2024-03
19658,2024-03-31 22:07:19.390,21655,"<p><a href=""https://www.scribbr.co.uk/definiti...",2024-03


In [4]:
len(df_filtered.owner_user_id.unique())

11

In [5]:
# Monthly aggregate for each user
df_grouped = df_filtered.groupby(['owner_user_id','year_month'])['body'].agg(lambda x: '\n'.join(x)).reset_index()
year_month = list(df_grouped['year_month'].unique().astype('str')) # get unique year_month
# Save individual body data into separate md file.
for i in range(len(year_month)):
    output_directory = f'/data1/StackOverflow/_Robustness/English/lsm_md/{year_month[i]}'
    os.makedirs(output_directory, exist_ok=True)
    data = df_grouped[df_grouped['year_month'] == year_month[i]]
    for index, row in data.iterrows():
        user_filename = f"{output_directory}/user_{row['owner_user_id']}.md"
        with open(user_filename, 'w') as md_file:
            md_file.write(f"## User {row['owner_user_id']}\n")
            md_file.write(row['body'] + '\n\n')
    print(f"Data has been saved to individual md files in the '{output_directory}' directory.")

Data has been saved to individual md files in the '/data1/StackOverflow/_Robustness/English/lsm_md/2021-09' directory.
Data has been saved to individual md files in the '/data1/StackOverflow/_Robustness/English/lsm_md/2021-10' directory.
Data has been saved to individual md files in the '/data1/StackOverflow/_Robustness/English/lsm_md/2021-11' directory.
Data has been saved to individual md files in the '/data1/StackOverflow/_Robustness/English/lsm_md/2021-12' directory.
Data has been saved to individual md files in the '/data1/StackOverflow/_Robustness/English/lsm_md/2022-01' directory.
Data has been saved to individual md files in the '/data1/StackOverflow/_Robustness/English/lsm_md/2022-02' directory.
Data has been saved to individual md files in the '/data1/StackOverflow/_Robustness/English/lsm_md/2022-03' directory.
Data has been saved to individual md files in the '/data1/StackOverflow/_Robustness/English/lsm_md/2022-04' directory.
Data has been saved to individual md files in th

### 2) LSM Score Calculation

In [6]:
import itertools
import sys
# Append the directory containing the LSM module to the Python path
sys.path.append('/data1/StackOverflow/language-style-matching-python')
from lib import LSM

In [7]:
# (Bottleneck)
# Load files in the folder.
for i in range(len(year_month)):
	for root, dirs, files in os.walk(f'/data1/StackOverflow/_Robustness/English/lsm_md/{year_month[i]}'): # parameter (change 'Final')
		folks = {}
		for folk in files:
			with open(os.path.join(root, folk), 'r') as f:
				folks[folk] = LSM(f.read()) # load via LSM class.

	combos = itertools.combinations(folks.items(), 2) 
	compares = []
	everybody = sum(folks.values())
	for obj1, obj2 in [combo for combo in combos]:
		compares.append([obj1[0], #obj2[0], 
			#str(obj1[1].compare(obj2[1])), # compare one-to-one.
			str(obj1[1].compare(everybody))]) # compare one-to-avg.


	col = ['User1', 'Similarity_toAvg']
	df = pd.DataFrame(compares, columns = col)
	print(f"{i} out of {len(year_month)} has been processed")

	# save via pickle
	with open(f'/data1/StackOverflow/_Robustness/English/lsm_md/{year_month[i]}.pickle', 'wb') as fw: # parameter (change 'Final_pickle')
		pickle.dump(df, fw)

0 out of 31 has been processed
1 out of 31 has been processed
2 out of 31 has been processed
3 out of 31 has been processed
4 out of 31 has been processed
5 out of 31 has been processed
6 out of 31 has been processed
7 out of 31 has been processed
8 out of 31 has been processed
9 out of 31 has been processed
10 out of 31 has been processed
11 out of 31 has been processed
12 out of 31 has been processed
13 out of 31 has been processed
14 out of 31 has been processed
15 out of 31 has been processed
16 out of 31 has been processed
17 out of 31 has been processed
18 out of 31 has been processed
19 out of 31 has been processed
20 out of 31 has been processed
21 out of 31 has been processed
22 out of 31 has been processed
23 out of 31 has been processed
24 out of 31 has been processed
25 out of 31 has been processed
26 out of 31 has been processed
27 out of 31 has been processed
28 out of 31 has been processed
29 out of 31 has been processed
30 out of 31 has been processed


In [9]:
folder_path = '/data1/StackOverflow/_Robustness/English/lsm_md'

# Get a list of all files in the folder
file_list = os.listdir(folder_path)
file_list = sorted(file_list)

# Filter only pickle files
pickle_files = [file for file in file_list if file.endswith('.pickle')]

# Iterate through each pickle file and load it into a separate object
loaded_objects = []
for pickle_file in pickle_files:
    file_path = os.path.join(folder_path, pickle_file)
    file_name = os.path.splitext(pickle_file)[0]
    
    with open(file_path, 'rb') as f:
        loaded_object = pickle.load(f)
        selected_indices = list(range(0, len(loaded_object), 308)) # indices of every 167 rows.
        loaded_object = loaded_object.iloc[selected_indices]
        loaded_object['year_month'] = file_name # add year_month value.
        loaded_objects.append(loaded_object)

In [11]:
loaded_objects[24]

Unnamed: 0,User1,Similarity_toAvg,year_month
0,user_141248.md,0.1921406498290503,2023-09


In [17]:
# P_t Effect (Treated Vs. Control Group Identification)
for i in range(len(loaded_objects)):
    if i <= 11:
        loaded_objects[i]['T_d'] = 0
    else:
        loaded_objects[i]['T_d'] = 1
# T_d Effect (Before Vs. After Treatment)
    # 1~4 <- 0 / 5~12 <- 1 / 13~16 <- 0 / 17~ 25 <- 1
for j in range(len(loaded_objects)):
    if j < 4:
        loaded_objects[j]['P_t'] = 0
    elif j < 12:
        loaded_objects[j]['P_t'] = 1
    elif j < 16:
        loaded_objects[j]['P_t'] = 0
    else:
        loaded_objects[j]['P_t'] = 1
# Merge Everything
result_df = pd.concat(loaded_objects, axis=0).reset_index(drop = True)
result_df = result_df.rename(columns = {'User1': 'User'})
result_df['User'] = result_df['User'].apply(lambda x: x.replace('.md', ''))
result_df = result_df[['User', 'year_month', 'Similarity_toAvg', 'T_d', 'P_t']]

# Save via Pickle
#with open(' ', 'wb') as fw:
#    pickle.dump(result_df, fw)

In [18]:
result_df

Unnamed: 0,User,year_month,Similarity_toAvg,T_d,P_t
0,user_141248,2021-09,0.2021809733669708,0,0
1,user_141248,2021-10,0.187515110643949,0,0
2,user_141248,2021-11,0.1949285774051695,0,0
3,user_141248,2021-12,0.1947856916391446,0,0
4,user_141248,2022-01,0.1893415350688891,0,1
5,user_141248,2022-02,0.1939914231180908,0,1
6,user_141248,2022-03,0.1943181251870329,0,1
7,user_141248,2022-04,0.2586302094824325,0,1
8,user_141248,2022-05,0.1938365279821237,0,1
9,user_141248,2022-06,0.2007901359772518,0,1


In [19]:
result_lsm = result_df # Due to code integration
result_lsm = result_lsm[result_lsm['year_month'] < '2023-09'] # Fixed date range
# convert types
result_lsm['Similarity_toAvg'] = pd.to_numeric(result_lsm['Similarity_toAvg'])
result_lsm['ln_y'] = np.log(result_lsm['Similarity_toAvg'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result_lsm['Similarity_toAvg'] = pd.to_numeric(result_lsm['Similarity_toAvg'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result_lsm['ln_y'] = np.log(result_lsm['Similarity_toAvg'])


In [20]:
result_lsm

Unnamed: 0,User,year_month,Similarity_toAvg,T_d,P_t,ln_y
0,user_141248,2021-09,0.202181,0,0,-1.598592
1,user_141248,2021-10,0.187515,0,0,-1.673896
2,user_141248,2021-11,0.194929,0,0,-1.635122
3,user_141248,2021-12,0.194786,0,0,-1.635855
4,user_141248,2022-01,0.189342,0,1,-1.664203
5,user_141248,2022-02,0.193991,0,1,-1.639941
6,user_141248,2022-03,0.194318,0,1,-1.638259
7,user_141248,2022-04,0.25863,0,1,-1.352356
8,user_141248,2022-05,0.193837,0,1,-1.64074
9,user_141248,2022-06,0.20079,0,1,-1.605495


In [21]:
# Model fit
sm.ols('ln_y ~ T_d + P_t + T_d * P_t + C(User)', result_lsm).fit(cov_type='HC3').summary()

# Save it to csv for Stata usage
#result_lsm.to_csv('lsm_did2.csv', index=False)

0,1,2,3
Dep. Variable:,ln_y,R-squared:,0.099
Model:,OLS,Adj. R-squared:,-0.036
Method:,Least Squares,F-statistic:,0.3622
Date:,"Wed, 01 May 2024",Prob (F-statistic):,0.781
Time:,13:36:54,Log-Likelihood:,23.218
No. Observations:,24,AIC:,-38.44
Df Residuals:,20,BIC:,-33.72
Df Model:,3,,
Covariance Type:,HC3,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-1.6359,0.018,-92.152,0.000,-1.671,-1.601
T_d,0.0882,0.115,0.768,0.442,-0.137,0.313
P_t,0.0316,0.043,0.733,0.464,-0.053,0.116
T_d:P_t,-0.1161,0.122,-0.952,0.341,-0.355,0.123

0,1,2,3
Omnibus:,22.44,Durbin-Watson:,2.434
Prob(Omnibus):,0.0,Jarque-Bera (JB):,30.763
Skew:,1.999,Prob(JB):,2.09e-07
Kurtosis:,6.844,Cond. No.,8.44
