### Similarity Measure Using the Language Style Matching (LSM) Score

In [1]:
# Import Modules
import pandas as pd
import numpy as np
import sqlite3
import pickle
import os
import matplotlib.pyplot as plt
import statsmodels.formula.api as sm
# Import Dataset
# Read creation date, author id and body of every row in the `answers` table
# whose creation_date is >= '2021-09-01'.
conn = sqlite3.connect('/data1/StackOverflow/stackexchange-to-sqlite/stack.db')
query = '''
SELECT creation_date, owner_user_id, body
FROM answers
WHERE creation_date >= '2021-09-01';
'''
try:
    df_tags = pd.read_sql_query(query, conn)
finally:
    # Close the connection even when the query raises, so a failed cell run
    # does not leave an open handle on the database file in the kernel.
    conn.close()

### 1) Preprocessing

In [2]:
# Clean the raw answers in one chain:
#   * drop rows without an author id,
#   * store the author id as a string of digits (cast through int first),
#   * parse creation_date into datetimes,
#   * derive a monthly period column `year_month`.
df_tags = (
    df_tags
    .dropna(subset=['owner_user_id'])
    .assign(
        owner_user_id=lambda frame: frame['owner_user_id'].astype(int).astype(str),
        creation_date=lambda frame: pd.to_datetime(frame['creation_date']),
        # Evaluated after creation_date above, so `.dt` sees parsed datetimes.
        year_month=lambda frame: frame['creation_date'].dt.to_period('M'),
    )
)

# Number of distinct months present in the data
total_months = df_tags['year_month'].nunique()
# Number of distinct months in which each user posted
user_month_counts = df_tags.groupby('owner_user_id')['year_month'].nunique()
# Users who posted in every single month of the window
users_in_every_month = user_month_counts.index[user_month_counts.eq(total_months)].tolist()

# Keep only the answers written by those always-active users
is_always_active = df_tags['owner_user_id'].isin(users_in_every_month)
df_filtered = df_tags[is_always_active]

In [3]:
# Preview the answers kept after filtering to users active in every month.
df_filtered

Unnamed: 0,creation_date,owner_user_id,body,year_month
6,2021-09-01 00:02:37.350,869736,<p>The error is correct. <code>getInputStream...,2021-09
8,2021-09-01 00:03:36.527,8690857,<p>You should not use <code>.map</code> for si...,2021-09
12,2021-09-01 00:04:33.053,16406,<p>The reason you need line B is called <a hre...,2021-09
48,2021-09-01 00:21:00.167,10527,<p>This is a REALLY good example as to why you...,2021-09
72,2021-09-01 00:34:54.043,5483526,<p>You can do a breadth-first search on the gr...,2021-09
...,...,...,...,...
3129967,2023-09-03 09:10:41.243,315168,<p>Here is another example for doing Pandas ch...,2023-09
3129971,2023-09-03 09:13:36.433,5044042,<p>This is a common class to get the current l...,2023-09
3129987,2023-09-03 09:30:12.230,2610061,<p>As a little &quot;Sunday morning at breakfa...,2023-09
3129994,2023-09-03 09:33:12.873,252228,"<p>Is <a href=""https://gdml.web.cern.ch/GDML/g...",2023-09


In [5]:
# Merge all answers a user wrote within one month into a single
# newline-separated document (one row per user and month).
bodies_by_user_month = df_filtered.groupby(['owner_user_id', 'year_month'])['body']
df_grouped = bodies_by_user_month.agg('\n'.join).reset_index()


In [6]:
# Preview the per-user, per-month concatenated answer bodies.
df_grouped

Unnamed: 0,owner_user_id,year_month,body
0,10138734,2021-09,<blockquote>\n<p>What happens to BTREE index w...
1,10138734,2021-10,"<pre class=""lang-sql prettyprint-override""><co..."
2,10138734,2021-11,"<pre class=""lang-sql prettyprint-override""><co..."
3,10138734,2021-12,<ol>\n<li>In your SP there is a row</li>\n</ol...
4,10138734,2022-01,<p>From the performance looking point I recomm...
...,...,...,...
7720,992484,2023-05,<p>I just want to point out the obvious that <...
7721,992484,2023-06,"<p>So, based on this answer <a href=""https://s..."
7722,992484,2023-07,<p>All the <code>setBounds</code> and <code>se...
7723,992484,2023-08,<p>Decouple your concerns.</p>\n<p>Consider th...


In [4]:
# Monthly aggregate for each user: all answers a user wrote in a month are
# joined into one newline-separated document.
df_grouped = df_filtered.groupby(['owner_user_id','year_month'])['body'].agg('\n'.join).reset_index()
year_month = list(df_grouped['year_month'].unique().astype('str')) # get unique year_month
# Save individual body data into separate md files, one directory per month
# and one file per user.
for month_label in year_month:
    output_directory = f'/data1/StackOverflow/language-style-matching-python/Final/{month_label}'
    os.makedirs(output_directory, exist_ok=True)
    data = df_grouped[df_grouped['year_month'] == month_label]
    # Only two columns are needed, so iterate over them directly instead of
    # materialising a Series per row with iterrows().
    for user_id, body in zip(data['owner_user_id'], data['body']):
        user_filename = f"{output_directory}/user_{user_id}.md"
        # Explicit UTF-8: the answer bodies are free text, and the platform
        # default encoding is locale-dependent and may not cover every character.
        with open(user_filename, 'w', encoding='utf-8') as md_file:
            md_file.write(f"## User {user_id}\n")
            md_file.write(body + '\n\n')
    print(f"Data has been saved to individual md files in the '{output_directory}' directory.")

Data has been saved to individual md files in the '/data1/StackOverflow/language-style-matching-python/Final/2021-09' directory.
Data has been saved to individual md files in the '/data1/StackOverflow/language-style-matching-python/Final/2021-10' directory.
Data has been saved to individual md files in the '/data1/StackOverflow/language-style-matching-python/Final/2021-11' directory.
Data has been saved to individual md files in the '/data1/StackOverflow/language-style-matching-python/Final/2021-12' directory.
Data has been saved to individual md files in the '/data1/StackOverflow/language-style-matching-python/Final/2022-01' directory.
Data has been saved to individual md files in the '/data1/StackOverflow/language-style-matching-python/Final/2022-02' directory.
Data has been saved to individual md files in the '/data1/StackOverflow/language-style-matching-python/Final/2022-03' directory.
Data has been saved to individual md files in the '/data1/StackOverflow/language-style-matching-p

### 2) LSM Score Calculation

In [23]:
import itertools
import sys
# Append the directory containing the LSM module to the Python path
# so that the `from lib import LSM` below can be resolved.
sys.path.append('/data1/StackOverflow/language-style-matching-python')
from lib import LSM

In [None]:
# (Bottleneck)
# Load files in the folder.
for i in range(len(year_month)):
	for root, dirs, files in os.walk(f'/data1/StackOverflow/language-style-matching-python/Final/{year_month[i]}'): # parameter (change 'Final')
		folks = {}
		for folk in files:
			with open(os.path.join(root, folk), 'r') as f:
				folks[folk] = LSM(f.read()) # load via LSM class.

	combos = itertools.combinations(folks.items(), 2) 
	compares = []
	everybody = sum(folks.values())
	for obj1, obj2 in [combo for combo in combos]:
		compares.append([obj1[0], #obj2[0], 
			#str(obj1[1].compare(obj2[1])), # compare one-to-one.
			str(obj1[1].compare(everybody))]) # compare one-to-avg.


	col = ['User1', 'Similarity_toAvg']
	df = pd.DataFrame(compares, columns = col)
	print(f"{i} out of {len(year_month)} has been processed")

	# save via pickle
	with open(f'/data1/StackOverflow/language-style-matching-python/Final_pickle/{year_month[i]}.pickle', 'wb') as fw: # parameter (change 'Final_pickle')
		pickle.dump(df, fw)

In [3]:
folder_path = '/data1/StackOverflow/language-style-matching-python/Final_pickle'

# Get a list of all files in the folder
file_list = os.listdir(folder_path)
file_list = sorted(file_list)

# Filter only pickle files
pickle_files = [file for file in file_list if file.endswith('.pickle')]

# Iterate through each pickle file and load it into a separate object
loaded_objects = []
for pickle_file in pickle_files:
    file_path = os.path.join(folder_path, pickle_file)
    file_name = os.path.splitext(pickle_file)[0]
    
    with open(file_path, 'rb') as f:
        loaded_object = pickle.load(f)
        selected_indices = list(range(0, len(loaded_object), 308)) # indices of every 167 rows.
        loaded_object = loaded_object.iloc[selected_indices]
        loaded_object['year_month'] = file_name # add year_month value.
        loaded_objects.append(loaded_object)

In [None]:
# Inspect the frame at position 24 (the 25th pickle in sorted file order).
loaded_objects[24]

### 3) DiD Setting

In [None]:
# Tag every monthly frame (in list order) with the two DiD indicators.
#   T_d: 0 for the first 12 frames, 1 for every later frame.
#   P_t: frames 1~4 <- 0 / 5~12 <- 1 / 13~16 <- 0 / 17~25 <- 1
# NOTE(review): read as T_d = treated-vs-control cohort and P_t = before-vs-after
# period; the earlier comments paired these labels the other way round -- confirm.
# NOTE(review): the flags depend on list position, i.e. on loaded_objects holding
# one frame per consecutive month in chronological order.
for position, monthly_frame in enumerate(loaded_objects):
    monthly_frame['T_d'] = 0 if position <= 11 else 1
    in_zero_window = position < 4 or 12 <= position < 16
    monthly_frame['P_t'] = 0 if in_zero_window else 1

# Merge Everything: stack the monthly frames, rename the user column,
# strip the '.md' file suffix from the user label and fix the column order.
result_df = (
    pd.concat(loaded_objects, axis=0)
    .reset_index(drop = True)
    .rename(columns = {'User1': 'User'})
)
result_df['User'] = result_df['User'].str.replace('.md', '', regex=False)
result_df = result_df[['User', 'year_month', 'Similarity_toAvg', 'T_d', 'P_t']]

# Save via Pickle
#with open(' ', 'wb') as fw:
#    pickle.dump(result_df, fw)

### 4) Model Fitting

In [None]:
# Build the modelling frame from result_df without mutating it:
#   * drop the '2023-09' rows (fixed date range),
#   * convert the similarity score (stored as text) to a numeric column,
#   * add ln_y, the natural log of the similarity score.
# .loc + .assign yields a fresh frame, so no column is assigned onto a filtered
# slice (chained-assignment hazard) and the cell can be re-run safely.
# NOTE(review): a similarity of exactly 0 would give ln_y = -inf -- confirm that
# such scores cannot occur before fitting.
result_lsm = (
    result_df
    .loc[result_df['year_month'] != '2023-09']
    .assign(Similarity_toAvg=lambda frame: pd.to_numeric(frame['Similarity_toAvg']))
    .assign(ln_y=lambda frame: np.log(frame['Similarity_toAvg']))
)

In [None]:
# Model fit
# OLS of log similarity on the two DiD indicators, their interaction and a
# categorical term per user; standard errors use the HC3 covariance estimator.
did_formula = 'ln_y ~ T_d + P_t + T_d * P_t + C(User)'
did_model = sm.ols(formula=did_formula, data=result_lsm)
did_fit = did_model.fit(cov_type='HC3')
did_fit.summary()

# Save it to csv for Stata usage
#result_lsm.to_csv('lsm_did2.csv', index=False)