# Similarity Measure using LSM (Arqade)

In [1]:
# Import Modules
import pandas as pd
import numpy as np
import sqlite3
import pickle
import os
import matplotlib.pyplot as plt
import statsmodels.formula.api as sm
# Import Dataset
conn = sqlite3.connect('/data1/StackOverflow/_Robustness/Arqade/stack.db')
query = '''
SELECT creation_date, owner_user_id, body
FROM answers
WHERE creation_date >= '2021-09-01';
'''
df = pd.read_sql_query(query, conn)
conn.close()

In [2]:
df

Unnamed: 0,creation_date,owner_user_id,body
0,2021-09-01 01:14:47.137,279000.0,"<p>The punisher In Jiaming's answer is nice, b..."
1,2021-09-01 02:22:40.793,73976.0,<p>So I misunderstood the capabilities of the ...
2,2021-09-01 08:47:21.220,271559.0,<p>Unfortunately you can not import ANY assets...
3,2021-09-01 09:22:21.270,35555.0,<p>Steam natively allows choosing where to ins...
4,2021-09-01 11:28:17.450,9440.0,<p>It looks like somehow you switched off Smoo...
...,...,...,...
6276,2024-03-31 13:55:07.743,245340.0,<p>How to solve this is very similar to the an...
6277,2024-03-31 15:59:13.490,245340.0,"<h2>Outline</h2>\n<p>I have not used cheats, b..."
6278,2024-03-31 16:04:54.423,108916.0,<p>The default &quot;Maximum Number of Dropped...
6279,2024-03-31 18:27:47.853,181240.0,"<p><a href=""https://www.pikminwiki.com/Onion#W..."


In [3]:
# erase NAs
df_tags = df.dropna(subset=['owner_user_id'])
# Convert types
df_tags['owner_user_id'] = df_tags['owner_user_id'].astype(int).astype(str)
df_tags['creation_date'] = pd.to_datetime(df_tags['creation_date'])
# add year_month variable
df_tags['year_month'] = df_tags['creation_date'].dt.to_period('M')

# Get the total number of unique months
total_months = df_tags['year_month'].nunique()
# Count the number of unique months for each user
user_month_counts = df_tags.groupby('owner_user_id')['year_month'].nunique()
# Filter users who have written posts in every single month
users_in_every_month = user_month_counts[user_month_counts == total_months].index.tolist()

# Filtered data
df_filtered = df_tags[df_tags['owner_user_id'].isin(users_in_every_month)]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_tags['owner_user_id'] = df_tags['owner_user_id'].astype(int).astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_tags['creation_date'] = pd.to_datetime(df_tags['creation_date'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_tags['year_month'] = df_tags['creation_date'].dt.to_period

In [4]:
df_filtered

Unnamed: 0,creation_date,owner_user_id,body,year_month
13,2021-09-01 20:54:10.763,87579,"<p>According to <a href=""https://kirby.fandom....",2021-09
14,2021-09-01 21:58:20.287,87579,<p>If that &quot;shrinking jar&quot; is the on...,2021-09
16,2021-09-02 00:59:25.427,87579,<p>They're DLC items. Maybe you've got them as...,2021-09
19,2021-09-02 12:25:03.123,87579,"<p>Here's a video tutorial, starting from 8.05...",2021-09
31,2021-09-03 16:24:52.073,87579,"<p>According to the <a href=""https://steamcomm...",2021-09
...,...,...,...,...
6177,2024-03-06 13:06:47.573,87579,"<p>It's like a threshold, you need to fill the...",2024-03
6180,2024-03-06 20:55:12.110,87579,"<p>Not an actual answer, but I found a solutio...",2024-03
6199,2024-03-10 22:11:34.730,87579,<p>You just need a command block under the pre...,2024-03
6200,2024-03-11 09:55:27.047,87579,<p>These are the games I can recognize (with Y...,2024-03


In [9]:
df_grouped = df_filtered.groupby(['owner_user_id','year_month'])['body'].agg(lambda x: '\n'.join(x)).reset_index()


In [10]:
df_grouped

Unnamed: 0,owner_user_id,year_month,body
0,87579,2021-09,"<p>According to <a href=""https://kirby.fandom...."
1,87579,2021-10,"<p>According to <a href=""https://www.ign.com/w..."
2,87579,2021-11,"<p>Apparently, it has nothing to do with quest..."
3,87579,2021-12,"<p>The <a href=""https://en.wikipedia.org/wiki/..."
4,87579,2022-01,"<p>It's <a href=""https://play.google.com/store..."
5,87579,2022-02,<p>They do stay loaded. But unfortunately farm...
6,87579,2022-03,<p>You need to consider that each quick move g...
7,87579,2022-04,"<p>This worked for me, of course next move is ..."
8,87579,2022-05,"<p>Charizard's standard ability is Blaze, whil..."
9,87579,2022-06,<p>There's no need to figure out a complicated...


In [11]:
year_month = list(df_grouped['year_month'].unique().astype('str')) # get unique year_month

In [12]:
year_month

['2021-09',
 '2021-10',
 '2021-11',
 '2021-12',
 '2022-01',
 '2022-02',
 '2022-03',
 '2022-04',
 '2022-05',
 '2022-06',
 '2022-07',
 '2022-08',
 '2022-09',
 '2022-10',
 '2022-11',
 '2022-12',
 '2023-01',
 '2023-02',
 '2023-03',
 '2023-04',
 '2023-05',
 '2023-06',
 '2023-07',
 '2023-08',
 '2023-09',
 '2023-10',
 '2023-11',
 '2023-12',
 '2024-01',
 '2024-02',
 '2024-03']