### Analyze UserCluster.ipynb file
- 일단 LSM 기본 계산한 것에 대한 유저들 (150?)을 기준으로 얘들을 old 랑 new로 나눠봐야할 듯.

In [1]:
# Import Modules
import pandas as pd
import numpy as np
import sqlite3
import pickle
import os
import matplotlib.pyplot as plt
import statsmodels.formula.api as sm
# Import Dataset
conn = sqlite3.connect('/data1/StackOverflow/stackexchange-to-sqlite/stack.db')
query = '''
SELECT id, creation_date, owner_user_id, body, parent_id
FROM answers
WHERE creation_date > '2021-09-01'
AND creation_date < '2023-09-01';
'''
df = pd.read_sql_query(query, conn)
conn.close()

In [2]:
# erase NAs
df = df.dropna(subset=['owner_user_id'])
# Convert types
df['owner_user_id'] = df['owner_user_id'].astype(int).astype(str)
df['parent_id'] = df['parent_id'].astype(int).astype(str)
df['creation_date'] = pd.to_datetime(df['creation_date'])

# add year_month variable
df['year_month'] = df['creation_date'].dt.to_period('M')

# Get the total number of unique months
total_months = df['year_month'].nunique()
# Count the number of unique months for each user
user_month_counts = df.groupby('owner_user_id')['year_month'].nunique()
# Filter users who have written posts in every single month
users_in_every_month = user_month_counts[user_month_counts == total_months].index.tolist()

# Filtered data
df_filtered = df[df['owner_user_id'].isin(users_in_every_month)]
df_filtered

Unnamed: 0,id,creation_date,owner_user_id,body,parent_id,year_month
6,69006435,2021-09-01 00:02:37.350,869736,<p>The error is correct. <code>getInputStream...,69006289,2021-09
8,69006438,2021-09-01 00:03:36.527,8690857,<p>You should not use <code>.map</code> for si...,69005880,2021-09
12,69006446,2021-09-01 00:04:33.053,16406,<p>The reason you need line B is called <a hre...,69005820,2021-09
16,69006451,2021-09-01 00:05:09.193,478884,<p>For example you could do it this way:</p>\n...,69006345,2021-09
29,69006475,2021-09-01 00:09:56.233,12957340,<p>Here is another potential solution using ti...,69005901,2021-09
...,...,...,...,...,...,...
3124903,77019800,2023-08-31 23:36:49.467,2156621,<p><code>StreamZip</code> + <code>expand</code...,77005145,2023-08
3124915,77019822,2023-08-31 23:44:48.760,209103,"<p>The <a href=""https://firebase.google.com/do...",77019651,2023-08
3124917,77019826,2023-08-31 23:46:31.700,939860,<p>You don't need a &quot;common key&quot; for...,77019724,2023-08
3124920,77019833,2023-08-31 23:49:08.340,589924,<p>The <code>sprintf</code> operator behaves l...,33897817,2023-08


In [3]:
# Import Questions data
conn = sqlite3.connect('/data1/StackOverflow/stackexchange-to-sqlite/stack.db')
query = '''
SELECT id, tags
FROM questions
WHERE creation_date > '2020-09-01'
AND creation_date < '2023-09-01';
'''
df_tags = pd.read_sql_query(query, conn)
conn.close()

In [4]:
df_tags['id'] = df_tags['id'].astype(str)
df_merge = pd.merge(df_filtered, df_tags, left_on = 'parent_id', right_on = 'id', how = 'left')
df_merge.drop('id_y', axis = 1, inplace = True)
df_merge = df_merge.rename(columns = {'id_x': 'id'})
df_merge

Unnamed: 0,id,creation_date,owner_user_id,body,parent_id,year_month,tags
0,69006435,2021-09-01 00:02:37.350,869736,<p>The error is correct. <code>getInputStream...,69006289,2021-09,"[""java"", ""eclipse"", ""memory-leaks"", ""io""]"
1,69006438,2021-09-01 00:03:36.527,8690857,<p>You should not use <code>.map</code> for si...,69005880,2021-09,"[""javascript"", ""arrays"", ""reactjs"", ""map-funct..."
2,69006446,2021-09-01 00:04:33.053,16406,<p>The reason you need line B is called <a hre...,69005820,2021-09,"[""c++"", ""class"", ""pointers"", ""inheritance"", ""r..."
3,69006451,2021-09-01 00:05:09.193,478884,<p>For example you could do it this way:</p>\n...,69006345,2021-09,"[""excel"", ""vba"", ""parsing"", ""offset""]"
4,69006475,2021-09-01 00:09:56.233,12957340,<p>Here is another potential solution using ti...,69005901,2021-09,"[""r""]"
...,...,...,...,...,...,...,...
437024,77019800,2023-08-31 23:36:49.467,2156621,<p><code>StreamZip</code> + <code>expand</code...,77005145,2023-08,"[""dart""]"
437025,77019822,2023-08-31 23:44:48.760,209103,"<p>The <a href=""https://firebase.google.com/do...",77019651,2023-08,"[""javascript"", ""firebase"", ""next.js"", ""server-..."
437026,77019826,2023-08-31 23:46:31.700,939860,<p>You don't need a &quot;common key&quot; for...,77019724,2023-08,"[""sql"", ""postgresql"", ""join""]"
437027,77019833,2023-08-31 23:49:08.340,589924,<p>The <code>sprintf</code> operator behaves l...,33897817,2023-08,


In [5]:
df_merge['tags'] = df_merge['tags'].str.replace('[\["\]]', '', regex=True)
df_merge['tags'] = df_merge['tags'].str.replace('c#','Csharp')
df_merge['tags'] = df_merge['tags'].str.replace('c++','Cpp')
df_merge['tags'] = df_merge['tags'].str.replace('.','DOT')
df_merge['tags'] = df_merge['tags'].str.replace('><',' ')
df_merge['tags'] = df_merge['tags'].str.replace('>','')
df_merge['tags'] = df_merge['tags'].str.replace('-','')
df_merge['tags'] = df_merge['tags'].str.replace('"', '')

In [6]:
# 3-1 Complete
new_merge = df_merge.groupby('owner_user_id')[['tags']].agg(lambda x: ', '.join(map(str,x))).reset_index()
new_merge

Unnamed: 0,owner_user_id,tags
0,10008173,"docker, filesystems, docker, express, dockerfi..."
1,10112124,"angular, authentication, rxjs, ngrx, ngrxentit..."
2,10138734,"mysql, rubyonrails, ruby, railsmigrations, mys..."
3,10140124,"elasticsearch, javascript, ifstatement, boolea..."
4,10157127,"flutter, flutterlayout, flutterdependencies, f..."
...,...,...
790,9952196,"sqlite, linux, bash, shell, sed, linux, bash, ..."
791,9957710,"arrays, struct, julia, arrays, julia, multithr..."
792,99692,"rust, rust, enums, io, rust, rust, lifetime, o..."
793,9971759,"python, ortools, python, constraints, nonlinea..."


### User Creation Date Info

In [7]:
# Import Users Data
conn = sqlite3.connect('/data1/StackOverflow/stackexchange-to-sqlite/stack.db')
query = '''
SELECT id, creation_date
FROM users;
'''
user_df = pd.read_sql_query(query, conn)
conn.close()

In [8]:
user_df

Unnamed: 0,id,creation_date
0,-1016,2023-06-28 20:22:59.967
1,-1015,2023-06-28 18:50:43.493
2,-1014,2023-02-17 19:52:56.213
3,-1013,2023-02-17 19:25:19.953
4,-1012,2023-02-15 23:24:13.363
...,...,...
21077275,22492432,2023-09-03 11:44:44.337
21077276,22492433,2023-09-03 11:44:49.580
21077277,22492434,2023-09-03 11:44:57.507
21077278,22492435,2023-09-03 11:45:03.880


In [10]:
new_merge.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 795 entries, 0 to 794
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   owner_user_id  795 non-null    object
 1   tags           795 non-null    object
dtypes: object(2)
memory usage: 12.5+ KB


In [13]:
user_df['id'] = user_df['id'].astype('str')

In [16]:
df_final =  pd.merge(new_merge, user_df, left_on = 'owner_user_id', right_on= 'id', how = 'left')
df_final

Unnamed: 0,owner_user_id,tags,id,creation_date
0,10008173,"docker, filesystems, docker, express, dockerfi...",10008173,2018-06-28 22:58:51.250
1,10112124,"angular, authentication, rxjs, ngrx, ngrxentit...",10112124,2018-07-20 16:58:43.987
2,10138734,"mysql, rubyonrails, ruby, railsmigrations, mys...",10138734,2018-07-26 11:23:55.560
3,10140124,"elasticsearch, javascript, ifstatement, boolea...",10140124,2018-07-26 15:50:25.917
4,10157127,"flutter, flutterlayout, flutterdependencies, f...",10157127,2018-07-30 19:07:40.430
...,...,...,...,...
790,9952196,"sqlite, linux, bash, shell, sed, linux, bash, ...",9952196,2018-06-17 06:19:32.710
791,9957710,"arrays, struct, julia, arrays, julia, multithr...",9957710,2018-06-18 16:03:36.957
792,99692,"rust, rust, enums, io, rust, rust, lifetime, o...",99692,2009-05-02 00:48:46.727
793,9971759,"python, ortools, python, constraints, nonlinea...",9971759,2018-06-21 08:00:06.347


In [21]:
df_final['year_month'] = pd.to_datetime(df_final['creation_date']).dt.to_period('M')
df_final

Unnamed: 0,owner_user_id,tags,id,creation_date,year_month
0,10008173,"docker, filesystems, docker, express, dockerfi...",10008173,2018-06-28 22:58:51.250,2018-06
1,10112124,"angular, authentication, rxjs, ngrx, ngrxentit...",10112124,2018-07-20 16:58:43.987,2018-07
2,10138734,"mysql, rubyonrails, ruby, railsmigrations, mys...",10138734,2018-07-26 11:23:55.560,2018-07
3,10140124,"elasticsearch, javascript, ifstatement, boolea...",10140124,2018-07-26 15:50:25.917,2018-07
4,10157127,"flutter, flutterlayout, flutterdependencies, f...",10157127,2018-07-30 19:07:40.430,2018-07
...,...,...,...,...,...
790,9952196,"sqlite, linux, bash, shell, sed, linux, bash, ...",9952196,2018-06-17 06:19:32.710,2018-06
791,9957710,"arrays, struct, julia, arrays, julia, multithr...",9957710,2018-06-18 16:03:36.957,2018-06
792,99692,"rust, rust, enums, io, rust, rust, lifetime, o...",99692,2009-05-02 00:48:46.727,2009-05
793,9971759,"python, ortools, python, constraints, nonlinea...",9971759,2018-06-21 08:00:06.347,2018-06


In [34]:
df_final['year_month'].value_counts().sort_index()

year_month
2008-08     7
2008-09    27
2008-10     2
2008-11     1
2008-12     3
           ..
2021-04     1
2021-06     2
2021-07     4
2021-08     2
2021-09     2
Freq: M, Name: count, Length: 154, dtype: int64

In [40]:
import numpy as np
import re
from sklearn.feature_extraction.text import CountVectorizer

def preprocess_text(text):
    # Remove punctuation and convert text to lowercase
    text = re.sub(r'[^\w\s]', '', text)
    text = text.lower()
    return text

def get_function_word_categories():
    # Define function words categorized by linguistic type
    return {
        "pronouns": ["i", "me", "my", "mine", "you", "your", "yours", "he", "him", "his", 
                     "she", "her", "hers", "we", "us", "our", "ours", "they", "them", 
                     "their", "theirs", "it", "its"],
        "articles": ["a", "an", "the"],
        "conjunctions": ["and", "but", "or", "so", "because", "if", "when", "where", "while"],
        "prepositions": ["for", "of", "at", "by", "with", "about", "to", "in", "on"],
    }

def calculate_fwu(text, function_words):
    # Preprocess text and calculate total word count
    text = preprocess_text(text)
    total_words = len(text.split())
    
    # Initialize vectorizer and count function words
    vectorizer = CountVectorizer(vocabulary=function_words)
    fwu_counts = vectorizer.fit_transform([text]).toarray().flatten()
    
    # Normalize function word usage by total words
    fwu_normalized = fwu_counts / total_words
    return fwu_normalized

def calculate_lsm_by_category(text_individual, text_community):
    # Get categorized function words
    categories = get_function_word_categories()
    
    # Initialize dictionary to store LSM scores by category
    lsm_scores_by_category = {}
    
    # Calculate LSM for each category
    for category, function_words in categories.items():
        # Calculate FWU for individual and community texts within each category
        fwu_individual = calculate_fwu(text_individual, function_words)
        fwu_community = calculate_fwu(text_community, function_words)
        
        # Avoid division by zero by replacing zero counts with a small constant
        fwu_individual = np.where(fwu_individual == 0, 1e-10, fwu_individual)
        fwu_community = np.where(fwu_community == 0, 1e-10, fwu_community)
        
        # Calculate LSM for the current category
        lsm_scores = 1 - (np.abs(fwu_individual - fwu_community) / (fwu_individual + fwu_community))
        lsm_scores_by_category[category] = np.mean(lsm_scores)  # Average LSM score for the category
    
    # Calculate the overall LSM score as the mean of all category scores
    overall_lsm_score = np.mean(list(lsm_scores_by_category.values()))
    return overall_lsm_score, lsm_scores_by_category

# Sample usage
text_individual = "I enjoy working with people, and I believe we can do great things together."
text_community = "We believe that working with others can help us achieve great things together."

overall_lsm_score, lsm_scores_by_category = calculate_lsm_by_category(text_individual, text_community)

print(f"Overall Linguistic Style Match (LSM) score: {overall_lsm_score:.2f}")
print("LSM Scores by Category:")
for category, score in lsm_scores_by_category.items():
    print(f"  {category}: {score:.2f}")


Overall Linguistic Style Match (LSM) score: 0.96
LSM Scores by Category:
  pronouns: 0.95
  articles: 1.00
  conjunctions: 0.89
  prepositions: 1.00


# Final Version of LSM

In [41]:
# Multiple users Setting
import numpy as np
import re
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

def preprocess_text(text):
    # Remove punctuation and convert text to lowercase
    text = re.sub(r'[^\w\s]', '', text)
    text = text.lower()
    return text

def get_function_word_categories():
    # Define function words categorized by linguistic type
    return {
        "pronouns": ["i", "me", "my", "mine", "you", "your", "yours", "he", "him", "his", 
                     "she", "her", "hers", "we", "us", "our", "ours", "they", "them", 
                     "their", "theirs", "it", "its"],
        "articles": ["a", "an", "the"],
        "conjunctions": ["and", "but", "or", "so", "because", "if", "when", "where", "while"],
        "prepositions": ["for", "of", "at", "by", "with", "about", "to", "in", "on"],
    }

def calculate_fwu(text, function_words):
    # Preprocess text and calculate total word count
    text = preprocess_text(text)
    total_words = len(text.split())
    
    # Initialize vectorizer and count function words
    vectorizer = CountVectorizer(vocabulary=function_words)
    fwu_counts = vectorizer.fit_transform([text]).toarray().flatten()
    
    # Normalize function word usage by total words
    fwu_normalized = fwu_counts / total_words
    return fwu_normalized

def calculate_community_fwu(texts):
    # Calculate the community-level FWU by averaging FWU across all users' texts
    categories = get_function_word_categories()
    community_fwu = {}
    
    for category, function_words in categories.items():
        category_fwu = np.array([calculate_fwu(text, function_words) for text in texts])
        community_fwu[category] = np.mean(category_fwu, axis=0)  # Average FWU across all texts
    
    return community_fwu

def calculate_lsm_for_all_users(texts):
    # Get categorized function words
    categories = get_function_word_categories()
    
    # Calculate community FWU
    community_fwu = calculate_community_fwu(texts)
    
    # Calculate LSM scores for each user
    lsm_scores_all_users = []
    
    for i, text in enumerate(texts):
        individual_lsm_scores = {}
        for category, function_words in categories.items():
            fwu_individual = calculate_fwu(text, function_words)
            fwu_community = community_fwu[category]
            
            # Avoid division by zero by replacing zeros with a small constant
            fwu_individual = np.where(fwu_individual == 0, 1e-10, fwu_individual)
            fwu_community = np.where(fwu_community == 0, 1e-10, fwu_community)
            
            # Calculate LSM for the current category
            lsm_scores = 1 - (np.abs(fwu_individual - fwu_community) / (fwu_individual + fwu_community))
            individual_lsm_scores[category] = np.mean(lsm_scores)  # Average LSM score for the category
        
        # Calculate the overall LSM score as the mean of all category scores
        overall_lsm_score = np.mean(list(individual_lsm_scores.values()))
        individual_lsm_scores["overall"] = overall_lsm_score
        
        lsm_scores_all_users.append({"user_id": i, **individual_lsm_scores})
    
    return pd.DataFrame(lsm_scores_all_users)

# Sample usage
texts = [
    "I enjoy working with people, and I believe we can do great things together.",
    "We believe that working with others can help us achieve great things together.",
    "Working with a team is something I find beneficial for reaching our goals.",
]

lsm_scores_df = calculate_lsm_for_all_users(texts)
print(lsm_scores_df)


   user_id  pronouns  articles  conjunctions  prepositions   overall
0        0  0.905138       1.0      0.944444      0.886111  0.933923
1        1  0.925331       1.0      0.888889      0.887550  0.925442
2        2  0.891304       1.0      0.888889      0.943106  0.930825


In [43]:
# Import Modules
import pandas as pd
import numpy as np
import sqlite3
import pickle
import os
import matplotlib.pyplot as plt
import statsmodels.formula.api as sm
# Import Dataset
conn = sqlite3.connect('/data1/StackOverflow/stackexchange-to-sqlite/stack.db')
query = '''
SELECT creation_date, owner_user_id, body
FROM answers
WHERE creation_date >= '2021-09-01';
'''
df_tags = pd.read_sql_query(query, conn)
conn.close()

In [44]:
df_tags

Unnamed: 0,creation_date,owner_user_id,body
0,2021-09-01 00:00:18.070,7332046.0,"<p>The short approach, remove python3.9 from y..."
1,2021-09-01 00:00:28.823,14471093.0,"<p>open the terminal, run as administrator.</p..."
2,2021-09-01 00:00:35.537,523612.0,"<p>Conceptually, <code>Base</code> has no reas..."
3,2021-09-01 00:01:09.140,11269158.0,<p><code>calldata</code> is a special data loc...
4,2021-09-01 00:01:17.920,2096113.0,<p>You could do something like this if the str...
...,...,...,...
3129999,2023-09-03 09:34:35.727,22492027.0,<p>it is highly likely that this issue has bee...
3130000,2023-09-03 09:34:38.237,16616583.0,<p>To add an image to every page in a PDF docu...
3130001,2023-09-03 09:34:43.133,19978835.0,<p>Adding an audio to a video using ffmpeg has...
3130002,2023-09-03 09:34:49.570,3518621.0,"<p>In Go, when you send a value on a channel (..."


In [45]:
# erase NAs
df_tags = df_tags.dropna(subset=['owner_user_id'])
# Convert types
df_tags['owner_user_id'] = df_tags['owner_user_id'].astype(int).astype(str)
df_tags['creation_date'] = pd.to_datetime(df_tags['creation_date'])
# add year_month variable
df_tags['year_month'] = df_tags['creation_date'].dt.to_period('D')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_tags['owner_user_id'] = df_tags['owner_user_id'].astype(int).astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_tags['creation_date'] = pd.to_datetime(df_tags['creation_date'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_tags['year_month'] = df_tags['creation_date'].dt.to_period

In [46]:
df_tags

Unnamed: 0,creation_date,owner_user_id,body,year_month
0,2021-09-01 00:00:18.070,7332046,"<p>The short approach, remove python3.9 from y...",2021-09-01
1,2021-09-01 00:00:28.823,14471093,"<p>open the terminal, run as administrator.</p...",2021-09-01
2,2021-09-01 00:00:35.537,523612,"<p>Conceptually, <code>Base</code> has no reas...",2021-09-01
3,2021-09-01 00:01:09.140,11269158,<p><code>calldata</code> is a special data loc...,2021-09-01
4,2021-09-01 00:01:17.920,2096113,<p>You could do something like this if the str...,2021-09-01
...,...,...,...,...
3129999,2023-09-03 09:34:35.727,22492027,<p>it is highly likely that this issue has bee...,2023-09-03
3130000,2023-09-03 09:34:38.237,16616583,<p>To add an image to every page in a PDF docu...,2023-09-03
3130001,2023-09-03 09:34:43.133,19978835,<p>Adding an audio to a video using ffmpeg has...,2023-09-03
3130002,2023-09-03 09:34:49.570,3518621,"<p>In Go, when you send a value on a channel (...",2023-09-03


In [77]:
df_filtered = df_tags[df_tags['year_month'] == '2023-09-02']
df_filtered

Unnamed: 0,creation_date,owner_user_id,body,year_month
3127738,2023-09-02 00:04:20.820,9973992,<p>you must implement this fuction apiCallToGe...,2023-09-02
3127739,2023-09-02 00:05:42.803,5460719,<p>You can use <code>ast.literal_eval</code> t...,2023-09-02
3127740,2023-09-02 00:06:04.863,12121118,<p>This works without imports:</p>\n<pre><code...,2023-09-02
3127741,2023-09-02 00:09:08.750,2700303,"<p>The solutions is present in this <a href=""h...",2023-09-02
3127742,2023-09-02 00:09:11.563,945871,"<p>@Rick,</p>\n<p>Please post how you added th...",2023-09-02
...,...,...,...,...
3129404,2023-09-02 23:53:30.633,1924653,<p>Your <code>render</code> function call in y...,2023-09-02
3129405,2023-09-02 23:57:38.950,12678230,"<p>For me, what happened is that a package was...",2023-09-02
3129406,2023-09-02 23:57:59.430,22486404,<p>You can set Firebase database rules to not ...,2023-09-02
3129407,2023-09-02 23:59:19.437,14810367,<p>So to achieve the exact same behaviour whic...,2023-09-02


In [78]:
df_grouped = df_filtered.groupby(['owner_user_id','year_month'])['body'].agg(lambda x: '\n'.join(x)).reset_index()
df_grouped

Unnamed: 0,owner_user_id,year_month,body
0,10004574,2023-09-02,<p>If you are using next and you have to pass ...
1,10017792,2023-09-02,<p>I had to change my <code>.env.local</code> ...
2,1003538,2023-09-02,"<p>If you want a general solution, you could c..."
3,10035985,2023-09-02,"<p>You can do it without regex, using operator..."
4,10059628,2023-09-02,<p>Try this:</p>\n<pre><code>\nserver {\n l...
...,...,...,...
1323,992484,2023-09-02,"<p>First, you want to decouple your code. The..."
1324,9957807,2023-09-02,<p>Do <code>gradle build</code> or <code>./gra...
1325,995853,2023-09-02,<p>You can use a leaflet plugin for context me...
1326,9973992,2023-09-02,<p>you must implement this fuction apiCallToGe...


In [79]:
df_grouped['body']

0       <p>If you are using next and you have to pass ...
1       <p>I had to change my <code>.env.local</code> ...
2       <p>If you want a general solution, you could c...
3       <p>You can do it without regex, using operator...
4       <p>Try this:</p>\n<pre><code>\nserver {\n    l...
                              ...                        
1323    <p>First, you want to decouple your code.  The...
1324    <p>Do <code>gradle build</code> or <code>./gra...
1325    <p>You can use a leaflet plugin for context me...
1326    <p>you must implement this fuction apiCallToGe...
1327    <p>To open bottom sheet dialog use following c...
Name: body, Length: 1328, dtype: object

In [80]:
# Sample usage
lsm_scores_df = calculate_lsm_for_all_users(df_grouped['body'])
print(lsm_scores_df)

      user_id  pronouns  articles  conjunctions  prepositions   overall
0           0  0.238672  0.541684  9.010265e-02  1.429857e-01  0.253361
1           1  0.225651  0.666337  2.659299e-02  2.086747e-01  0.281814
2           2  0.238053  0.333333  1.336048e-07  1.179940e-01  0.172345
3           3  0.295588  0.567924  7.550730e-02  1.538234e-01  0.273211
4           4  0.217408  0.333333  1.336048e-07  8.901972e-08  0.137685
...       ...       ...       ...           ...           ...       ...
1323     1323  0.362563  0.955729  6.210942e-01  3.485503e-01  0.571984
1324     1324  0.217408  0.333333  6.110060e-02  1.211428e-01  0.183246
1325     1325  0.217408  0.333333  1.336048e-07  2.304455e-02  0.143447
1326     1326  0.217408  0.333333  1.336048e-07  8.901972e-08  0.137685
1327     1327  0.253842  0.333333  1.336048e-07  9.117924e-02  0.169589

[1328 rows x 6 columns]


In [81]:
lsm_scores_df['overall'].mean()

0.3059650890573768

In [73]:
df_grouped['lsm_score'] = lsm_scores_df['overall']

In [74]:
df_grouped

Unnamed: 0,owner_user_id,year_month,body,lsm_score
0,10000229,2021-09-01,<p>One thing that I've noticed in your code is...,0.394769
1,10002945,2021-09-01,<p>I am sure the above answer is very good and...,0.360823
2,10008173,2021-09-01,<p>Since you already know the location on the ...,0.608867
3,10008643,2021-09-01,<p>Replace &quot;justify-content-center&quot; ...,0.117142
4,10009777,2021-09-01,<p>(might need to adjust syntax according to y...,0.233345
...,...,...,...,...
4247,998832,2021-09-01,<p>I liked the modal interface and I was tryin...,0.204038
4248,9991242,2021-09-01,<p>I had a similar and as I am new to Cloud Fo...,0.428743
4249,999223,2021-09-01,"<p>This was incredibly hard to debug, pure luc...",0.271997
4250,9994727,2021-09-01,<p>If you are looking to validate if lines are...,0.250005


In [75]:
df_grouped['lsm_score'].mean()

0.2810059407401412