Investigating correlation between the features.


In [7]:
import json
import pandas as pd

# Collect every record that decodes successfully; undecodable lines are reported and skipped.
data = []

# Read the export as UTF-8 explicitly (the other loaders in this notebook do the same),
# so the result does not depend on the platform's default text encoding.
with open('../2_months_data/filtered_posts.jsonl', 'r', encoding='utf-8') as file:
    for line in file:
        # A blank line carries no record; skip it rather than reporting a decode error.
        if not line.strip():
            continue
        try:
            json_data = json.loads(line)
            data.append(json_data)
        except json.JSONDecodeError:
            # Keep going so that one malformed line does not abort the whole load.
            print("Error decoding JSON on line:", line)

# One row per decoded JSON object.
df_posts = pd.DataFrame(data)




In [11]:
# Coerce the award counts to a numeric dtype; values that cannot be parsed become NaN.
df_posts['total_awards_received'] = pd.to_numeric(df_posts['total_awards_received'], errors='coerce')

# Keep the dtypes as the last expression of the cell: a bare expression is only
# displayed by the notebook when it is the final statement.
df_posts.dtypes

In [None]:
# Columns whose pairwise relationship is being examined
features = ['num_comments', 'score', 'num_crossposts', 'upvote_ratio']

# Pairwise Pearson correlation between the selected columns
correlation_matrix = df_posts.loc[:, features].corr(method='pearson')

# Show the matrix as plain text
print(correlation_matrix)

                num_comments     score  num_crossposts  upvote_ratio
num_comments        1.000000  0.703272        0.529757      0.004699
score               0.703272  1.000000        0.644115      0.033805
num_crossposts      0.529757  0.644115        1.000000      0.022793
upvote_ratio        0.004699  0.033805        0.022793      1.000000


num_comments and score: These two features are highly correlated (0.70), meaning they capture largely the same information. Including both could introduce redundancy into the custom engagement score, so score is dropped. Finalized metric:

df_posts['engagement_score'] = (df_posts['num_comments'] * 0.5) + (df_posts['num_crossposts'] * 0.3) + (df_posts['upvote_ratio'] * 0.2)

In [None]:
import json
import pandas as pd

def calculate_engagement(file_path):
    """Print the highest-engagement post for each calendar day in a JSONL file.

    Every non-blank line of ``file_path`` is parsed as one JSON object. The
    engagement score is ``0.5 * num_comments + 0.3 * num_crossposts +
    0.2 * upvote_ratio``. ``created_utc`` is read as seconds since the epoch
    and reduced to a date, which is the grouping key.

    Note: a later cell in this notebook defines a two-argument function with
    the same name; once that cell has run, it replaces this definition.

    Args:
        file_path: Path to a UTF-8 JSONL file whose records provide
            ``num_comments``, ``num_crossposts``, ``upvote_ratio``,
            ``created_utc``, ``title`` and ``url``.

    Returns:
        None. One summary line per day is printed; nothing is printed when
        the file contains no records.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Load the JSONL file, ignoring blank lines (they hold no record).
    data = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            if line.strip():
                data.append(json.loads(line))

    # An empty file has no columns to score, so there is nothing to report.
    if not data:
        return

    df = pd.DataFrame(data)

    # Calculate the engagement score
    df['engagement_score'] = (df['num_comments'] * 0.5) + (df['num_crossposts'] * 0.3) + (df['upvote_ratio'] * 0.2)

    # Convert 'created_utc' to date
    df['date'] = pd.to_datetime(df['created_utc'], unit='s').dt.date

    # Rows without a score cannot be ranked. Dropping them first keeps idxmax
    # from failing on a day where every score is missing.
    scored = df.dropna(subset=['engagement_score'])

    # Find the top post for each day based on the engagement score
    top_posts = scored.loc[scored.groupby('date')['engagement_score'].idxmax()]

    # Print the results
    for _, row in top_posts.iterrows():
        print(f"Date: {row['date']}, Title: {row['title']}, Engagement Score: {row['engagement_score']}, URL: {row['url']}")

In [4]:
import json
import pandas as pd

def calculate_engagement(input_file_path, output_file_path):
    """Write the three highest-engagement posts of each day to a JSON file.

    Every non-blank line of ``input_file_path`` is parsed as one JSON object.
    The engagement score is ``0.5 * num_comments + 0.3 * num_crossposts +
    0.2 * upvote_ratio``. ``created_utc`` is read as seconds since the epoch
    and reduced to a date, which is the grouping key. The selected posts are
    written as a JSON array ordered by date and, within a date, by descending
    score; the date and title of each selected post are also printed.

    This definition replaces the earlier single-argument function of the same
    name defined above in the notebook.

    Args:
        input_file_path: Path to a UTF-8 JSONL file whose records provide
            ``num_comments``, ``num_crossposts``, ``upvote_ratio`` and
            ``created_utc``. The fields ``url``, ``selftext``, ``title``,
            ``id``, ``author`` and ``author_fullname`` must appear in at
            least one record.
        output_file_path: Destination path of the JSON array. An input with
            no records produces an empty array.

    Returns:
        None.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    def _is_missing(value):
        # NaN is the only float that differs from itself; pd.NA is matched by identity.
        return value is pd.NA or (isinstance(value, float) and value != value)

    # Load the JSONL file, ignoring blank lines (they hold no record).
    data = []
    with open(input_file_path, 'r', encoding='utf-8') as file:
        for line in file:
            if line.strip():
                data.append(json.loads(line))

    if data:
        df = pd.DataFrame(data)

        # Calculate the engagement score
        df['engagement_score'] = (df['num_comments'] * 0.5) + (df['num_crossposts'] * 0.3) + (df['upvote_ratio'] * 0.2)

        # Convert 'created_utc' to actual date
        df['date'] = pd.to_datetime(df['created_utc'], unit='s').dt.date

        # Find the top 3 posts for each day based on the engagement score.
        # The original row labels are the last level of the resulting index.
        ranked = df.groupby('date')['engagement_score'].nlargest(3)
        top_posts = df.loc[ranked.index.get_level_values(-1)]

        # Select the required columns and render the date as text so it is JSON
        # serializable. assign() builds a new frame instead of writing into a slice.
        top_posts = top_posts[
            ['url', 'selftext', 'title', 'date', 'id', 'author', 'author_fullname', 'engagement_score']
        ].assign(date=lambda frame: frame['date'].astype(str))

        # Fields absent from a record surface as NaN, which json.dump would emit as
        # the bare token NaN (not valid JSON). Store them as null instead.
        top_posts_list = [
            {key: (None if _is_missing(value) else value) for key, value in record.items()}
            for record in top_posts.to_dict(orient='records')
        ]
    else:
        # No records: still produce a well-formed (empty) result file.
        top_posts_list = []

    # Save the results to a JSON file
    with open(output_file_path, 'w', encoding='utf-8') as out_file:
        json.dump(top_posts_list, out_file, ensure_ascii=False, indent=4)

    # Optionally, print the results
    for post in top_posts_list:
        print(post['date'], post['title'])

# Example usage:
# calculate_engagement('../2_months_data/r_fednews_posts.jsonl', 'top_posts.json')


In [5]:

# Source JSONL export and destination file for the selected posts
input_file_path = "../2_months_data/filtered_posts.jsonl"
output_file_path = "top_posts.json"

# Score the posts and save the per-day selection
calculate_engagement(input_file_path=input_file_path, output_file_path=output_file_path)

2025-01-01 Scott Kupor (VC Partner) nominated to Head OPM
2025-01-01 What common anecdotes/situations do you hear about federal employees who you feel stayed working too long? 
2025-01-01 Employment in Europe worth it?
2025-01-02 How do folks get so much use or Lose Annual Leave?
2025-01-02 When to take last 15 min break
2025-01-02 NTEU - It’s Official: Members, DHS Approve New Contract
2025-01-03 BCBS FEP basic plan greed w wegovy… 
2025-01-03 The U.S. Government Agencies with the Highest Paid Employees
2025-01-03 Coworker went off on the boss
2025-01-04 Should I stay civilian or go back to contractor?
2025-01-04 Paycheck smaller than normal 
2025-01-04 How much do you put towards liquid savings each paycheck and what’s your set emergency fund?
2025-01-05 What’s happened to AFGE?They don’t seem to care anymore. What’s the benefit at this point?
2025-01-05 Can we have a NCR Snow Watch Megathread?
2025-01-05 Stop cowing to toxic management
2025-01-06 What’s your grade level, and what’s 

In [6]:
import json

def convert_json_to_jsonl(input_json_file, output_jsonl_file):
    """Rewrite the items of a JSON document as JSON Lines.

    The input file is parsed in full, then each item obtained by iterating
    over the parsed value (expected to be a list of dictionaries) is
    serialized on its own line of the output file. Both files are handled as
    UTF-8 and non-ASCII characters are written unescaped.

    Args:
        input_json_file: Path of the JSON file to read.
        output_jsonl_file: Path of the JSONL file to create or overwrite.
    """
    # Parse the whole document up front
    with open(input_json_file, 'r', encoding='utf-8') as source:
        records = json.load(source)

    # Emit one serialized item per line, each terminated by a newline
    with open(output_jsonl_file, 'w', encoding='utf-8') as sink:
        sink.writelines(
            json.dumps(record, ensure_ascii=False) + "\n" for record in records
        )

# Convert the saved top posts from a JSON array to JSON Lines
convert_json_to_jsonl(input_json_file='top_posts.json', output_jsonl_file='top_posts.jsonl')
