### Goal:
1. Update the `users` table with the number of distinct hackathon projects (`project_count`) and distinct repositories (`unique_repo_count`) that each user contributed to.
2. Process outliers to exclude potential bot users.

In [None]:
# Update table
# Reference PostgreSQL statements that add and populate the per-user counters.
# They are kept as named strings; nothing in this cell executes them.

# Adds the two counter columns to `users`.
ADD_COUNT_COLUMNS_SQL = """
ALTER TABLE users
ADD COLUMN project_count INT,
ADD COLUMN unique_repo_count INT;
"""

# Number of distinct hackathon projects per user.
# The LEFT JOIN starts from every row in `users`, so a user with no rows in
# `user_projects` still gets updated and COALESCE writes 0 instead of leaving
# the column NULL (a plain `FROM (...) p WHERE u.user_id = p.user_id` would
# skip those users entirely).
UPDATE_PROJECT_COUNT_SQL = """
UPDATE users u SET project_count = COALESCE(p.cnt, 0)
FROM users all_users
LEFT JOIN (
    SELECT user_id, COUNT(DISTINCT project_id) AS cnt
    FROM user_projects
    GROUP BY user_id
) p ON p.user_id = all_users.user_id
WHERE u.user_id = all_users.user_id;
"""

# Number of distinct repositories per user, same LEFT JOIN pattern as above.
# jsonb_array_elements_text(repos) expands each JSON array such as
# ["a/b", "c/d"] into one row per element, so COUNT(DISTINCT repo) counts
# each repository once per user across all of that user's time windows.
UPDATE_UNIQUE_REPO_COUNT_SQL = """
UPDATE users u SET unique_repo_count = COALESCE(r.repo_cnt, 0)
FROM users all_users
LEFT JOIN (
    SELECT user_id, COUNT(DISTINCT repo) AS repo_cnt
    FROM (
        SELECT user_id, jsonb_array_elements_text(repos) AS repo
        FROM time_windows
    ) x
    GROUP BY user_id
) r ON r.user_id = all_users.user_id
WHERE u.user_id = all_users.user_id;
"""

##### Process outliers to exclude potential bot users.

Method 1: delete potential bots from all tables -- complicated and cannot be rolled back -- NO!

Method 2: add a field to the users table that marks them as bots, and skip these users in future work.

sql: ALTER TABLE users ADD COLUMN is_bot BOOLEAN DEFAULT FALSE;

In [1]:
import os
from pathlib import Path

import pandas as pd
from dotenv import load_dotenv
from sqlalchemy import create_engine, text
from sqlalchemy.engine import URL

# Load database settings from the .env file in the current working directory.
load_dotenv(dotenv_path=Path.cwd() / ".env")

user = os.getenv("DB_USER")
password = os.getenv("DB_PASSWORD")
host = os.getenv("DB_HOST", "localhost")
port = os.getenv("DB_PORT", "5432")
dbname = os.getenv("DB_NAME")

# Build the URL from its components instead of an f-string: URL.create escapes
# characters such as "@", ":" or "/" in the credentials, and leaves out any
# component that is None rather than embedding the literal text "None".
engine = create_engine(
    URL.create(
        drivername="postgresql",
        username=user,
        password=password,
        host=host,
        port=int(port),  # environment values are strings; URL.create expects an int
        database=dbname,
    )
)

In [2]:
# Summary statistics for the number of distinct repositories per user.
df = pd.read_sql("SELECT unique_repo_count FROM users", engine)
repo_counts = df["unique_repo_count"]

mean_val = repo_counts.mean()
median_val = repo_counts.median()
q1 = repo_counts.quantile(0.25)
q3 = repo_counts.quantile(0.75)
min_val = repo_counts.min()
max_val = repo_counts.max()
mode_val = repo_counts.mode()  # a Series: there can be more than one mode

print("Numbers of repositories that a user contributed to:")
print(f"Mean: {mean_val}")
print(f"Q1: {q1}")
print(f"Median: {median_val}")
print(f"Q3: {q3}")
print(f"Min: {min_val}")
print(f"Max: {max_val}")
# tolist() yields plain Python numbers, so the list prints as [0] rather than
# [np.int64(0)].
print(f"Mode: {mode_val.tolist()}")


Numbers of repositires that a user contributed to:
Mean: 14.848604293860735
Q1: 2.0
Median: 8.0
Q3: 19.0
Min: 0
Max: 230
Mode: [np.int64(0)]


In [None]:
# Re-runs the same unique_repo_count query as the summary cell above; the
# next cell reassigns `df`, so this result is not used in between.
df = pd.read_sql(sql="SELECT unique_repo_count FROM users", con=engine)

In [3]:
# Summary statistics for the number of distinct hackathon projects per user.
df = pd.read_sql("SELECT project_count FROM users", engine)
project_counts = df["project_count"]

mean_val = project_counts.mean()
median_val = project_counts.median()
q1 = project_counts.quantile(0.25)
q3 = project_counts.quantile(0.75)
min_val = project_counts.min()
max_val = project_counts.max()
mode_val = project_counts.mode()  # a Series: there can be more than one mode

print("Numbers of hackathon projects that a user contributed to:")
print(f"Mean: {mean_val}")
print(f"Q1: {q1}")
print(f"Median: {median_val}")
print(f"Q3: {q3}")
print(f"Min: {min_val}")
print(f"Max: {max_val}")
# tolist() yields plain Python numbers, so the list prints as [1] rather than
# [np.int64(1)].
print(f"Mode: {mode_val.tolist()}")


Numbers of hackathon projects that a user contributed to:
Mean: 1.6778568674726526
Q1: 1.0
Median: 1.0
Q3: 2.0
Min: 0
Max: 87
Mode: [np.int64(1)]
