### Goal:

Create and populate tables of users, hackathon projects, and their mapping.

In [1]:
import pandas as pd

# Load the two raw CSV inputs: the frame with user names/hashes and the
# frame with one row per hackathon project.
users_df, projects_df = (
    pd.read_csv("../data/humans_hash_complete.csv"),
    pd.read_csv("../data/proj_hack_human_complete.csv"),
)


### Create User Table

In [2]:
# Build the users table: keep only the name/hash columns of rows whose name
# does not contain "bot" (case-insensitive; rows with a missing name are
# kept because na=False), and expose the name under the column user_id.
users_cleaned = users_df.loc[
    lambda frame: ~frame["name"].str.contains("bot", case=False, na=False),
    ["name", "hash"],
].rename(columns={"name": "user_id"})



In [3]:
# Display the cleaned users frame to inspect the result of the step above.
users_cleaned

Unnamed: 0,user_id,hash
0,kylaf,4486c81cdf7622ce8750b9bb95f888b2d27f4d9a615e2e...
1,sophiapeckner,9e8fd2f4fff41ab11b6dd9d1f5e9a024ff90c10b87f2ca...
2,LaExploradora,00a2afb4851347f79e3e7f4b26977208e219ab45b61260...
3,ashlaycyriac,2e804c25f0c261700e98f9beecfd861ae6fc50214655a6...
4,ncitron,04d4b160c02b1fa01349a43e33f7833ab3769acc560abb...
...,...,...
173778,mastrolinux,1ac69644ce7bf739e9a1f4b3d81e5a723bce6f77d2272d...
173779,bodokaiser,749d37b9dbd8e3761cc6fddb36b85c249547ca1351f0d9...
173780,Omig12,bd512f90412930f470774695128e5c090d79d9411f1646...
173781,craig-ludington,918452be521c634f2fef412a5595e64d755bb1c99358ef...


### Create project table

In [4]:
from ast import literal_eval

# Build the projects table in one chain:
#   1. keep the four relevant columns and drop rows missing any of them,
#   2. rename them to the database column names,
#   3. parse repo_links into Python lists and both dates into datetimes.
projects_cleaned = (
    projects_df[["project_URL", "github_links", "start_date_format", "end_date_format"]]
    .dropna()
    .rename(
        columns={
            "project_URL": "project_url",
            "github_links": "repo_links",
            "start_date_format": "start_date",
            "end_date_format": "end_date",
        }
    )
    .assign(
        # A value that looks like a list literal ("[...]") is parsed with
        # literal_eval; anything else is wrapped in a one-element list.
        repo_links=lambda frame: frame["repo_links"].apply(
            lambda raw: [raw]
            if not (isinstance(raw, str) and raw.startswith("["))
            else literal_eval(raw)
        ),
        start_date=lambda frame: pd.to_datetime(frame["start_date"]),
        end_date=lambda frame: pd.to_datetime(frame["end_date"]),
    )
)

# Count repeated project URLs (kept for inspection), then keep the first
# occurrence of each URL.
duplicates = projects_cleaned["project_url"].duplicated().sum()
projects_cleaned = projects_cleaned.drop_duplicates(subset=["project_url"])


In [5]:
# Display the cleaned projects frame to inspect the result of the step above.
projects_cleaned

Unnamed: 0,project_url,repo_links,start_date,end_date
0,https://devpost.com/software/faefolk,"[https://github.com/ICCards/faefolk, https://g...",2022-05-10,2022-06-22
1,https://devpost.com/software/tingram,[https://github.com/tingramtingram/dfinity],2022-05-10,2022-06-22
2,https://devpost.com/software/ant-kingdom,[https://github.com/NFPTU/dfinity-fu],2022-05-10,2022-06-22
3,https://devpost.com/software/meta-yield-liquid...,[https://github.com/Narwallets/meta-yield-ic],2022-05-10,2022-06-22
4,https://devpost.com/software/4everland,[https://github.com/4everland/dashboard-websit...,2022-05-10,2022-06-22
...,...,...,...,...
36308,https://devpost.com/software/ruby-programming-...,[https://github.com/shrejal99/Ruby-Programmin-...,2021-10-13,2021-10-14
36309,https://devpost.com/software/social-distancing...,[https://github.com/DevrajDC/Social-Distance-a...,2021-10-08,2021-10-10
36310,https://devpost.com/software/suit-yourself,"[https://github.com/LilyPerr/hackscios, https:...",2019-04-13,2019-04-14
36312,https://devpost.com/software/track-your-learni...,[https://github.com/ektaarora16/Track-your-Lea...,2021-10-12,2021-10-13


### Create user-project table

In [6]:
def extract_contributors(row):
    """Split a comma-separated contributor string into a list of names.

    Args:
        row: The raw cell value. Only ``str`` values are split.

    Returns:
        A list with one whitespace-stripped entry per comma-separated piece
        (empty pieces are returned as ``""``), or an empty list when ``row``
        is not a string (e.g. ``None`` or a float NaN).
    """
    if not isinstance(row, str):
        return []
    return list(map(str.strip, row.split(",")))

# One (user, project URL) record per non-empty contributor name listed on
# each project row.
user_projects_records = [
    (username, url)
    for url, raw_contributors in zip(
        projects_df["project_URL"], projects_df["contributor_github_username"]
    )
    for username in extract_contributors(raw_contributors)
    if username
]

user_projects_df = pd.DataFrame(user_projects_records, columns=["user_id", "project_url"])


In [7]:
# Display the user/project mapping frame to inspect the result of the step above.
user_projects_df

Unnamed: 0,user_id,project_url
0,ALLiDoizCode,https://devpost.com/software/faefolk
1,maxwisch,https://devpost.com/software/faefolk
2,stoma655,https://devpost.com/software/tingram
3,tingramtingram,https://devpost.com/software/tingram
4,DatTNT,https://devpost.com/software/ant-kingdom
...,...,...
308367,pappas999,https://devpost.com/software/tsunami-fler8g
308368,PatrickAlphaC,https://devpost.com/software/tsunami-fler8g
308369,cryptohighway,https://devpost.com/software/tsunami-fler8g
308370,dwightjl,https://devpost.com/software/tsunami-fler8g


In [8]:
import io

def df_to_pg_copy_buffer(df: pd.DataFrame, sep="\t") -> io.StringIO:
    """Serialise a DataFrame into an in-memory buffer for ``cursor.copy_from``.

    Rows are written without index or header, separated by ``sep``, with
    missing values rendered as ``\\N`` and no quoting applied.

    Args:
        df: Frame to serialise; every column is written, in order.
        sep: Field delimiter (tab by default).

    Returns:
        An ``io.StringIO`` rewound to position 0, ready to be read.

    Note:
        No escaping is performed on the values.
        NOTE(review): values containing the delimiter, a newline or a
        backslash are presumably not safe to load this way — confirm the
        inputs never contain them.
    """
    import csv  # local import: only needed for the named quoting constant

    buffer = io.StringIO()
    df.to_csv(
        buffer,
        index=False,
        header=False,
        sep=sep,
        na_rep="\\N",
        quoting=csv.QUOTE_NONE,  # named constant instead of the magic number 3
    )
    buffer.seek(0)
    return buffer


Create these tables in the database (only needed the first time)

In [9]:
import psycopg2
from dotenv import load_dotenv
import os
from sqlalchemy import create_engine, text
from pathlib import Path
import pandas as pd


# Read the database settings from the .env file in the current working
# directory; host and port fall back to a local default when unset.
load_dotenv(dotenv_path=Path.cwd().joinpath(".env"))

user, password = os.getenv("DB_USER"), os.getenv("DB_PASSWORD")
host, port = os.getenv("DB_HOST", "localhost"), os.getenv("DB_PORT", "5432")
dbname = os.getenv("DB_NAME")

# Open the connection and a cursor used by the statements below.
conn = psycopg2.connect(dbname=dbname, user=user, password=password, host=host, port=port)
cursor = conn.cursor()

# DDL for the three tables. IF NOT EXISTS makes each statement a no-op when
# the table is already present, so this cell can be re-run safely.

# users: one row per user, keyed by user_id.
create_users_sql = """
CREATE TABLE IF NOT EXISTS users (
    user_id TEXT PRIMARY KEY,
    hash TEXT
);
"""

# projects: surrogate integer key plus a unique project URL.
create_projects_sql = """
CREATE TABLE IF NOT EXISTS projects (
    project_id SERIAL PRIMARY KEY,
    project_url TEXT UNIQUE,
    repo_links TEXT[],
    start_date TIMESTAMPTZ,
    end_date TIMESTAMPTZ
);
"""

# user_projects: mapping rows referencing both tables above.
create_user_projects_sql = """
CREATE TABLE IF NOT EXISTS user_projects (
    user_project_id SERIAL PRIMARY KEY,
    user_id TEXT REFERENCES users(user_id),
    project_id INT REFERENCES projects(project_id)
);
"""

try:
    # Referenced tables are created before the table that references them.
    for ddl_statement in (create_users_sql, create_projects_sql, create_user_projects_sql):
        cursor.execute(ddl_statement)
    conn.commit()
except Exception:
    # Do not leave an aborted transaction open on the connection.
    conn.rollback()
    raise
finally:
    # Release the cursor and connection; the loading cell opens its own.
    cursor.close()
    conn.close()


In [10]:
import psycopg2
from dotenv import load_dotenv
import os
from sqlalchemy import create_engine, text
from pathlib import Path
import pandas as pd


# Load the .env file next to the current working directory, then collect the
# connection settings from the environment (host/port have local defaults).
env_file = Path.cwd() / ".env"
load_dotenv(dotenv_path=env_file)

user = os.environ.get("DB_USER")
password = os.environ.get("DB_PASSWORD")
host = os.environ.get("DB_HOST", "localhost")
port = os.environ.get("DB_PORT", "5432")
dbname = os.environ.get("DB_NAME")

# Fresh connection and cursor for the load below.
conn = psycopg2.connect(
    host=host,
    port=port,
    dbname=dbname,
    user=user,
    password=password,
)
cursor = conn.cursor()

# Reload the three tables from the frames built earlier in the notebook.
# The connection is always closed, and any uncommitted work is rolled back
# if a step fails.
try:
    # Truncate to avoid duplication after copy
    cursor.execute("TRUNCATE TABLE user_projects CASCADE;")
    cursor.execute("TRUNCATE TABLE projects CASCADE;")
    cursor.execute("TRUNCATE TABLE users CASCADE;")
    conn.commit()

    # --- users ---
    user_buf = df_to_pg_copy_buffer(users_cleaned[["user_id", "hash"]])
    cursor.copy_from(user_buf, 'users', columns=('user_id', 'hash'), sep='\t')

    # --- projects ---
    # Render each Python list as a "{a,b,c}" array literal on a separate
    # frame. Overwriting projects_cleaned in place would make a second run
    # of this cell see strings instead of lists and write "{}" for every row.
    # NOTE(review): links are joined without quoting — presumably none
    # contains a comma, brace or quote; confirm.
    projects_for_copy = projects_cleaned.assign(
        repo_links=projects_cleaned["repo_links"].apply(
            lambda links: "{" + ",".join(links) + "}" if isinstance(links, list) else "{}"
        )
    )
    project_buf = df_to_pg_copy_buffer(
        projects_for_copy[["project_url", "repo_links", "start_date", "end_date"]]
    )
    cursor.copy_from(
        project_buf,
        'projects',
        columns=('project_url', 'repo_links', 'start_date', 'end_date'),
        sep='\t',
    )
    conn.commit()

    # get mapping of project id and url
    cursor.execute("SELECT project_id, project_url FROM projects;")
    rows = cursor.fetchall()
    project_map = {url: pid for pid, url in rows}

    # --- user_projects ---
    # Map URLs to the generated ids, drop rows whose URL has no project row,
    # and cast the id to int. Done as one chain on a new frame so no column
    # is assigned on a filtered slice (avoids SettingWithCopyWarning).
    user_projects_df = (
        user_projects_df
        .assign(project_id=lambda frame: frame["project_url"].map(project_map))
        .dropna(subset=["project_id"])
        .astype({"project_id": int})
    )

    # NOTE(review): user_projects.user_id references users(user_id); a
    # contributor missing from users_cleaned (e.g. a name filtered out as a
    # bot) would presumably make this COPY fail — confirm against the data.
    user_project_buf = df_to_pg_copy_buffer(user_projects_df[['user_id', 'project_id']])
    cursor.copy_from(user_project_buf, 'user_projects', columns=('user_id', 'project_id'), sep='\t')
    conn.commit()
except Exception:
    conn.rollback()
    raise
finally:
    cursor.close()
    conn.close()


In [13]:
import pandas as pd

# Load the hackathon-level CSV and display it.
# NOTE(review): this is an absolute, machine-specific path, unlike the
# relative "../data/..." paths used for the other inputs — consider moving it.
df = pd.read_csv(filepath_or_buffer="E:/hackathon/hackathon/data/hackathons.csv")
df

Unnamed: 0.1,Unnamed: 0,URL,Criteria,schedule,hack_type,info,start_date_format,end_date_format,Prizes,prize_money,...,end_date,year,themes,prize,registered_N,featured,organization_name,winners_announced,submission_gallery_url,start_a_submission_url
0,0,https://wirvsvirushackathon.devpost.com/,"Gesellschaftlicher Mehrwert, Innovationsgrad, ...",no schedule,Public,"Mar 29, 2020, Online, Public, Social Good ...",2020-03-20,2020-03-22,"Euer Preis ist Anerkennung und Ehre!, Habt Spa...",no money prize,...,22,2020,"Social Good, COVID-19",$0,12505,False,,False,https://wirvsvirushackathon.devpost.com/projec...,https://wirvsvirushackathon.devpost.com/challe...
1,4,https://theglobalhack.devpost.com/,"Potential impact of the project, Technical exe...",no schedule,Public,"Apr 12, 2020, Online, Public, COVID-19",2020-04-03,2020-04-09,Overall Winner,no money prize,...,09,2020,COVID-19,€0,6059,False,,False,https://theglobalhack.devpost.com/project-gallery,https://theglobalhack.devpost.com/challenges/s...
2,8,https://supernova.devpost.com/,"X Factor and Potential, Technical Competence, ...",no schedule,Public,"Jul 11, 2022, Online, Public, DFINITY Foundati...",2022-05-10,2022-06-22,"SocialFi - 1ST PLACE, SocialFi - 2ND PLACE, So...","$765,000",...,Jun 22,2022,"Social Good, Blockchain, Open Ended",$0,3633,False,DFINITY Foundation,False,https://supernova.devpost.com/project-gallery,https://supernova.devpost.com/challenges/start...
3,12,https://hajjhackathon.devpost.com/,"Design | التصميم, Simpli...",no schedule,Public,"Aug 3, 2018, Jeddah International Exhibition &...",2018-08-01,2018-08-03,1st place: 1 million Saudi Riyal (15% equity i...,"SAR2,000,000",...,03,2018,,SAR0,2935,False,,False,https://hajjhackathon.devpost.com/project-gallery,https://hajjhackathon.devpost.com/challenges/s...
4,16,https://chainlinkspring2022.devpost.com/,"User Experience / User Interface, Originality ...",no schedule,Public,"Jun 8, 2022, Online, Public, Chainlink Labs, B...",2022-04-22,2022-05-28,"Chainlink Grand Prize - $40,000, Chainlink NFT...","$1,019,000",...,May 28,2022,"Blockchain, Web, Fintech",$0,1481,False,Chainlink Labs,False,https://chainlinkspring2022.devpost.com/projec...,https://chainlinkspring2022.devpost.com/challe...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7048,7075,https://checkinproject.devpost.com/,biden,no schedule,Public,"Aug 30, 2021, Thái Lan, Public, ACM Monterrey,...",2021-08-30,2021-08-30,cash value,"$1,000",...,Aug 30,2021,Voice skills,$0,1,False,ACM Monterrey,True,https://checkinproject.devpost.com/project-gal...,https://checkinproject.devpost.com/challenges/...
7049,7076,https://fgtfrgf.devpost.com/,"Criteria 1, Criteria 1, criteria",no schedule,Public,"Jul 15, 2021, Online, Public, Dong Trinh LTD, ...",2021-07-15,2021-07-15,"1st place, 2nd place, 3rd place, 4nd","$1,100",...,Jul 15,2021,"AR/VR, Machine Learning/AI, Open Ended",$0,1,False,Dong Trinh LTD,True,https://fgtfrgf.devpost.com/project-gallery,https://fgtfrgf.devpost.com/challenges/start_a...
7050,7077,https://old-hacks.devpost.com/,TBA,no schedule,Public,"May 7, 2021, Quận 2, Vietnam, Public, Hackers ...",2021-05-07,2021-05-07,TBA,no money prize,...,May 07,2021,Open Ended,$0,1,False,Hackers Galore,True,https://old-hacks.devpost.com/project-gallery,https://old-hacks.devpost.com/challenges/start...
7051,7078,https://thailand-champion-13627.devpost.com/,biden,no schedule,Public,"Sep 15, 2021, Thái Lan, Public, Screen Compose...",2021-09-15,2021-09-15,Thailand cash,$100,...,Sep 15,2021,Voice skills,$0,0,False,Screen Composers Guild of Canada & SOCAN,True,https://thailand-champion-13627.devpost.com/pr...,https://thailand-champion-13627.devpost.com/ch...


In [9]:
# List the column labels of the hackathons frame.
df.columns

Index(['URL', 'Criteria', 'schedule', 'hack_type', 'info', 'start_date_format',
       'end_date_format', 'Prizes', 'prize_money', 'Title', 'Location', 'year',
       'themes', 'prize', 'featured', 'organization_name', 'winners_announced',
       'submission_gallery_url', 'start_a_submission_url'],
      dtype='object')

In [None]:
# Reference SQL kept as inert string literals: nothing in this cell is sent
# to the database.

# Schema of the hackathons table.
"""
DROP TABLE IF EXISTS hackathons;

CREATE TABLE hackathons (
    id SERIAL PRIMARY KEY,        
    url TEXT,
    hack_type TEXT,
    start_date_format DATE,
    end_date_format DATE,
    prizes TEXT,
    prize_money TEXT,
    location TEXT
);

"""

# psql bulk-load command. Written as a raw string so that "\COPY" is not
# parsed as an (invalid) escape sequence; the string value is unchanged.
# NOTE(review): the column list below names columns (criteria, schedule,
# info, title, year, ...) that the CREATE TABLE above does not define, so the
# two snippets do not match as written.
r""" 
\COPY hackathons(url,criteria,schedule,hack_type,info,start_date_format,end_date_format,prizes,prize_money,title,location,year,themes,prize,featured,organization_name,winners_announced,submission_gallery_url,start_a_submission_url) 
FROM 'E:/hackathon/hackathon/data/hackathons_clean_utf8.csv' DELIMITER ',' CSV HEADER;

"""

In [12]:
import psycopg2
from dotenv import load_dotenv
import os
from sqlalchemy import create_engine, text
from pathlib import Path
import pandas as pd

# Load the hackathons CSV and keep only the columns stored in the database.
# The path is a raw string: the previous plain literal only worked because
# "\h" and "\d" are unrecognised escapes that Python leaves as-is (with a
# warning). The resulting path is unchanged.
df = pd.read_csv(r"E:\hackathon\hackathon\data\hackathons.csv")
columns_to_keep = ['URL', 'hack_type', 'start_date_format', 'end_date_format', 'Prizes', 'prize_money', 'Location']
df = df[columns_to_keep]

# Database settings come from the .env file in the current working directory.
load_dotenv(dotenv_path=Path(Path.cwd(), ".env"))

dbname = os.getenv("DB_NAME")
user = os.getenv("DB_USER")
password = os.getenv("DB_PASSWORD")
host = os.getenv("DB_HOST", "localhost")  # default: local server
port = os.getenv("DB_PORT", "5432")       # default port used when unset

# Connection and cursor used for the hackathons table below.
conn = psycopg2.connect(user=user, password=password, host=host, port=port, dbname=dbname)
cur = conn.cursor()

# Drop and recreate the hackathons table, so any existing rows are discarded
# each time this cell runs.
recreate_hackathons_sql = """
DROP TABLE IF EXISTS hackathons;
CREATE TABLE hackathons (
    id SERIAL PRIMARY KEY,
    url TEXT,
    hack_type TEXT,
    start_date_format DATE,
    end_date_format DATE,
    prizes TEXT,
    prize_money TEXT,
    location TEXT
)
"""
cur.execute(recreate_hackathons_sql)
conn.commit()

# Insert every row of df into hackathons; values are bound positionally in
# the order of the frame's columns.
insert_hackathon_sql = """
        INSERT INTO hackathons (url, hack_type, start_date_format, end_date_format, prizes, prize_money, location)
        VALUES (%s,%s,%s,%s,%s,%s,%s)
        """

# pandas represents missing cells as float NaN, which would not be stored as
# SQL NULL; replace them with None so the driver sends NULL instead.
hackathon_rows = [
    tuple(None if pd.isna(value) else value for value in record)
    for record in df.itertuples(index=False, name=None)
]

try:
    cur.executemany(insert_hackathon_sql, hackathon_rows)
    conn.commit()
except Exception:
    # Discard the partial insert rather than leaving the transaction open.
    conn.rollback()
    raise
finally:
    cur.close()
    conn.close()
