In [None]:
!pip install sqlalchemy

In [None]:
!pip install tomli

In [None]:
!pip install psycopg2-binary

In [None]:
!pip install sentence-transformers umap-learn

In [None]:
# One-off load step: read the raw CSV and write it to the database table named
# in the project config.
import sys
from pathlib import Path
import pandas as pd

# Put the parent of the working directory on sys.path so the `src` package is
# importable (assumes the notebook is run from a sub-folder of the project).
PROJECT_ROOT = Path("..").resolve()
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

from src.data.loader import get_db_engine
from src.config import load_config

# Load config
cfg = load_config()
jobs_cfg = cfg['jobs']
table = jobs_cfg['table_name']  # presumably "job_postings" — confirm in config

# Load CSV from a project-relative location instead of a machine-specific
# absolute path, so the cell runs on any checkout of the project.
RAW_CSV_PATH = PROJECT_ROOT / "data" / "raw" / "companies_merged.csv"
df = pd.read_csv(RAW_CSV_PATH)

print("Rows:", df.shape[0])
print(df.columns)

# Save to PostgreSQL
# NOTE: if_exists='replace' drops the existing table before inserting, so every
# run of this cell overwrites whatever the table held before.
engine = get_db_engine()
df.to_sql(table, engine, if_exists='replace', index=False)

print("DB Load Complete!")


Cell 1 – imports & config

In [None]:
# Standard library
import sys
from pathlib import Path

# Third-party
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# project root = parent of the notebooks folder; appended to sys.path so the
# local `src` package can be imported below.
PROJECT_ROOT = Path.cwd().parent
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))

from src.data.loader import load_jobs

# Plot defaults: figure size first, then the seaborn theme (order kept as-is).
plt.rcParams.update({"figure.figsize": (10, 5)})
sns.set()


Cell 2 – load a sample of jobs

In [None]:
# Imported again here so this cell can be run without the config cell's import.
from src.data.loader import load_jobs

# Request a bounded sample via `limit` (presumably a row cap — confirm in
# src.data.loader) and preview the first five rows.
jobs = load_jobs(limit=2_000)
jobs.head(n=5)


Cell 3 – basic info

In [None]:
# Print the frame's structural summary (columns, dtypes, non-null counts).
jobs.info()

# Descriptive statistics over all columns, transposed so each row describes
# one column; only the first 20 rows are shown.
(
    jobs
    .describe(include="all")
    .T
    .head(20)
)

Cell 4 – text length distribution

In [None]:
text_col = "description"  # adjust if needed

# Character count per description. Missing values are replaced with "" first:
# astype(str) alone would turn NaN into the literal string "nan" and report a
# length of 3 for rows that have no description at all.
jobs["text_len"] = jobs[text_col].fillna("").astype(str).str.len()

# A bare expression is only rendered when it is the last line of a cell, so the
# summary statistics are displayed explicitly.
display(jobs["text_len"].describe())

sns.histplot(jobs["text_len"], bins=50)
plt.title("Distribution of job description length")
plt.show()


Cell 5 – top job titles / locations

In [None]:
# For each column of interest that exists in the frame, show its 20 most
# frequent values (title first, then city).
for col in ("title", "city"):
    if col in jobs.columns:
        display(jobs[col].value_counts().head(20))


Cell 6 – quick embedding + scatter

If you want a very simple demo for the client:

In [None]:
# Embed a sample of job descriptions, project the embeddings to 2-D with UMAP,
# and draw them as a scatter plot.
from sentence_transformers import SentenceTransformer
import umap

model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# pick a text column name you actually have, e.g. "description"
text_col = "description"

# Sample at most 2000 rows; the fixed seed keeps the sample reproducible.
n = min(len(jobs), 2000)
sample = jobs.sample(n, random_state=42)

# The encoder expects strings. Missing descriptions come through as NaN
# (a float), so replace them with "" and coerce everything to str first.
texts = sample[text_col].fillna("").astype(str).tolist()
emb = model.encode(texts, show_progress_bar=True)

# Reduce the embeddings to two dimensions (seeded for repeatable layouts).
reducer = umap.UMAP(random_state=42)
emb_2d = reducer.fit_transform(emb)

# Attach the 2-D coordinates to the sampled rows for plotting.
sample["x"] = emb_2d[:, 0]
sample["y"] = emb_2d[:, 1]

sns.scatterplot(
    data=sample,
    x="x",
    y="y",
    alpha=0.5,
    s=10,
)
plt.title("UMAP projection of job description embeddings")
plt.show()

