In [None]:
!pip install -U sentence-transformers datasets huggingface_hub




In [None]:
# train.py

import os
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer

# ========= Paths =========
# Input CSV locations; update these if the files live somewhere else.
RESUME_CSV = "/content/Resume.csv"
JOB_CSV = "/content/job_descriptions.csv"

# ========= Load Datasets (robust) =========
print("📂 Loading datasets...")

# Both files are read with identical tolerant settings: the pure-Python parser,
# UTF-8 decoding, and on_bad_lines="skip" so rows the parser rejects are
# dropped instead of aborting the load. Resumes are read first, then jobs.
resumes_df, jobs_df = (
    pd.read_csv(csv_path, engine="python", encoding="utf-8", on_bad_lines="skip")
    for csv_path in (RESUME_CSV, JOB_CSV)
)

print(f"✅ Resumes Loaded: {len(resumes_df)} | Jobs Loaded: {len(jobs_df)}")

# ========= Preprocessing =========
def clean_text(text):
    """Normalize one raw resume / job-description value.

    Args:
        text: The cell value to clean. Anything that is not a ``str``
            (for example ``None`` or a float NaN) is treated as missing.

    Returns:
        ``""`` for non-string input; otherwise the lower-cased text with
        every newline and carriage return replaced by a single space and
        leading/trailing whitespace removed.
    """
    if isinstance(text, str):
        lowered = text.lower()
        # Swap both line-break characters for a space in a single pass.
        flattened = lowered.translate(str.maketrans("\n\r", "  "))
        return flattened.strip()
    return ""

# Accepted text-column names, in priority order (different versions of the
# datasets name the column differently).
RESUME_TEXT_COLUMNS = ("Resume_str", "Resume", "Resume_html")
JOB_TEXT_COLUMNS = ("Job Description", "description")


def _pick_text_column(df, candidates, label):
    """Return the first name in `candidates` that is a column of `df`.

    Args:
        df: DataFrame whose columns are searched.
        candidates: Column names to try, highest priority first.
        label: Human-readable dataset name used in the error message.

    Returns:
        The first candidate present in ``df.columns``.

    Raises:
        ValueError: If none of the candidates is present. The message lists
            every accepted name and the columns that were actually found.
    """
    for name in candidates:
        if name in df.columns:
            return name
    accepted = ", ".join(repr(name) for name in candidates)
    raise ValueError(
        f"❌ {label} CSV must have one of these columns: {accepted} "
        f"(found: {list(df.columns)})"
    )


# Handle Resume column.
# NOTE(review): "Resume_html" presumably holds HTML markup; clean_text does not
# strip tags, so that fallback would embed the markup as-is — confirm intent.
resume_text_col = _pick_text_column(resumes_df, RESUME_TEXT_COLUMNS, "Resume")
resumes_df["cleaned"] = resumes_df[resume_text_col].apply(clean_text)

# Handle Job column
job_text_col = _pick_text_column(jobs_df, JOB_TEXT_COLUMNS, "Job")
jobs_df["cleaned"] = jobs_df[job_text_col].apply(clean_text)

# ========= Embeddings =========
# Load the sentence-embedding model by its checkpoint name.
print("⚡ Loading embedding model...")
model = SentenceTransformer("all-MiniLM-L6-v2")

# Each corpus is encoded from its "cleaned" column, as a plain list of
# strings taken in DataFrame row order.
print("📝 Encoding resumes...")
resume_texts = resumes_df["cleaned"].tolist()
resume_embeddings = model.encode(resume_texts, show_progress_bar=True)

print("📝 Encoding jobs...")
job_texts = jobs_df["cleaned"].tolist()
job_embeddings = model.encode(job_texts, show_progress_bar=True)

# ========= Save Locally =========
print("💾 Saving embeddings and datasets...")
os.makedirs("models", exist_ok=True)

# DataFrames (including the added "cleaned" column) are written as CSV
# without the index column.
for frame, csv_path in (
    (resumes_df, "models/resumes.csv"),
    (jobs_df, "models/jobs.csv"),
):
    frame.to_csv(csv_path, index=False)

# Embedding arrays are written with np.save, one .npy file per corpus.
for vectors, npy_path in (
    (resume_embeddings, "models/resume_embeddings.npy"),
    (job_embeddings, "models/job_embeddings.npy"),
):
    np.save(npy_path, vectors)

# Persist the embedding model itself next to the data it produced.
model.save("models/embedding_model")

print("✅ Training complete! Models and embeddings saved at ./models")


📂 Loading datasets...
✅ Resumes Loaded: 2484 | Jobs Loaded: 137976
⚡ Loading embedding model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

📝 Encoding resumes...


Batches:   0%|          | 0/78 [00:00<?, ?it/s]

📝 Encoding jobs...


Batches:   0%|          | 0/4312 [00:00<?, ?it/s]

💾 Saving embeddings and datasets...
✅ Training complete! Models and embeddings saved at ./models


In [None]:
# Authenticate this session with the Hugging Face Hub.
# SECURITY: never paste an access token into a notebook cell — cell contents
# are saved and shared with the notebook.
from huggingface_hub import login
login()   # shows an interactive prompt; enter the token there, not in a code cell


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# SECURITY: a Hugging Face access token had been pasted into this cell as plain
# text and has been removed. Because it was saved with the notebook it must be
# treated as compromised: revoke it at https://huggingface.co/settings/tokens
# and create a new one. Supply tokens only through the login() prompt or the
# Colab `HF_TOKEN` secret, never as cell content.

In [None]:
from sentence_transformers import SentenceTransformer

# Write `model` to the local directory ./resume-matcher-model.
# This cell does not create `model`; it must already exist in the kernel
# (it is assigned in the embedding cell earlier in this notebook).
model.save("./resume-matcher-model")


In [None]:

# Push `model` to the Hugging Face Hub repository "maham234/resume-matcher-model".
# No explicit create_repo call is made because the repo already exists.
# NOTE(review): presumably requires a prior successful login() with a token
# that has write access — confirm before running on a fresh kernel.
model.push_to_hub("maham234/resume-matcher-model")


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  /tmp/tmp5cf_0v4u/model.safetensors    :  55%|#####5    | 50.2MB / 90.9MB            

'https://huggingface.co/maham234/resume-matcher-model/commit/fad0edcbf495ea0d651eb707d0a0d9794c62f43a'