In [1]:
import torch

# Report whether PyTorch can see a CUDA device and, when it can, name device 0.
print("CUDA available:", torch.cuda.is_available())
if not torch.cuda.is_available():
    print("No GPU detected.")
else:
    print("GPU device name:", torch.cuda.get_device_name(0))

CUDA available: True
GPU device name: NVIDIA GeForce RTX 4070


In [5]:
import os
import json
from dataclasses import dataclass, field
from typing import Optional, Dict, Any

from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from peft import LoraConfig, get_peft_model
from trl import SFTTrainer

# Ensure torch is available for device selection
import torch

In [7]:
# Model identifier passed to from_pretrained(); it is the default `model_name` of answer_question.
MODEL_NAME = "Qwen/Qwen2.5-3B-Instruct"

# Dataset directory, relative to the notebook's working directory.
DATA_DIR = ".//dataset"

# Output directory name — presumably for the LoRA fine-tuning artifacts; not used by the cells shown here.
OUTPUT_DIR = ".//qwen25_resume_lora"


In [7]:
# Sanity check: confirm the dataset directory is present before anything is read from it.
print("Directory exists" if os.path.isdir(DATA_DIR) else "Directory does not exist")

Directory exists


In [10]:
def answer_question(question: str, model_name: str = MODEL_NAME, max_new_tokens: int = 50, device: Optional[str] = None) -> str:
    """Generate an answer to ``question`` with a causal chat model and return it as text.

    The tokenizer and model are loaded from ``model_name`` on every call, so each
    invocation pays the full load cost (and a download the first time the
    checkpoint is used). The question is wrapped as a single user message,
    rendered with the tokenizer's chat template, and decoded with sampling
    disabled (``do_sample=False``).

    Args:
        question: The user question to send to the model.
        model_name: Checkpoint name or path passed to ``from_pretrained``.
        max_new_tokens: Upper bound on the number of tokens to generate.
        device: Target device string. When ``None``, ``'cuda'`` is used if
            available, otherwise ``'cpu'``.

    Returns:
        The decoded text of the newly generated tokens only (the prompt tokens
        are sliced off), with special tokens skipped.
    """
    # Local import so this cell does not depend on changes to the import cell.
    from collections.abc import Mapping

    # Select device: provided device > CUDA if available > CPU
    if device is None:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Load tokenizer and model (this will download weights the first time)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

    # Build chat-style inputs expected by the tokenizer helper
    messages = [{"role": "user", "content": question}]
    inputs = tokenizer.apply_chat_template(messages, return_tensors="pt", add_generation_prompt=True)

    # apply_chat_template may return either a plain tensor of input ids or a
    # dict-like encoding. The dict-like form (BatchEncoding) is a Mapping but
    # not a ``dict`` subclass, so test against Mapping; an ``isinstance(..., dict)``
    # check would send it down the tensor branch and fail at generate()/.shape.
    if isinstance(inputs, Mapping):
        inputs = {k: (v.to(model.device) if hasattr(v, 'to') else v) for k, v in inputs.items()}
        input_ids = inputs.get('input_ids')
        outputs = model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False)
    else:
        input_ids = inputs.to(model.device)
        outputs = model.generate(input_ids, max_new_tokens=max_new_tokens, do_sample=False)

    # If model.generate returned a tuple, take first element
    if isinstance(outputs, tuple):
        outputs = outputs[0]

    # The output sequence starts with the prompt tokens; remember how many to skip.
    input_len = input_ids.shape[1] if input_ids is not None else 0

    # Slice out the newly generated tokens and decode
    generated = outputs[0, input_len:] if outputs.dim() == 2 else outputs[input_len:]
    return tokenizer.decode(generated, skip_special_tokens=True)

In [12]:
# Example usage: ask one simple question with the default model and settings.
# Note: answer_question loads the tokenizer and model on every call, so this cell is slow.
question = "What is the capital of France?"
print(answer_question(question))

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.47it/s]



The capital of France is Paris.


In [28]:
import pandas as pd

# Load the raw resume records (JSON Lines: one JSON object per line).
# The path is built from DATA_DIR instead of a hard-coded absolute user path,
# so the notebook runs on any machine where the dataset directory sits next to it.
df = pd.read_json(os.path.join(DATA_DIR, 'resume_text.jsonl'), lines=True)
print(df)

                                          filename filetype  \
0                   00_Willie_Ellis_Go_Python.docx    .docx   
1                             10272022 Resume.docx    .docx   
2     14cd13cf-d8db-400e-aa03-6948b7c9c3df.pdf.pdf     .pdf   
3                                1611617001435.pdf     .pdf   
4         1686467115510_BHUSHANABOINA MOUKSHA.docx    .docx   
...                                            ...      ...   
2123                        Younus Nobi resume.pdf     .pdf   
2124                        Zachary Darabaris.docx    .docx   
2125                            ZHAOYANG CHEN.docx    .docx   
2126                  Ziad-Sr Product Manager.docx    .docx   
2127          Zouheir-Sr_Data_Science_NTT_Data.doc     .doc   

                                            resume_text  has_images  
0     Willie Ellis \nSenior Software Engineer\n\nBuf...       False  
1     SUMMARY\n\nLeverage my skills, education, and ...       False  
2     James F. Chams  \nPhone: 1 

In [4]:
# Summarize the frame: column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2128 entries, 0 to 2127
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   filename     2128 non-null   object
 1   filetype     2128 non-null   object
 2   resume_text  2128 non-null   object
 3   has_images   2128 non-null   bool  
dtypes: bool(1), object(3)
memory usage: 52.1+ KB


In [5]:
print("Before filtering length:", len(df))
# Keep a row unless it both has images and has empty text
# (De Morgan form: keep when it has no images OR has some text).
df = df[(df['has_images'] != True) | (df['resume_text'] != "")]
print("After filtering length:", len(df))

Before filtering length: 2128
After filtering length: 2095


In [6]:
print("Before filtering length:", len(df))
# Keep only the rows whose has_images flag is not True.
df = df[df['has_images'] != True]
print("After filtering length:", len(df))

Before filtering length: 2095
After filtering length: 1593


In [15]:
import re

# Normalize raw text to single-spaced ASCII.
def clean_text(text):
    """Return ``text`` reduced to ASCII with whitespace collapsed to single spaces.

    Whitespace is normalized before non-ASCII characters are dropped. The regex
    whitespace class is Unicode-aware for ``str`` patterns, so characters such
    as the non-breaking space (U+00A0) become a regular space; stripping
    non-ASCII first would delete them and glue the neighbouring words together.

    Args:
        text: Raw string to clean. Non-string values (e.g. ``None`` or ``NaN``)
            are treated as empty.

    Returns:
        The cleaned string with no leading/trailing whitespace, or ``""`` for
        non-string input.
    """
    if not isinstance(text, str):
        return ""
    # Turn every whitespace run (including Unicode whitespace) into one plain space.
    text = re.sub(r'\s+', ' ', text)
    # Remove non-ASCII characters
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    # Dropping characters can leave adjacent spaces behind; collapse again and trim.
    return re.sub(r'\s+', ' ', text).strip()

# Apply cleaning to the resume_text and filename columns.
# assign() builds a new frame and does not write columns into `df`, which at this
# point is a boolean-filtered selection; assigning into such a selection can raise
# SettingWithCopyWarning.
df = df.assign(
    resume_text=df['resume_text'].apply(clean_text),
    filename=df['filename'].apply(clean_text),
)

# Drop rows whose resume_text is empty or too short after cleaning (10 characters or fewer).
df = df[df['resume_text'].str.len() > 10]

In [20]:
# Remove every character that is neither a word character nor whitespace (punctuation
# and symbols), then collapse whitespace runs to a single space and trim the ends
# (deleting leading/trailing punctuation can otherwise leave a stray edge space).
# NOTE(review): this also deletes '@', '.', '+', '#' and '-', so e-mail addresses, URLs
# and terms such as "C++" lose their punctuation — confirm this is intended for the dataset.
# assign() is used so the column is not written into a filtered selection of the frame.
df = df.assign(
    resume_text=df['resume_text']
    .str.replace(r'[^\w\s]', '', regex=True)
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)

In [None]:
import os
DATA_DIR = ".//dataset"  # same value as the earlier configuration cell; re-declared here (presumably so this cell can run standalone)

# Disabled CSV export, kept for reference.
# df.to_csv(os.path.join(DATA_DIR, 'resumes.csv'), index=False)
# Write the cleaned frame to DATA_DIR as a Snappy-compressed Parquet file using the pyarrow engine.
df.to_parquet(os.path.join(DATA_DIR, 'cleaned_resumes.parquet'), engine='pyarrow', compression='snappy')

In [None]:
# Round-trip check: read the Parquet file back with pyarrow and display the result.
df_loaded = pd.read_parquet(os.path.join(DATA_DIR, 'cleaned_resumes.parquet'), engine='pyarrow')
df_loaded

Unnamed: 0,filename,filetype,resume_text,has_images
0,00_Willie_Ellis_Go_Python.docx,.docx,Willie Ellis Senior Software Engineer Buffalo ...,False
1,10272022 Resume.docx,.docx,SUMMARY Leverage my skills education and exper...,False
2,14cd13cf-d8db-400e-aa03-6948b7c9c3df.pdf.pdf,.pdf,James F Chams Phone 1 513 289 3847 Email James...,False
3,1611617001435.pdf,.pdf,Anne Chagnon 1707 Fry St 11 Falcon Heights MN ...,False
6,1KshitijaPatel_Resume.docx,.docx,Kshitija Patel Email patelkshitija1302gmailcom...,False
...,...,...,...,...
2122,Yohannes Dabera.docx,.docx,Name Yohannes H Cell 8137667902 Johnnyhailu21g...,False
2123,Younus Nobi resume.pdf,.pdf,Younus Nobi Email younusnobi996gmailcom Cell P...,False
2124,Zachary Darabaris.docx,.docx,Zachary Darabaris Zach graduated from Wabash C...,False
2125,ZHAOYANG CHEN.docx,.docx,ZHAOYANG CHEN 6318385994 zhaoyangchen0813gmail...,False
