In [None]:
pip install torch transformers scikit-learn


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [None]:
# Import necessary libraries
import pandas as pd
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import torch



In [None]:
# Pick the GPU when one is present, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Read the job postings CSV into a DataFrame
df = pd.read_csv('/content/job_postings.csv')

# Preview the first rows as the cell output
df.head()


Using device: cpu


Unnamed: 0,job_link,last_processed_time,last_status,got_summary,got_ner,is_being_worked,job_title,company,job_location,first_seen,search_city,search_country,search_position,job_level,job_type
0,https://www.linkedin.com/jobs/view/senior-mach...,2024-01-21 08:08:48.031964+00,Finished NER,t,t,f,Senior Machine Learning Engineer,Jobs for Humanity,"New Haven, CT",2024-01-14,East Haven,United States,Agricultural-Research Engineer,Mid senior,Onsite
1,https://www.linkedin.com/jobs/view/principal-s...,2024-01-20 04:02:12.331406+00,Finished NER,t,t,f,"Principal Software Engineer, ML Accelerators",Aurora,"San Francisco, CA",2024-01-14,El Cerrito,United States,Set-Key Driver,Mid senior,Onsite
2,https://www.linkedin.com/jobs/view/senior-etl-...,2024-01-21 08:08:31.941595+00,Finished NER,t,t,f,Senior ETL Data Warehouse Specialist,Adame Services LLC,"New York, NY",2024-01-14,Middletown,United States,Technical Support Specialist,Associate,Onsite
3,https://www.linkedin.com/jobs/view/senior-data...,2024-01-20 15:30:55.796572+00,Finished NER,t,t,f,Senior Data Warehouse Developer / Architect,Morph Enterprise,"Harrisburg, PA",2024-01-12,Lebanon,United States,Architect,Mid senior,Onsite
4,https://www.linkedin.com/jobs/view/lead-data-e...,2024-01-21 08:08:58.312124+00,Finished NER,t,t,f,Lead Data Engineer,Dice,"Plano, TX",2024-01-14,McKinney,United States,Maintenance Data Analyst,Mid senior,Onsite


In [None]:
# Build one multi-line "context" string per posting from its descriptive columns.
# Each entry pairs the label prefix with the column it is read from; missing
# values are rendered as 'N/A'.
context_fields = [
    ("Job Title: ", 'job_title'),
    ("Company: ", 'company'),
    ("Location: ", 'job_location'),
    ("Job Level: ", 'job_level'),
    ("Job Type: ", 'job_type'),
]
context_lines = [label + df[column].fillna('N/A') for label, column in context_fields]

# Join the labelled lines with newlines, in the order listed above
combined_context = context_lines[0]
for context_line in context_lines[1:]:
    combined_context = combined_context + "\n" + context_line

df['context'] = combined_context


In [None]:
# Generate the question-answer pairs used for fine-tuning.
# Every posting yields one example per template: the question text paired with
# the column that holds its answer.
question_templates = [
    ("What is the job title?", 'job_title'),
    ("Which company is hiring?", 'company'),
    ("Where is the job located?", 'job_location'),
    ("What is the job level?", 'job_level'),
    ("Is it a remote or onsite job?", 'job_type'),
]

qa_data = [
    {"context": posting['context'], "question": question, "answer": posting[answer_column]}
    for _, posting in df.iterrows()
    for question, answer_column in question_templates
]

qa_df = pd.DataFrame(qa_data)

In [None]:
# Fetch the pretrained T5 checkpoint: tokenizer first, then the model,
# which is moved onto the selected device
pretrained_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(pretrained_name)
model = T5ForConditionalGeneration.from_pretrained(pretrained_name)
model = model.to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
# Turn each QA pair into a model prompt ("question: ... context: ...") and
# collect the matching answer as the training target
input_texts = []
target_texts = []
for _, qa_row in qa_df.iterrows():
    input_texts.append("question: " + qa_row['question'] + " context: " + qa_row['context'])
    target_texts.append(qa_row['answer'])

In [None]:
# Split the data into training and validation sets (80% / 20%).
# A fixed random_state makes the shuffle deterministic, so a fresh
# "Restart & Run All" reproduces the same train/validation membership.
train_inputs, val_inputs, train_targets, val_targets = train_test_split(
    input_texts,
    target_texts,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Tokenize the inputs and targets
# Convert targets to string and handle NaN values
train_targets = [str(t) if pd.notna(t) else "N/A" for t in train_targets]
val_targets = [str(t) if pd.notna(t) else "N/A" for t in val_targets]

# Tokenize the prompts (question + context), padded to the longest example
train_encodings = tokenizer(train_inputs, padding=True, truncation=True, max_length=512, return_tensors="pt")
val_encodings = tokenizer(val_inputs, padding=True, truncation=True, max_length=512, return_tensors="pt")

# Tokenize the targets once (this was previously done twice with identical calls)
train_targets_enc = tokenizer(train_targets, padding=True, truncation=True, max_length=128, return_tensors="pt")
val_targets_enc = tokenizer(val_targets, padding=True, truncation=True, max_length=128, return_tensors="pt")

# Replace padding ids in the label tensors with -100, the index the model's
# cross-entropy loss ignores. Without this the loss is also computed on padded
# positions, rewarding the model for predicting padding.
for targets_enc in (train_targets_enc, val_targets_enc):
    label_ids = targets_enc["input_ids"]
    label_ids[label_ids == tokenizer.pad_token_id] = -100


In [None]:
# Create a custom dataset class
class QADataset(torch.utils.data.Dataset):
    """Dataset pairing tokenized prompts with tokenized answers.

    Args:
        encodings: mapping-like tokenizer output for the prompts; must expose
            ``.items()`` and an ``input_ids`` attribute.
        targets: tokenizer output for the answers; its ``input_ids`` are
            returned under the ``'labels'`` key.
    """

    def __init__(self, encodings, targets):
        self.encodings = encodings
        self.targets = targets

    def __len__(self):
        """Return the number of examples (rows of ``encodings.input_ids``)."""
        return len(self.encodings.input_ids)

    @staticmethod
    def _to_tensor(value):
        """Return an independent tensor copy of ``value``.

        ``torch.tensor(existing_tensor)`` emits a UserWarning, so tensors are
        copied with ``clone().detach()``; plain lists/numbers still go through
        ``torch.tensor``.
        """
        if isinstance(value, torch.Tensor):
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return one example as a dict of tensors, including ``'labels'``."""
        item = {key: self._to_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self._to_tensor(self.targets.input_ids[idx])
        return item

In [None]:
# Wrap each tokenized split (prompts + answers) in a QADataset
train_dataset, val_dataset = (
    QADataset(split_encodings, split_targets)
    for split_encodings, split_targets in (
        (train_encodings, train_targets_enc),
        (val_encodings, val_targets_enc),
    )
)


In [None]:
# Configure the fine-tuning run
training_args = TrainingArguments(
    # Output locations and logging cadence
    output_dir='./t5-job-qa',
    logging_dir='./logs',
    logging_steps=200,
    # Training length and per-device batch sizes
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluate and save once per epoch, and reload the best checkpoint at the end
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)



In [None]:
# Build the Trainer from the model, the run configuration and both dataset splits
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)

In [None]:
# Start training the model
# Runs the fine-tuning loop on the `trainer` object created in an earlier cell.
# NOTE: the recorded output of this cell shows Weights & Biases (wandb) logging
# being initialised and logged in when training starts.
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mmaheshwarraobandi14[0m ([33mmaheshwarraobandi14-university-of-missouri-kansas-city[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.targets.input_ids[idx])
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss


In [None]:
# Persist the fine-tuned weights and the tokenizer files into the same directory
finetuned_dir = './t5-job-qa-finetuned'
model.save_pretrained(finetuned_dir)
tokenizer.save_pretrained(finetuned_dir)


In [None]:
# Inference: ask one sample question about the first job posting
example_question = "What is the job title?"
example_context = df['context'].iloc[0]

# Format the prompt the same way the training inputs were formatted
input_text = f"question: {example_question} context: {example_context}"

# Tokenize, move the ids to the model's device, generate, then decode
encoded_prompt = tokenizer(input_text, return_tensors="pt")
input_ids = encoded_prompt.input_ids.to(device)
output_ids = model.generate(input_ids)
answer = tokenizer.decode(output_ids[0], skip_special_tokens=True)

print("\nExample Question:", example_question)
print("Generated Answer:", answer)


In [None]:
!pip install streamlit


In [None]:
%%writefile app.py
import streamlit as st
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
import pandas as pd

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
st.sidebar.success(f"Using device: {device}")

# Load the fine-tuned model and tokenizer
model = T5ForConditionalGeneration.from_pretrained('t5-small').to(device)
tokenizer = T5Tokenizer.from_pretrained('t5-small')

# Sample job postings dataset (Replace this with your actual dataset)
data = {
    'job_title': ['Senior Machine Learning Engineer', 'Lead Data Engineer'],
    'company': ['Jobs for Humanity', 'Dice'],
    'job_location': ['New Haven, CT', 'Plano, TX'],
    'job_level': ['Mid senior', 'Mid senior'],
    'job_type': ['Onsite', 'Onsite']
}

df = pd.DataFrame(data)
df['context'] = (
    "Job Title: " + df['job_title'].fillna('N/A') + "\n" +
    "Company: " + df['company'].fillna('N/A') + "\n" +
    "Location: " + df['job_location'].fillna('N/A') + "\n" +
    "Job Level: " + df['job_level'].fillna('N/A') + "\n" +
    "Job Type: " + df['job_type'].fillna('N/A')
)

# Streamlit UI
st.title("Job Posting Question Answering System")
st.write("Ask questions about job titles, companies, locations, job levels, and job types!")

# Select a job posting
job_index = st.selectbox("Select a Job Posting:", df.index)
selected_context = df['context'].iloc[job_index]
st.subheader("Selected Job Posting Details")
st.text(selected_context)

# Input question from user
user_question = st.text_input("Enter your question:")
generate_button = st.button("Generate Answer")

# Generate and display answer
if generate_button and user_question:
    with st.spinner("Generating answer..."):
        input_text = f"question: {user_question} context: {selected_context}"
        input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(device)
        output_ids = model.generate(input_ids)
        answer = tokenizer.decode(output_ids[0], skip_special_tokens=True)

        st.subheader("Generated Answer")
        st.success(answer)


In [None]:
!pip install pyngrok


In [None]:
# Register the ngrok authtoken without hard-coding it in the notebook.
# SECURITY: a real authtoken was previously committed in this cell; treat it as
# compromised and revoke it in the ngrok dashboard.
import os
from getpass import getpass

from pyngrok import ngrok

# Prefer the NGROK_AUTHTOKEN environment variable; otherwise prompt without echoing
ngrok_authtoken = os.environ.get("NGROK_AUTHTOKEN") or getpass("Enter your ngrok authtoken: ")
ngrok.set_auth_token(ngrok_authtoken)

In [None]:
# Open an HTTP tunnel to local port 8501 for the Streamlit app
# (presumably Streamlit's default port, since the plain `streamlit run app.py`
# cell passes no port — confirm)
from pyngrok import ngrok

public_url = ngrok.connect('8501', "http")
print(f"Streamlit app is live at: {public_url}")


In [None]:
# Launch the Streamlit app as a background shell job (trailing `&`) so the cell returns.
# `&>/dev/null` is bash syntax for discarding stdout and stderr — assumes the
# notebook's shell is bash; confirm if running outside Colab.
!streamlit run app.py &>/dev/null &


In [None]:
# Start the app in the background, bound to 0.0.0.0 on port 8502, with its output
# sent to /dev/null (bash `&>` syntax — assumes the notebook's shell is bash).
!streamlit run app.py --server.address=0.0.0.0 --server.port=8502 &>/dev/null &
# Open an HTTP tunnel to that port; relies on `ngrok` having been imported by an earlier cell.
public_url = ngrok.connect('8502', "http")
print(f"Streamlit app is live at: {public_url}")
