In [None]:
import pandas as pd
import os
import pandas as pd
# Switch pandas to copy-on-write mode for the rest of the session.
pd.options.mode.copy_on_write = True


In [None]:
# Colab helper: call files.upload() and keep its return value in `uploaded`.
# NOTE(review): presumably this is how naukri_com-job_sample.csv reaches the
# working directory before the read_csv cell below — confirm.
from google.colab import files
uploaded = files.upload()

In [None]:
# Read naukri_com-job_sample.csv into `df`; the fillna and preprocessing cells
# further down operate on this frame.
df = pd.read_csv('naukri_com-job_sample.csv')
# Show the first rows as the cell output.
df.head()

In [None]:
def load_and_clean_data(filepath, columns=('jobtitle', 'jobdescription', 'skills', 'education')):
    """Read a CSV and keep only the complete rows of the selected columns.

    Parameters
    ----------
    filepath : str or path-like
        Location of the CSV file, passed straight to ``pd.read_csv``.
    columns : sequence of str, optional
        Columns to keep, in the order they should appear in the result.
        Defaults to the four text columns used throughout this notebook.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only ``columns``, with every row that has a
        missing value in any of them removed and the index renumbered from 0.

    Raises
    ------
    KeyError
        If a requested column is absent from the file.
    """
    raw = pd.read_csv(filepath)
    # Chain the steps instead of calling inplace=True on a column selection:
    # the result is always a fresh frame, whatever the copy-on-write setting.
    return raw[list(columns)].dropna().reset_index(drop=True)


In [None]:
# Run the cleaning helper on the sample file and show the result as cell output.
# NOTE(review): the returned frame is not assigned to a name; the fillna and
# preprocessing cells below work on the `df` loaded with pd.read_csv instead.
load_and_clean_data("naukri_com-job_sample.csv")

In [None]:
import pandas as pd
import numpy as np
import pickle
import nltk
import re
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download the NLTK data packages used for tokenising, stop-word removal and
# lemmatising. The same packages are requested again in later cells.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

In [None]:
# Replace missing values in the four text columns with empty strings; the
# preprocessing cell further down calls string methods on every value in them.
df[['jobtitle', 'jobdescription', 'skills', 'education']] = df[['jobtitle', 'jobdescription', 'skills', 'education']].fillna('')


In [None]:
import nltk

# Download the punkt tokenizer into a custom directory and make sure NLTK
# searches that directory when it loads resources.
NLTK_DATA_DIR = '/content/nltk_data'
nltk.download('punkt', download_dir=NLTK_DATA_DIR)
# Guard the append so re-running this cell does not keep adding duplicate
# entries to NLTK's search path.
if NLTK_DATA_DIR not in nltk.data.path:
    nltk.data.path.append(NLTK_DATA_DIR)


In [None]:
import nltk

# NOTE(review): these repeat downloads that other cells of this notebook also perform.
nltk.download('punkt')                    # Tokenizer models used by nltk.word_tokenize
nltk.download('stopwords')               # Stop-word lists read through stopwords.words()
nltk.download('wordnet')                 # WordNet data backing WordNetLemmatizer
nltk.download('averaged_perceptron_tagger')  # POS tagger model; no tagging call appears in the cells visible here


In [None]:
# prompt: I want to do NLP operation do Preprocess and Vectorize

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
from sklearn.feature_extraction.text import TfidfVectorizer

# Fetch the NLTK data packages before building the helpers that read them.
for nltk_package in ('stopwords', 'wordnet', 'averaged_perceptron_tagger', 'punkt', 'punkt_tab'):
    nltk.download(nltk_package)


# Shared helpers for preprocess_text: the lemmatizer, and the English
# stop-word list held as a set for membership tests.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Normalise a piece of free text into a space-separated string of lemmas.

    Steps: lowercase, delete every character that is not an ASCII letter or
    whitespace, tokenise with ``nltk.word_tokenize``, drop tokens found in the
    module-level ``stop_words`` set, and lemmatise the rest with the
    module-level ``lemmatizer``.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (``None``, ``NaN``, numbers) is treated
        as missing, matching the later definition of this helper in the
        notebook.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; ``""`` for non-string input.
    """
    # Missing cells arrive from pandas as float NaN; return early instead of
    # failing on ``.lower()``.
    if not isinstance(text, str):
        return ""
    # NOTE(review): matched characters are deleted, not replaced by a space,
    # so "python/sql" collapses to "pythonsql" — confirm this is intended.
    cleaned = re.sub(r'[^a-zA-Z\s]', '', text.lower())
    tokens = nltk.word_tokenize(cleaned)
    # Drop stop words first, then lemmatise what is left.
    return ' '.join(
        lemmatizer.lemmatize(token) for token in tokens if token not in stop_words
    )

# Run preprocess_text over each raw text column and store the result in a
# matching "processed_<column>" column.
source_columns = ['jobtitle', 'jobdescription', 'skills', 'education']
for column in source_columns:
    df[f'processed_{column}'] = df[column].apply(preprocess_text)

# Build one document per row by joining the processed columns with single
# spaces, in the same order as above.
processed_columns = [f'processed_{column}' for column in source_columns]
combined = df[processed_columns[0]]
for processed_column in processed_columns[1:]:
    combined = combined + ' ' + df[processed_column]
df['combined_text'] = combined

# Fit a TF-IDF vectorizer (max_features=5000) on the combined documents and
# keep both the fitted vectorizer and the resulting matrix.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
tfidf_matrix = tfidf_vectorizer.fit_transform(df['combined_text'])

# Text summary of what was produced; the matrix shape is the cell's final value.
# NOTE(review): the first printout is labelled "Original" but `df` already
# carries the processed columns at this point.
print("Original DataFrame head:")
print(df.head())
print("\nProcessed DataFrame head (with new columns):")
print(df[processed_columns + ['combined_text']].head())
print("\nShape of TF-IDF matrix:")
tfidf_matrix.shape

In [None]:
# Load NLTK resources only once per environment
import nltk

# Map each NLTK package name to the resource path nltk.data.find() is asked for.
# 'punkt_tab' is included because the preprocessing cell above downloads it too.
resources = {
    'stopwords': 'corpora/stopwords',
    'punkt': 'tokenizers/punkt',
    'wordnet': 'corpora/wordnet',
    'averaged_perceptron_tagger': 'taggers/averaged_perceptron_tagger',
    'omw-1.4': 'corpora/omw-1.4',
    'punkt_tab': 'tokenizers/punkt_tab'
}

# Download only what is missing: a LookupError from find() triggers the download.
for name, path in resources.items():
    try:
        nltk.data.find(path)
    except LookupError:
        nltk.download(name)


In [None]:
# Write the DataFrame, including the processed columns, to CSV without the index.
df.to_csv('processed_job_data.csv', index=False)

# Pickle the fitted TF-IDF vectorizer next to it.
with open('tfidf_vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(tfidf_vectorizer, vectorizer_file)

print("\nProcessed data and vectorizer saved.")

In [None]:
import pandas as pd
import numpy as np
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Fetch the NLTK data packages read by the helpers below.
for nltk_package in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(nltk_package)

# Read back the CSV written by the preprocessing stage.
df_loaded = pd.read_csv('processed_job_data.csv')

# Sentence-embedding model, loaded by name through sentence-transformers.
bert_model = SentenceTransformer('all-MiniLM-L6-v2')

# Text-cleaning helpers shared by preprocess_text.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Return ``text`` lowercased, reduced to letters, stop-word-free and lemmatised.

    Non-string input (for example NaN from an empty CSV cell) yields ``""``.
    Relies on the module-level ``stop_words`` and ``lemmatizer``.
    """
    if isinstance(text, str):
        letters_only = re.sub(r'[^a-zA-Z\s]', '', text.lower())
        kept = []
        for token in nltk.word_tokenize(letters_only):
            if token not in stop_words:
                kept.append(lemmatizer.lemmatize(token))
        return ' '.join(kept)
    return ""

# Embed every row once up front so a query only has to embed the user's text.
# NOTE(review): `combined_text` comes from processed_job_data.csv; if that is the
# file written by the preprocessing cell earlier in this notebook, the text has
# already been through preprocess_text once and is cleaned a second time here —
# confirm this is intended.
df_loaded['cleaned_text'] = df_loaded['combined_text'].apply(preprocess_text)
# convert_to_tensor=True is requested; recommend_jobs later calls .cpu() on the result.
job_embeddings = bert_model.encode(df_loaded['cleaned_text'].tolist(), convert_to_tensor=True)

def recommend_jobs(user_profile_input, top_n=3):
    """Print the job postings whose embeddings are closest to a free-text profile.

    The profile is cleaned with ``preprocess_text``, embedded with the
    module-level ``bert_model`` and compared by cosine similarity against the
    precomputed ``job_embeddings``; rows are looked up in ``df_loaded``.

    Parameters
    ----------
    user_profile_input : str
        Free-text description of the candidate.
    top_n : int, optional
        How many postings to show. Defaults to 3, the previously fixed value.

    Returns
    -------
    None
        Results are printed, not returned.

    Raises
    ------
    ValueError
        If ``top_n`` is smaller than 1.
    """
    if top_n < 1:
        raise ValueError("top_n must be at least 1")

    processed_user_input = preprocess_text(user_profile_input)
    user_embedding = bert_model.encode([processed_user_input], convert_to_tensor=True)

    # Both tensors are moved to the CPU before being handed to scikit-learn.
    similarity_scores = cosine_similarity(user_embedding.cpu(), job_embeddings.cpu())[0]
    # argsort is ascending: reverse it and keep the best `top_n` positions.
    top_job_indices = np.argsort(similarity_scores)[::-1][:top_n]

    print(f"\n🔍 Top {top_n} Job Recommendations:")
    for rank, idx in enumerate(top_job_indices, start=1):
        # The scores are positional (row order of job_embeddings), so the row
        # is fetched with iloc rather than label-based loc.
        job = df_loaded.iloc[idx]
        similarity_percent = round(similarity_scores[idx] * 100, 2)

        # Empty descriptions come back from the CSV as NaN (a float), which
        # cannot be sliced; show them as an empty string instead of crashing.
        description = job['jobdescription']
        if not isinstance(description, str):
            description = ''
        # Add the ellipsis only when something was actually cut off.
        preview = description[:400] + ('...' if len(description) > 400 else '')

        print(f"\n🔹 Recommendation {rank} (Match: {similarity_percent}%):")
        print("Job Title:", job['jobtitle'])
        print("Job Description:", preview)
        print("Skills:", job['skills'])
        print("Education:", job['education'])

In [None]:
# Example query: prints the recommendations for a sample data-analyst profile.
recommend_jobs("I am a data analyst skilled in Python, SQL, and machine learning")

In [None]:
import pickle

# Save the job embeddings and the DataFrame they were computed from.
# The tensor is moved to the CPU before pickling: a tensor pickled while on a
# GPU records that device and cannot be unpickled on a machine without CUDA.
with open('job_embeddings.pkl', 'wb') as f:
    pickle.dump(job_embeddings.cpu(), f)

df_loaded.to_pickle('job_data_with_cleaned_text.pkl')


In [None]:
from IPython.display import FileLink, display

# Download links for the two saved artifacts.
# A notebook cell renders only its final expression, so each link is passed to
# display() explicitly; otherwise the first link would never be shown.
display(FileLink('job_embeddings.pkl'))
display(FileLink('job_data_with_cleaned_text.pkl'))


In [None]:
import pickle

# Save job embeddings, moved to the CPU first so the pickle can be loaded on a
# machine without CUDA.
with open('job_embeddings.pkl', 'wb') as f:
    pickle.dump(job_embeddings.cpu(), f)

# Save the dataframe. This overwrites the CSV that df_loaded was read from,
# now including the added `cleaned_text` column.
df_loaded.to_csv('processed_job_data.csv', index=False)


In [None]:
# prompt: how to save and import this model so I can use this in my web directly

# Write the preprocessed DataFrame to CSV without the index.
# NOTE(review): this writes `df` to processed_job_data.csv, the same path the
# preceding cell wrote `df_loaded` to — confirm which version should win.
df.to_csv('processed_job_data.csv', index=False)

# Pickle the fitted TF-IDF vectorizer.
with open('tfidf_vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(tfidf_vectorizer, vectorizer_file)

print("\nProcessed data and vectorizer saved.")


In [None]:
import pickle

# Save the BERT model with pickle.
# The path lives in one variable so the confirmation message cannot drift from
# the file actually written (it previously named 'bert_new_model.pkl').
bert_model_path = 'bert_model_new.pkl'
with open(bert_model_path, 'wb') as f:
    pickle.dump(bert_model, f)

print(f"BERT model saved successfully as '{bert_model_path}'")

In [None]:
from google.colab import files

# Colab helper: hand the pickled model file written above to files.download().
files.download('bert_model_new.pkl')