In [5]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report


In [6]:
# Download the NLTK resources used by the preprocessing cell below.
# 'punkt_tab' is included because recent NLTK releases make word_tokenize load
# it instead of the pickled 'punkt' models; 'punkt' is kept for older releases.
# 'omw-1.4' is included because some NLTK releases need it to load WordNet.
NLTK_RESOURCES = (
    'punkt',
    'punkt_tab',
    'stopwords',
    'wordnet',
    'omw-1.4',
)
for resource in NLTK_RESOURCES:
    nltk.download(resource)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Sam\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Sam\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Sam\AppData\Roaming\nltk_data...


True

In [18]:
# Read the two input tables (resumes and job postings) from their CSV files.
# Paths are relative to the notebook's working directory.
RESUMES_CSV = "./data/Resume/Resume.csv"
JOB_DESCRIPTIONS_CSV = "./data/job_descriptions.csv"

resumes_df = pd.read_csv(RESUMES_CSV)
job_descriptions_df = pd.read_csv(JOB_DESCRIPTIONS_CSV)

In [19]:
# Data Preprocessing
# Lazily-built NLTK helpers shared by every preprocess_text call, so the
# lemmatizer and stop-word list are created once instead of once per row.
_NLP_RESOURCES = {}


def _get_nlp_resources():
    """Return the cached (lemmatizer, stop_words) pair, building it on first use."""
    if not _NLP_RESOURCES:
        _NLP_RESOURCES['lemmatizer'] = WordNetLemmatizer()
        _NLP_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
    return _NLP_RESOURCES['lemmatizer'], _NLP_RESOURCES['stop_words']


def preprocess_text(text):
    """Clean a single document and return it as a space-joined token string.

    Steps: replace line breaks with spaces, tokenize with ``word_tokenize``,
    lemmatize each token with ``WordNetLemmatizer``, then drop tokens whose
    lowercase form is an English stop word. Token case is otherwise preserved.

    Parameters
    ----------
    text : str or scalar
        Raw document text. Missing values (``None``, ``NaN``, ``pd.NA``)
        yield an empty string; other non-string scalars are converted
        with ``str()`` before processing.

    Returns
    -------
    str
        The remaining tokens joined by single spaces; ``''`` when the input
        is missing, empty, or whitespace-only.
    """
    # CSV cells with no content arrive as NaN, which has no .replace().
    if not isinstance(text, str):
        if pd.isna(text):
            return ''
        text = str(text)

    # Turn every line-break style into a space so a lone '\r' cannot glue
    # two neighbouring words together.
    text = text.replace('\r\n', ' ').replace('\n', ' ').replace('\r', ' ').strip()
    if not text:
        return ''

    lemmatizer, stop_words = _get_nlp_resources()

    # Tokenization
    tokens = word_tokenize(text)

    # Lemmatization
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokens]

    # Stop Word Removal (case-insensitive match against the stop-word list)
    filtered_tokens = [word for word in lemmatized_tokens if word.lower() not in stop_words]

    return ' '.join(filtered_tokens)

# Run the cleaning function over the raw text column of each table and store
# the result in a new column next to the original.
raw_resume_text = resumes_df['Resume_str']
resumes_df['Processed_Resume'] = raw_resume_text.apply(preprocess_text)

raw_job_text = job_descriptions_df['Job Description']
job_descriptions_df['Processed_Job_Description'] = raw_job_text.apply(preprocess_text)


In [21]:
# Feature Extraction (TF-IDF)
# The vectorizer is fitted on the resumes only; the job descriptions are then
# transformed with that same fitted vectorizer.
tfidf_vectorizer = TfidfVectorizer(
    max_features=1000,
    lowercase=True,
    stop_words='english',
)

resume_corpus = resumes_df['Processed_Resume']
job_description_corpus = job_descriptions_df['Processed_Job_Description']

X_resumes = tfidf_vectorizer.fit_transform(resume_corpus)
X_job_descriptions = tfidf_vectorizer.transform(job_description_corpus)