In [1]:
import pandas as pd
import re
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score
from scipy.sparse import hstack
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk

# Download the NLTK data packages this notebook relies on: the 'punkt'
# tokenizer models plus the 'stopwords' and 'wordnet' corpora.
# presumably these back word_tokenize, stopwords.words and WordNetLemmatizer
# as used in clean_text — verify against the installed NLTK version.
nltk.download('punkt')
nltk.download('stopwords')
# Last expression of the cell, so its return value is echoed as the cell output.
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [4]:
# Read the resume corpus and the job-description corpus from CSV
resumes = pd.read_csv('/content/UpdatedResumeDataSet.csv')
job_descriptions = pd.read_csv('/content/jd.csv')

# Pair every resume with the job postings whose 'Job Title' equals the
# resume's 'Category' (the join is NOT on 'Job Id'; pandas' default join
# type is used).
data = resumes.merge(
    job_descriptions,
    left_on='Category',
    right_on='Job Title',
)

In [5]:
# Preview the first rows of the merged resume / job-description frame.
data.head()

Unnamed: 0,Category,Resume,Job Id,Experience,Qualifications,Salary Range,location,Country,latitude,longitude,...,Contact,Job Title,Role,Job Portal,Job Description,Benefits,skills,Responsibilities,Company,Company Profile
0,Mechanical Engineer,Education Details \r\nMay 1999 to September 20...,1914121205954290,3 to 15 Years,MCA,$65K-$122K,Kingston,Jamaica,18.1096,-77.2975,...,200-851-9382,Mechanical Engineer,Mechanical Design Engineer,Stack Overflow Jobs,Mechanical Design Engineers create and develop...,"{'Tuition Reimbursement, Stock Options or Equi...","Mechanical engineering CAD software (e.g., Sol...","Design mechanical systems, components, and pro...",ThyssenKrupp AG,"{""Sector"":""Manufacturing"",""Industry"":""Industri..."
1,Mechanical Engineer,Education Details \r\nMay 1999 to September 20...,2023971849029140,2 to 12 Years,B.Tech,$58K-$121K,Vaduz,Liechtenstein,47.166,9.5554,...,+1-941-947-1714x563,Mechanical Engineer,Mechanical Design Engineer,LinkedIn,Mechanical Design Engineers create and develop...,"{'Tuition Reimbursement, Stock Options or Equi...","Mechanical engineering CAD software (e.g., Sol...","Design mechanical systems, components, and pro...",Shriram Transport Finance Company,"{""Sector"":""Financial Services"",""Industry"":""Fin..."
2,Mechanical Engineer,Education Details \r\nMay 1999 to September 20...,1146610732516600,1 to 8 Years,PhD,$60K-$101K,Brazzaville,Republic Of Congo,-0.228,15.8277,...,+1-926-881-3073x64674,Mechanical Engineer,Manufacturing Engineer,FlexJobs,Manufacturing Engineers optimize manufacturing...,"{'Life and Disability Insurance, Stock Options...",Manufacturing processes knowledge CAD/CAM soft...,Improve manufacturing processes and production...,Downer Group,"{""Sector"":""Construction/Infrastructure"",""Indus..."
3,Mechanical Engineer,Education Details \r\nMay 1999 to September 20...,521701516123769,3 to 11 Years,BCA,$56K-$119K,Kabul,Afghanistan,33.9391,67.71,...,(559)709-8417x77725,Mechanical Engineer,Mechanical Design Engineer,The Muse,Mechanical Design Engineers create and develop...,"{'Casual Dress Code, Social and Recreational A...","Mechanical engineering CAD software (e.g., Sol...","Design mechanical systems, components, and pro...",Continental AG,"{""Sector"":""Automotive"",""Industry"":""Automotive""..."
4,Mechanical Engineer,Education Details \r\nMay 1999 to September 20...,118698764050766,4 to 8 Years,M.Com,$62K-$99K,Torshavn,Faroe Islands,61.8926,-6.9118,...,840.565.9663x37863,Mechanical Engineer,Manufacturing Engineer,The Muse,Manufacturing Engineers optimize manufacturing...,"{'Flexible Spending Accounts (FSAs), Relocatio...",Manufacturing processes knowledge CAD/CAM soft...,Improve manufacturing processes and production...,China COSCO Shipping Corporation,"{""Sector"":""Shipping and Logistics"",""Industry"":..."


3. Data Preprocessing

In [14]:
# Filled on first use so the stopword corpus is read and the lemmatizer is
# constructed once, instead of once per cleaned document.
_TEXT_CLEANING_CACHE = {}


def _get_cleaning_resources():
    """Returns the (stop word set, lemmatizer) pair, creating it on first use."""
    if not _TEXT_CLEANING_CACHE:
        _TEXT_CLEANING_CACHE['stop_words'] = frozenset(stopwords.words('english'))
        _TEXT_CLEANING_CACHE['lemmatizer'] = WordNetLemmatizer()
    return _TEXT_CLEANING_CACHE['stop_words'], _TEXT_CLEANING_CACHE['lemmatizer']


def clean_text(text):
    """Cleans text by removing special characters, stopwords, and applying lemmatization.

    Args:
        text: Value to clean. It is passed through ``str()`` first, so
            non-string values are cleaned via their string form.

    Returns:
        A single string of lower-cased, lemmatized tokens joined by spaces,
        with English stopwords removed.
    """
    stop_words, lemmatizer = _get_cleaning_resources()

    # Keep only ASCII letters, digits and whitespace.
    # NOTE(review): other characters are deleted rather than replaced by a
    # space, so "Python-based" becomes the single token "pythonbased".
    text = re.sub(r'[^a-zA-Z0-9\s]', '', str(text))
    tokens = word_tokenize(text.lower())  # Tokenize and convert to lowercase

    # Drop stopwords first, then lemmatize what is left.
    return ' '.join(
        lemmatizer.lemmatize(token) for token in tokens if token not in stop_words
    )

# Add a cleaned counterpart for each raw text column
for raw_col, cleaned_col in (
    ('Resume', 'cleaned_resume'),
    ('Job Description', 'cleaned_jd'),
):
    data[cleaned_col] = data[raw_col].apply(clean_text)


In [15]:
# Inspect the cleaned resume text produced by clean_text.
data['cleaned_resume']

Unnamed: 0,cleaned_resume
0,education detail may 1999 september 2002 diplo...
1,education detail may 1999 september 2002 diplo...
2,education detail may 1999 september 2002 diplo...
3,education detail may 1999 september 2002 diplo...
4,education detail may 1999 september 2002 diplo...
...,...
7219,education detail august 2000 electronics pune ...
7220,education detail august 2000 electronics pune ...
7221,education detail august 2000 electronics pune ...
7222,education detail august 2000 electronics pune ...


4. Feature Engineering
Jaccard Similarity

In [16]:
def jaccard_similarity(text1, text2):
    """Returns the Jaccard index of the two texts' whitespace-separated token sets.

    The score is the number of distinct tokens shared by both texts divided by
    the number of distinct tokens found in either one. When neither text
    contains a token the result is 0.
    """
    tokens_a, tokens_b = set(text1.split()), set(text2.split())
    combined = tokens_a | tokens_b
    if not combined:
        return 0
    return len(tokens_a & tokens_b) / len(combined)


Keyword Match Score

In [17]:
def keyword_match_score(resume, jd):
    """Counts the distinct whitespace-separated tokens that appear in both texts."""
    shared_tokens = set(resume.split()) & set(jd.split())
    return len(shared_tokens)


Generate Feature Columns

In [18]:
# Token counts of the cleaned texts
for text_col, count_col in (
    ('cleaned_resume', 'resume_word_count'),
    ('cleaned_jd', 'jd_word_count'),
):
    data[count_col] = data[text_col].apply(lambda text: len(text.split()))

# Token-overlap features for each resume / job-description pair
for feature_col, score_fn in (
    ('jaccard_score', jaccard_similarity),
    ('keyword_match', keyword_match_score),
):
    data[feature_col] = data.apply(
        lambda row, fn=score_fn: fn(row['cleaned_resume'], row['cleaned_jd']),
        axis=1,
    )


In [19]:
# Inspect the per-row Jaccard similarity between cleaned resume and cleaned job description.
data['jaccard_score']

Unnamed: 0,jaccard_score
0,0.013575
1,0.013575
2,0.015909
3,0.013575
4,0.015909
...,...
7219,0.009950
7220,0.014950
7221,0.009950
7222,0.009950


In [20]:
# Inspect the per-row count of distinct tokens shared by resume and job description.
data['keyword_match']

Unnamed: 0,keyword_match
0,6
1,6
2,7
3,6
4,7
...,...
7219,6
7220,9
7221,6
7222,6


5. TF-IDF Vectorization and Cosine Similarity

In [22]:
# TF-IDF vectorizer limited to a 3000-term vocabulary
tfidf = TfidfVectorizer(max_features=3000)

# The vocabulary and IDF weights are learned from the resumes only; the job
# descriptions are then projected onto that same vocabulary.
# NOTE(review): terms that occur only in job descriptions are therefore
# dropped — confirm this is intended.
resume_tfidf = tfidf.fit_transform(data['cleaned_resume'])
jd_tfidf = tfidf.transform(data['cleaned_jd'])

# Cosine similarity between each resume and the job description in the same row
cosine_sim = []
for row_idx in range(len(data)):
    pair_sim = cosine_similarity(resume_tfidf[row_idx], jd_tfidf[row_idx])
    cosine_sim.append(pair_sim[0][0])
data['cosine_similarity'] = cosine_sim


In [23]:
data['cosine_similarity']

Unnamed: 0,cosine_similarity
0,0.143751
1,0.143751
2,0.206549
3,0.143751
4,0.206549
...,...
7219,0.048791
7220,0.033168
7221,0.048791
7222,0.048791


6. Combine Features

In [24]:
# Combine the hand-crafted numeric features into a dense matrix with one
# column per feature (word counts, Jaccard score, keyword matches, cosine
# similarity).
X_features = np.column_stack((
    data['resume_word_count'],
    data['jd_word_count'],
    data['jaccard_score'],
    data['keyword_match'],
    data['cosine_similarity']
))

# Combine TF-IDF vectors with numerical features.
# Column layout: resume TF-IDF | job-description TF-IDF | numeric features.
X_tfidf = hstack([resume_tfidf, jd_tfidf])
X = hstack([X_tfidf, X_features])

# Define the target variable
# Example: converting preference to binary target (1 when 'Preference' is
# 'female', case-insensitively; 0 otherwise). Casting to str first means
# missing or non-string values map to 0 instead of raising AttributeError
# on `.lower()`.
# NOTE(review): this label comes from the posting's 'Preference' column, not
# from any resume/JD match quality — confirm it is the intended target for an
# "ATS score".
y = data['Preference'].astype(str).str.lower().eq('female').astype('int64')


7. Train-Test Split

In [25]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


8. Train Linear Regression Model

In [27]:
# Fit a linear regression on the training split.
# NOTE(review): y is built as a 0/1 indicator, so the regressor's predictions
# are unbounded real values rather than probabilities — confirm a regressor
# (rather than a classifier) is intended.
model = LinearRegression()
# Last expression of the cell, so the fitted estimator is also displayed.
model.fit(X_train, y_train)


9. Evaluate the Model

In [28]:
# Score the held-out split with both regression metrics
y_pred = model.predict(X_test)
mae, r2 = (metric(y_test, y_pred) for metric in (mean_absolute_error, r2_score))

# Report both metrics rounded to two decimals
print(f'Mean Absolute Error: {mae:.2f}')
print(f'R-squared Score: {r2:.2f}')


Mean Absolute Error: 0.43
R-squared Score: 0.05


10. Predict ATS Score for New Resume-JD Pair

In [29]:
def predict_ats_score(resume_text, jd_text):
    """Scores a resume against a job description with the fitted model.

    Both texts go through the same cleaning, hand-crafted features and TF-IDF
    transform used to build the training matrix; the resulting single-row
    feature vector is passed to the notebook-level ``model``.

    Args:
        resume_text: Raw resume text.
        jd_text: Raw job description text.

    Returns:
        The first (and only) element of ``model.predict`` for the pair.
    """
    resume_clean = clean_text(resume_text)
    jd_clean = clean_text(jd_text)

    # Sparse TF-IDF rows from the already-fitted vectorizer
    resume_vec = tfidf.transform([resume_clean])
    jd_vec = tfidf.transform([jd_clean])

    # Dense features in the same column order as the training matrix:
    # resume word count, JD word count, Jaccard score, keyword matches,
    # cosine similarity
    dense_features = np.array([[
        len(resume_clean.split()),
        len(jd_clean.split()),
        jaccard_similarity(resume_clean, jd_clean),
        keyword_match_score(resume_clean, jd_clean),
        cosine_similarity(resume_vec, jd_vec)[0][0],
    ]])

    # Layout mirrors training: resume TF-IDF | JD TF-IDF | dense features
    model_input = hstack([resume_vec, jd_vec, dense_features])
    return model.predict(model_input)[0]


11. Example Prediction

In [31]:
# Sample resume for the demo prediction. The name, phone number and e-mail
# address are placeholders so that no real personal contact data is stored in
# the notebook.
new_resume = """
JANE DOE
Mississauga, ON | 555-555-0100 | jane.doe@example.com

OVERVIEW
A software developer and data analyst with 3 years of experience in Python API development, data modeling, and automation, seeking an opportunity to contribute my coding expertise to train AI models. Adept at writing, evaluating, and troubleshooting code across various platforms. Passionate about AI, I am eager to collaborate with innovative teams and help refine generative AI models through human feedback. With strong problem-solving skills and an ability to articulate technical concepts clearly, I aim to provide meaningful contributions to improve AI performance.

TECHNICAL & SOFT SKILLS
Languages: Python, Java, JavaScript, C++, SQL
Frameworks & Tools: Flask, Django, Pandas, NumPy, Scikit-learn, OAuth, Postman
Databases: PostgreSQL, MySQL, MongoDB
Data Visualization: Power BI, Tableau, Matplotlib, Seaborn
Collaboration Tools: Git, Advanced REST Client
Other Skills: API development, Data Cleaning & Automation, Debugging, Critical Thinking
Soft Skills: Communication, Problem-Solving, Attention to Detail, Adaptability, Collaboration
RELEVANT EXPERIENCE
Associate Data Analyst / Python Developer
Skoruz Technologies Ltd., Bangalore, India
Jul 2020 – Jun 2023

API Development and Integration: Developed and tested Python-based APIs, ensuring smooth data flow between systems and collaborating with multi-disciplinary teams to deliver effective solutions.
Data Analysis and Workflow Automation: Automated data pipelines and workflows with Python, streamlining processes and enhancing efficiency.
Code Debugging and Quality Assurance: Identified and resolved issues in APIs and data pipelines using critical thinking and attention to detail, significantly reducing downtime.
Technical Documentation: Maintained accurate documentation of API integrations and automation solutions, improving system traceability and reproducibility.
Key Achievements:

Developed automated dashboards that reduced manual reporting time by 50%.
Collaborated with cross-functional teams, facilitating seamless integration of multiple platforms.
Licensed Server
City of Mississauga - Recreation Division, Mississauga, ON
Aug 2024 – Present

Managed event setup and cleanup for recreational events, ensuring smooth service delivery.
Accurately processed orders and payments, adhering to safety guidelines and enhancing the customer experience.
Warehouse Associate
Leon’s Furniture, Mississauga, ON
Oct 2023 – Present

Monitored inventory levels and processed shipments to maintain operational efficiency.
Collaborated with team members to ensure timely order fulfillment and dispatch.
EDUCATION
Lambton College, Mississauga
Postgraduate Diploma in Big Data Analytics
Sep 2023 – Present

Sastra University
Bachelor of Technology in Civil Engineering
Jun 2015 – May 2019

PROJECTS AND RELEVANT EXPERIENCE
Training and Evaluating Code: Developed experience debugging and reviewing code across Python and other languages during my tenure at Skoruz Technologies.
Human Feedback for AI Models: Regularly provided feedback on automated data pipelines and implemented troubleshooting solutions, improving the reliability of data models.

"""
# Sample job description used for the demo prediction below.
new_jd = """
Outlier helps the world’s most innovative companies improve their AI models by providing human feedback. Are you an experienced software engineer who would like to lend your coding expertise to train AI models?

We partner with organizations to train AI large language models, helping cutting-edge generative AI models write better code. Projects typically include discrete, highly variable problems that involve engaging with these models as they learn to code. There is no requirement for previous AI experience.

About the opportunity:

Outlier is looking for talented coders to help train generative artificial intelligence models
This freelance opportunity is remote and hours are flexible, so you can work whenever is best for you
You may contribute your expertise by…

Crafting and answering questions related to computer science in order to help train AI models
Evaluating and ranking code generated by AI models
Examples of desirable expertise:

Currently enrolled in or completed a bachelor's degree or higher in computer science at a selective institution
Proficiency working with one or more of the the following languages: Java, Python, JavaScript / TypeScript, C++
Ability to articulate complex concepts fluently in English
Excellent attention to detail, including grammar, punctuation, and style guidelines
"""

# Score the sample resume against the sample job description and print the
# result rounded to two decimals.
predicted_score = predict_ats_score(new_resume, new_jd)
print(f'Predicted ATS Score: {predicted_score:.2f}')


Predicted ATS Score: 1.67
