In [4]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import re

In [6]:
df = pd.read_excel('Resume Ranking Data set.xlsx')
print(df.head())

                             accomplishments_segment             degree  \
0                                                NaN  MS , B.TECH , PhD   
1                                                NaN  B.E , MS , M.TECH   
2  Successfully cleared and completed the course ...   MS , B.TECH , BS   
3                                                NaN      B.E , MS , BS   
4                                                NaN   MS , B.TECH , BS   

                                   education_segment  \
0                                                NaN   
1  M.Tech. Data Science\nJSS Science And Technolo...   
2  Degree\nInstitute\nBoard/University\nYear\nPer...   
3                                                NaN   
4                                                NaN   

                         emails  \
0    sutarvinayak2063@gmail.com   
1             pooj.vs@gmail.com   
2  khandelwal.ashwin5@gmail.com   
3                           NaN   
4                           NaN   

 

In [7]:
print(df.columns)

Index(['accomplishments_segment', 'degree', 'education_segment', 'emails',
       'job_titles', 'links', 'misc_segment', 'name', 'objectives_segment',
       'phone', 'projects_segment', 'skills', 'skills_segment', 'text',
       'university_0', 'university_1', 'university_2', 'university_3',
       'university_4', 'university_5', 'url', 'work_experience',
       'work_segment'],
      dtype='object')


In [8]:
# Clean and preprocess relevant text columns
def clean_text(text):
    if isinstance(text, str):
        text = text.lower()
        text = re.sub(r'\W', ' ', text)
        text = re.sub(r'\s+', ' ', text)
        return text
    return ''


In [9]:
# Fill NaN values with an empty string
df['skills'] = df['skills'].fillna('').apply(clean_text)
df['work_experience'] = df['work_experience'].fillna('').apply(clean_text)
df['education_segment'] = df['education_segment'].fillna('').apply(clean_text)
df['projects_segment'] = df['projects_segment'].fillna('').apply(clean_text)
df['job_titles'] = df['job_titles'].fillna('')
df['skills_segment'] = df['skills_segment'].fillna('').apply(clean_text)
df['work_segment'] = df['work_segment'].fillna('').apply(clean_text)

In [12]:
# Combine features into a single text field
df['combined_text'] = df['skills'] + ' ' + df['work_experience'] + ' ' + df['education_segment'] + ' ' + df['projects_segment'] + ' ' + df['skills_segment'] + ' ' + df['work_segment']

In [13]:
# Create a mapping for job titles to combine less frequent classes
def map_job_titles(title):
    if 'software engineer' in title:
        return 'software engineer'
    if 'developer' in title:
        return 'developer'
    if 'data scientist' in title or 'data analyst' in title:
        return 'data scientist'
    if 'manager' in title:
        return 'manager'
    if 'intern' in title:
        return 'intern'
    return 'other'

df['job_titles_mapped'] = df['job_titles'].apply(map_job_titles)


In [14]:
# Use 'job_titles_mapped' as the target variable
X = df['combined_text']
y = df['job_titles_mapped']

# Remove rows where the target variable is empty after filling NaNs
df = df[df['job_titles_mapped'] != '']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [15]:
# Convert text data to TF-IDF features
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


In [16]:
# Balancing the classes using SMOTE
smote = SMOTE(random_state=42)
X_train_tfidf_balanced, y_train_balanced = smote.fit_resample(X_train_tfidf, y_train)


In [17]:
# Train a Random Forest model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train_tfidf_balanced, y_train_balanced)


In [18]:
# Predict on the test set
y_pred_rf = rf_model.predict(X_test_tfidf)

In [19]:
# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Classification Report:\n", classification_report(y_test, y_pred_rf))

Accuracy: 0.5682539682539682
Classification Report:
                    precision    recall  f1-score   support

   data scientist       0.50      0.12      0.20         8
        developer       0.40      0.37      0.39        62
           intern       0.15      0.20      0.17        10
          manager       0.54      0.60      0.57        42
            other       0.70      0.79      0.74       134
software engineer       0.49      0.37      0.42        59

         accuracy                           0.57       315
        macro avg       0.46      0.41      0.42       315
     weighted avg       0.56      0.57      0.56       315



In [20]:
# Train the Gradient Boosting model
gb_model = GradientBoostingClassifier(random_state=42)
gb_model.fit(X_train_tfidf_balanced, y_train_balanced)

In [21]:
#predict on the test set
y_pred_gb = gb_model.predict(X_test_tfidf)

In [22]:
# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred_gb))
print("Classification Report:\n", classification_report(y_test, y_pred_gb))

Accuracy: 0.6634920634920635
Classification Report:
                    precision    recall  f1-score   support

   data scientist       0.67      0.50      0.57         8
        developer       0.50      0.52      0.51        62
           intern       0.50      0.40      0.44        10
          manager       0.70      0.62      0.66        42
            other       0.75      0.84      0.79       134
software engineer       0.62      0.53      0.57        59

         accuracy                           0.66       315
        macro avg       0.62      0.57      0.59       315
     weighted avg       0.66      0.66      0.66       315



In [23]:
from sklearn.svm import SVC

# Train an SVM model
svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X_train_tfidf_balanced, y_train_balanced)

# Predict on the test set
y_pred_svm = svm_model.predict(X_test_tfidf)

# Evaluate the model
print("SVM Accuracy:", accuracy_score(y_test, y_pred_svm))
print("SVM Classification Report:\n", classification_report(y_test, y_pred_svm))


SVM Accuracy: 0.5841269841269842
SVM Classification Report:
                    precision    recall  f1-score   support

   data scientist       0.50      0.12      0.20         8
        developer       0.44      0.44      0.44        62
           intern       0.14      0.20      0.17        10
          manager       0.56      0.57      0.56        42
            other       0.70      0.76      0.73       134
software engineer       0.56      0.47      0.51        59

         accuracy                           0.58       315
        macro avg       0.48      0.43      0.44       315
     weighted avg       0.58      0.58      0.58       315



#So we can see that the gradient boosting model gives us more accuracy than random forest as well as svm models

In [24]:
# Function to rank resumes based on a given job description
def rank_resumes(job_description, resumes_df, model, vectorizer):
    job_description_cleaned = clean_text(job_description)
    job_description_tfidf = vectorizer.transform([job_description_cleaned])

    resumes_tfidf = vectorizer.transform(resumes_df['combined_text'])
    similarities = cosine_similarity(job_description_tfidf, resumes_tfidf).flatten()

    # Predict job titles using the model
    predictions = model.predict(resumes_tfidf)
    resumes_df['predicted_job_title'] = predictions

    # Score resumes based on predicted relevance to the job title in the description
    # (assuming the job description contains the target job title, here 'data scientist')
    target_job_title = 'data scientist'  # Extract this dynamically if needed
    relevance_scores = resumes_df['predicted_job_title'].apply(lambda x: 1 if x == target_job_title else 0)

    resumes_df['similarity_score'] = similarities
    resumes_df['combined_score'] = resumes_df['similarity_score'] * 0.5 + relevance_scores * 0.5

    ranked_resumes = resumes_df.sort_values(by='combined_score', ascending=False)

    return ranked_resumes

# Example job description
job_description = "Data Scientist, Pune, India, XYZ Analytics, 3+ yrs exp, analyze large datasets, develop predictive models, collaborate cross-functional teams, present findings, Bachelor's/Master's in CS/Statistics/Math, Python/R/Java, strong analytical skills, excellent communication, experience with big data technologies/cloud platforms/data visualization tools preferred, competitive salary/benefits, career growth opportunities, flexible work hours/remote work options, send resume/cover letter to careers@xyzanalytics.com."

# Rank the resumes
ranked_resumes = rank_resumes(job_description, df, gb_model, vectorizer)
print(f"Ranked Resumes:\n{ranked_resumes[['name', 'job_titles', 'similarity_score', 'combined_score']].head(10)}")

Ranked Resumes:
                       name  \
186        Machine Learning   
185        Machine Learning   
595   Letterkenny Institute   
668          Waqqas Khusraw   
1340                  AI ML   
423               Rahul Jha   
1294         Data Scientist   
602                 AWS EC2   
375            MSc Computer   
285           Deep Learning   

                                             job_titles  similarity_score  \
186                     work from home , data scientist          0.148567   
185                     work from home , data scientist          0.148567   
595                       work from home , data analyst          0.126383   
668                         classifier , data scientist          0.123700   
1340  project manager , science manager , devops , p...          0.121388   
423   work from home , machine learning engineer , d...          0.120353   
1294  full time , learning specialist , data scienti...          0.113377   
602   quality assurance en