In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
import pdfplumber
import docx
import joblib
import re

In [2]:
# Build a single training corpus out of the three resume CSV exports.
# Each file is read separately so the per-source frames remain available
# under their original names (df1, df2, df3).
source_csv_paths = (
    'UpdatedResumeDataSet.csv',
    'updated_technical_resumesdataset.csv',
    'updated_non_technical_resumesdataset.csv',
)
df1, df2, df3 = map(pd.read_csv, source_csv_paths)

# Stack the rows of all three frames; ignore_index=True renumbers the rows
# 0..N-1 instead of keeping each file's own index.
combined_dataset = pd.concat((df1, df2, df3), ignore_index=True)

# Write the merged frame to disk; index=False keeps the row index out of the CSV.
combined_dataset.to_csv('combined_dataset_resume.csv', index=False)


In [3]:
# Reload the merged corpus from the CSV file written by the previous cell.
resume_data = pd.read_csv("combined_dataset_resume.csv")
# Bare last expression: the notebook renders the frame as the cell output.
resume_data

Unnamed: 0,Category,Resume,Job_Description,Score
0,WEB DEVELOPER,email campaign developer email campaign develo...,,
1,WEB DEVELOPER,front end web developer front end web develope...,,
2,WEB DEVELOPER,fullstack web developer fullstack web develope...,,
3,WEB DEVELOPER,senior full stack web developer senior full st...,,
4,WEB DEVELOPER,contractor contractor contractor fujifilm hold...,,
...,...,...,...,...
11357,PMO,AREA OF EXPERTISE (PROFILE) Around 10 plus yea...,,
11358,PMO,Skills Exceptional communication and networkin...,,
11359,PMO,CORE COMPETENCIES â¢ Maintain processes to en...,,
11360,PMO,AREA OF EXPERTISE (PROFILE) Around 10 plus yea...,,


In [7]:
# Keep every row whose Category is not exactly 'WEB DEVELOPER'.
# The comparison is case-sensitive, so differently-cased labels are untouched.
# NOTE(review): a title-case 'Web Developer' label still appears in the recorded
# Category listing further down - confirm whether those rows should go as well.
is_web_developer = resume_data['Category'].eq('WEB DEVELOPER')
resume_data1 = resume_data[~is_web_developer]
resume_data1

Unnamed: 0,Category,Resume,Job_Description,Score
200,Database Administrator,"Proficient in Optimization, Cloud Databases, S...",,
201,Database Administrator,"Proficient in Database Management, Troubleshoo...",,
202,Cybersecurity Analyst,"Proficient in Penetration Testing, Incident Re...",,
203,Software Engineer,"Proficient in Software Development, Java, Pyth...",,
204,Machine Learning Engineer,"Proficient in Python, Data Science, Data Analy...",,
...,...,...,...,...
11357,PMO,AREA OF EXPERTISE (PROFILE) Around 10 plus yea...,,
11358,PMO,Skills Exceptional communication and networkin...,,
11359,PMO,CORE COMPETENCIES â¢ Maintain processes to en...,,
11360,PMO,AREA OF EXPERTISE (PROFILE) Around 10 plus yea...,,


In [8]:
# List the distinct labels, in order of first appearance, of the filtered frame
# (resume_data1) - the frame the features and targets are built from - rather
# than the unfiltered resume_data, so the listing reflects the classes the
# model is actually trained on.
resume_data1["Category"].unique()

array(['Database Administrator', 'Cybersecurity Analyst',
       'Software Engineer', 'Machine Learning Engineer', 'Web Developer',
       'Systems Analyst', 'AI Researcher', 'Data Analyst',
       'Cloud Architect', 'AI Specialist', 'Robotics Engineer',
       'Data Science', 'Web Designing', 'Java Developer', 'SAP Developer',
       'Automation Testing', 'Electrical Engineering', 'Python Developer',
       'DevOps Engineer', 'Network Security Engineer', 'Database',
       'Hadoop', 'ETL Developer', 'DotNet Developer', 'Blockchain',
       'Testing', 'Fitness Coach', 'Physician', 'Financial Analyst',
       'Supply Chain Manager', 'Architect', 'Operations Manager',
       'Urban Planner', 'Personal Trainer', 'Biomedical Engineer',
       'Nurse', 'Product Manager', 'Content Writer', 'Pharmacist', 'Chef',
       'Psychologist', 'Civil Engineer', 'Accountant', 'Graphic Designer',
       'Dentist', 'Pilot', 'UX Designer', 'Teacher', 'HR Specialist',
       'Veterinarian', 'Environmental 

In [9]:
# Turn each resume text into a TF-IDF vector, keeping at most 5000 vocabulary terms.
# NOTE(review): the vectorizer is fitted on every row here, before the
# train/test split done in a later cell, so its vocabulary and IDF weights also
# see the held-out resumes - consider fitting on the training texts only.
tfidf = TfidfVectorizer(max_features=5000)
# .toarray() converts the sparse TF-IDF matrix into a dense NumPy array
# (one row per resume, one column per kept term).
x = tfidf.fit_transform(resume_data1['Resume']).toarray()
# Target labels, selected by column name instead of by position so that a
# change in column order cannot silently swap in a different column.
y = resume_data1['Category']

In [10]:
# Display the dense TF-IDF feature matrix built in the previous cell.
x

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [11]:
# Display the label Series that pairs with the rows of x.
y

200         Database Administrator
201         Database Administrator
202          Cybersecurity Analyst
203              Software Engineer
204      Machine Learning Engineer
                   ...            
11357                          PMO
11358                          PMO
11359                          PMO
11360                          PMO
11361                          PMO
Name: Category, Length: 10962, dtype: object

In [13]:
# Summarise the filtered frame: index range, per-column non-null counts, dtypes and memory use.
resume_data1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10962 entries, 200 to 11361
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Category         10962 non-null  object 
 1   Resume           10962 non-null  object 
 2   Job_Description  0 non-null      float64
 3   Score            0 non-null      float64
dtypes: float64(2), object(2)
memory usage: 428.2+ KB


In [14]:
# Hold out 20% of the rows for evaluation. random_state pins the shuffle so the
# split - and every score computed from it - is identical on each run.
# NOTE(review): some classes are small (the recorded report shows test supports
# as low as 2); stratify=y would preserve class ratios but raises if a class has
# a single row - confirm class sizes before enabling it.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

In [15]:
# Random forest of 300 trees, each capped at depth 20. random_state fixes the
# bootstrap sampling and per-split feature sub-sampling so that retraining on
# the same data reproduces the same model (and the same persisted artifact).
rf = RandomForestClassifier(n_estimators=300, max_depth=20, random_state=42)
# Fit on the training split; as the last expression, the fitted estimator is
# also shown as the cell output.
rf.fit(x_train, y_train)

In [16]:
# Mean accuracy, as a percentage, on the training split and on the held-out
# split (in that order). The global names are kept exactly as they were,
# including the spelling of TestingAccuraccy, in case other cells read them.
TrainingAccuracy, TestingAccuraccy = (
    100 * rf.score(features, labels)
    for features, labels in ((x_train, y_train), (x_test, y_test))
)
print(f"Training accuracy : {TrainingAccuracy:.2f}")
print(f"Testing accuracy : {TestingAccuraccy:.2f}")

Training accuracy : 97.98
Testing accuracy : 97.67


In [17]:
import pickle

In [18]:
# Serialise the fitted random forest to disk. The file is opened in binary
# write mode and closed automatically when the with-block exits.
with open("combined_job_predict_model1.pkl", "wb") as model_file:
    pickle.dump(rf, model_file)

In [19]:
# Serialise the fitted TF-IDF vectorizer to disk, so that new text can be
# transformed with the same vocabulary and weights the model was trained on.
with open("combined_tfidf_vectorizer1.pkl", "wb") as vectorizer_file:
    pickle.dump(tfidf, vectorizer_file)

In [20]:
from sklearn.metrics import classification_report

# Predict the held-out split and print per-class precision / recall / F1 / support.
y_pred = rf.predict(x_test)
# zero_division=0: a class the model never predicts has an undefined precision.
# Reporting it as 0 explicitly yields the same numbers as the default setting
# but without the repeated undefined-metric warnings in the cell output.
print(classification_report(y_test, y_pred, zero_division=0))


                                 precision    recall  f1-score   support

                  AI Researcher       1.00      1.00      1.00        30
                  AI Specialist       0.96      0.87      0.92        31
                     Accountant       1.00      1.00      1.00        40
                       Advocate       1.00      0.50      0.67         2
                      Architect       1.00      1.00      1.00        32
                           Arts       0.00      0.00      0.00        10
             Automation Testing       1.00      1.00      1.00         6
            Biomedical Engineer       1.00      1.00      1.00        43
                     Blockchain       1.00      1.00      1.00         8
               Business Analyst       1.00      0.98      0.99        41
                           Chef       1.00      1.00      1.00        33
                 Civil Engineer       1.00      1.00      1.00        49
                Cloud Architect       1.00      1.

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
