# Install the Required libraries

In [None]:
!pip install python-docx PyPDF2

Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading python_docx-1.1.2-py3-none-any.whl (244 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.3/244.3 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-docx, PyPDF2
Successfully installed PyPDF2-3.0.1 python-docx-1.1.2


# Code To Extract Text From Resumes And Creating DataFrame

In [None]:
import os
import pandas as pd
from PyPDF2 import PdfReader
from docx import Document

# Function to extract text from PDF
def extract_text_from_pdf(pdf_path):
    try:
        reader = PdfReader(pdf_path)
        text = ""
        for page in reader.pages:
            text += page.extract_text()
        text = text.replace("\t", " ").replace("\n", " ")
        text = text.lower()
        return text
    except:
        return "error with PDF file"

# Function to extract text from DOCX
def extract_text_from_docx(docx_path):
    try:
        doc = Document(docx_path)
        text = " ".join([para.text for para in doc.paragraphs])
        text = text.replace("\t", " ").replace("\n", " ")
        text = text.lower()

        return text
    except:
        return "Error with Doc File"


# Main function to process resumes
def process_resumes(folder_path):
    data = []
    for file in os.listdir(folder_path):
        file_path = os.path.join(folder_path, file)
        if file.endswith('.pdf'):
            text = extract_text_from_pdf(file_path)
        elif file.endswith(('.docx', '.doc')):
            text = extract_text_from_docx(file_path)
        else:
            continue

        # Extract the file name without the extension
        file_name = os.path.splitext(file)[0]

        # Append the text and file name to the data list
        data.append({'Text': text, 'File Name': file_name})

    return pd.DataFrame(data)

#Calling the Function
folder_path = "/content/drive/MyDrive/NLP/Project/Resumes"
df = process_resumes(folder_path)
print(df)


                                                 Text                File Name
0   k a n d a c e  l o u d o r d a t a  s c i e n ...     KandaceLoudor_Resume
1   trish mathers entry-level data scientist innov...      TrishMathers_Resume
2   terrence coleman tcoleman@email.com (123) 456-...   TerrenceColeman_Resume
3   j a c k s o n m a c a r t h u r w e b  d e v e...  JacksonMacarthur_Resume
4   w i n s t o n   r o s e n b e r g w e b  d e v...  WinstonRosenberg_Resume
5   sunshine smith web developer contact s_smith@e...     SunshineSmith_Resume
6   thalassa voss etl tester contact t.voss@email....      ThalassaVoss_Resume
7   lysander hale penetration tester l.hale@email....      LysanderHale_Resume
8   em ery harrison qa tester contact e.harrison@e...     EmeryHarrison_Resume
9    career objective a future-driven and methodic...      TylerRussell_Resume
10  yasmin patel data scientist intern  y.patel@em...       YasminPatel_Resume
11  jeong wu kim entry-level web developer softwar..

# Saving the Dataframe

In [None]:
df.to_csv('resumes_text.csv', index=False)

# Predefined Skills List .
### we are enterning skills by our own which is not a well efficient process but we are focusing in the project flow, once it is done we can go with LLM to automate this process.

In [None]:
skill_list = [
    skill.strip().lower()
    for skill in """
        Mathematics
        Statistics
        Linear Algebra
        Calculus
        Probability
        Statistical Inference
        Python
        Data Wrangling
        Data Cleaning
        Data Transformation
        Data Preparation
        Data Visualization
        SQL
        Machine Learning
        Linear Regression
        Logistic Regression
        Decision Trees
        Statistical Modeling
        Data Mining
        Tableau
        Power BI
        Hadoop
        Spark
        Deep Learning
        Neural Networks
        Natural Language Processing
        Cloud Computing
        AWS
        Azure
        GCP
        Experiment Design
        A/B Testing
        Model Deployment
        NumPy
        Pandas
        SciPy
        Scikit-learn
        Matplotlib
        Seaborn
        TensorFlow
        Keras
        PyTorch
        NLTK
        SpaCy
        HTML
        CSS
        JavaScript
        Git
        React
        Angular
        Vue.js
        Node.js
        Python
        Ruby
        Responsive Design
        Testing
        Debugging
        State Management
        Performance Optimization
        Accessibility
        Express.js
        Django
        Ruby on Rails
        Database Management
        DevOps
        CI/CD
        Security
        Bootstrap
        Tailwind CSS
        Sass/SCSS
        Webpack
        Babel
        RESTful APIs
        GraphQL
        WebSockets
        Docker
        Kubernetes
        MongoDB
        PostgreSQL
        MySQL
        Software Testing Fundamentals
        Test Case Design
        Test Execution
        Bug Reporting
        Black-Box Testing
        White-Box Testing
        Gray-Box Testing
        Jira
        TestRail
        Domain Knowledge
        Agile Testing
        Test Strategy
        Test Planning
        SDLC
        STLC
        Exploratory Testing
        Usability Testing
        User Acceptance Testing
        Test Management Tools
        Programming Languages
        Test Automation Frameworks
        API Testing Tools
        Version Control
        Design Patterns
        Test Data Management
        Cross-Browser Testing
        Mobile Testing
        Performance Testing Tools
        Load Testing
        Stress Testing
        Endurance Testing
        Performance Monitoring Tools
        Capacity Planning
        Network Protocols
        Security Testing Methodologies
        Vulnerability Scanning Tools
        Penetration Testing
        Web Application Security Testing
        Network Security Testing
        Code Analysis Tools
        Ethical Hacking Concepts
        Mobile Testing Frameworks
        Mobile Device Testing
        Mobile Network Testing
        Mobile Usability Testing
        Mobile Performance Testing
        API Contract Testing
        API Security Testing
        Data Validation
        Data Integrity Testing
        Performance Testing of Databases
        Kali Linux
        Sleuth Kit
        Metasploit Framework
        Nmap
        QualysGuard
        Exploit-DB
        Aircrack-ng
        OWASP WebScarab
        Sleuth Kit
        Kali Linux
        VMware Workstation
        Zephyr
        Appium
        Java
        Gherkin
        Bugzilla
        Jenkins
        LoadRunner
        Burp Suite
        SQL
        Scrum
        TestNG
        JIRA
        Jenkins
        Agile
        Source Versioning
        JUnit
        Scrum
        tester
    """.split("\n")
    if skill.strip()
]


# Spacy Model we are using to extract skills by comparing it with the list we have.

In [None]:
import spacy

# Load spaCy's English language model
nlp = spacy.load("en_core_web_sm")

# Function to extract skills using spaCy
def spacy_skill_extraction(text, skill_list):
    doc = nlp(text)
    extracted_skills = [token.text.lower() for token in doc if token.text.lower() in skill_list]
    return extracted_skills

df['Spacy_Extracted_Skills'] = df['Text'].apply(lambda x: spacy_skill_extraction(x, skill_list))
print(df)


                                                 Text  \
0   k a n d a c e  l o u d o r d a t a  s c i e n ...   
1   trish mathers entry-level data scientist innov...   
2   terrence coleman tcoleman@email.com (123) 456-...   
3   j a c k s o n m a c a r t h u r w e b  d e v e...   
4   w i n s t o n   r o s e n b e r g w e b  d e v...   
5   sunshine smith web developer contact s_smith@e...   
6   thalassa voss etl tester contact t.voss@email....   
7   lysander hale penetration tester l.hale@email....   
8   em ery harrison qa tester contact e.harrison@e...   
9    career objective a future-driven and methodic...   
10  yasmin patel data scientist intern  y.patel@em...   
11  jeong wu kim entry-level web developer softwar...   
12  xander clemmons xclemm@email.com (123) 456-789...   
13    work experience tester splunk  2017 - curren...   

                  File Name                             Spacy_Extracted_Skills  
0      KandaceLoudor_Resume  [statistics, python, numpy, pandas

In [None]:
df['Spacy_Extracted_Skills'] = df['Spacy_Extracted_Skills'].apply(lambda x: ', '.join(x))

In [None]:
df

Unnamed: 0,Text,File Name,Spacy_Extracted_Skills
0,k a n d a c e l o u d o r d a t a s c i e n ...,KandaceLoudor_Resume,"statistics, python, numpy, pandas, keras, sql,..."
1,trish mathers entry-level data scientist innov...,TrishMathers_Resume,"sql, statistics, mathematics, calculus, calcul..."
2,terrence coleman tcoleman@email.com (123) 456-...,TerrenceColeman_Resume,"python, testing, python, python, sql, mathemat..."
3,j a c k s o n m a c a r t h u r w e b d e v e...,JacksonMacarthur_Resume,"javascript, html, css, node.js, mongodb, aws, ..."
4,w i n s t o n r o s e n b e r g w e b d e v...,WinstonRosenberg_Resume,"accessibility, javascript, html, css, python, ..."
5,sunshine smith web developer contact s_smith@e...,SunshineSmith_Resume,"angular, css"
6,thalassa voss etl tester contact t.voss@email....,ThalassaVoss_Resume,"tester, sql, git, tableau, tester, sql, git, s..."
7,lysander hale penetration tester l.hale@email....,LysanderHale_Resume,"tester, tester, security, security, security, ..."
8,em ery harrison qa tester contact e.harrison@e...,EmeryHarrison_Resume,"tester, zephyr, appium, java, gherkin, bugzill..."
9,career objective a future-driven and methodic...,TylerRussell_Resume,"sql, pandas, spark, aws, agile, python"


In [None]:
df['Spacy_Extracted_Skills'][0]

'statistics, python, numpy, pandas, keras, sql, mysql, git, python, tableau'

# **Feature** **Extraction**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
tfidf_vectorizer = TfidfVectorizer()

In [None]:
# Transform the text data
X = tfidf_vectorizer.fit_transform(df['Spacy_Extracted_Skills'])

# Check the shape of the feature matrix
print("Shape of X (Feature Matrix):", X.shape)


Shape of X (Feature Matrix): (14, 39)


In [None]:
# Convert the TF-IDF matrix to a DataFrame
tfidf_df = pd.DataFrame(X.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
print(tfidf_df.head())

   accessibility  agile  angular  appium      aws  bootstrap  bugzilla  \
0       0.000000    0.0      0.0     0.0  0.00000        0.0       0.0   
1       0.000000    0.0      0.0     0.0  0.00000        0.0       0.0   
2       0.000000    0.0      0.0     0.0  0.00000        0.0       0.0   
3       0.177740    0.0      0.0     0.0  0.35548        0.0       0.0   
4       0.480833    0.0      0.0     0.0  0.00000        0.0       0.0   

   calculus       css  docker  ...  qualysguard    react  security  spark  \
0  0.000000  0.000000     0.0  ...          0.0  0.00000       0.0    0.0   
1  0.581304  0.000000     0.0  ...          0.0  0.00000       0.0    0.0   
2  0.000000  0.000000     0.0  ...          0.0  0.00000       0.0    0.0   
3  0.000000  0.130527     0.0  ...          0.0  0.17774       0.0    0.0   
4  0.000000  0.353109     0.0  ...          0.0  0.00000       0.0    0.0   

        sql  statistics   tableau  tester   testing  zephyr  
0  0.214784    0.344138  0.344

In [None]:
tfidf_df.iloc[:,5:30]

Unnamed: 0,bootstrap,bugzilla,calculus,css,docker,gherkin,git,html,java,javascript,...,mathematics,mongodb,mysql,nmap,node,numpy,pandas,probability,python,qualysguard
0,0.0,0.0,0.0,0.0,0.0,0.0,0.344138,0.0,0.0,0.0,...,0.0,0.0,0.276769,0.0,0.0,0.306198,0.306198,0.0,0.464789,0.0
1,0.0,0.0,0.581304,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.335417,0.0,0.134878,0.0,0.0,0.0,0.0,0.193768,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.505982,0.0,0.203465,0.0,0.0,0.2251,0.2251,0.0,0.683374,0.0
3,0.0,0.0,0.0,0.130527,0.0,0.0,0.0,0.142945,0.0,0.428836,...,0.0,0.142945,0.0,0.0,0.53322,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.353109,0.0,0.0,0.0,0.386705,0.0,0.386705,...,0.0,0.386705,0.0,0.0,0.0,0.0,0.0,0.0,0.324704,0.0
5,0.0,0.0,0.0,0.53642,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.418602,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.299531,0.0,0.0,0.0,0.0,0.0,0.299531
8,0.0,0.317023,0.0,0.0,0.0,0.317023,0.0,0.0,0.137194,0.0,...,0.0,0.110337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.386228,0.0,0.293135,0.0


In [None]:
df['Designation'] = ['Data Scientist','Data Scientist','Data Scientist','web developer','web developer','web developer','tester','tester','tester','Data Scientist','Data Scientist','web developer','web developer','tester']

In [None]:
df

Unnamed: 0,Text,File Name,Spacy_Extracted_Skills,Designation
0,k a n d a c e l o u d o r d a t a s c i e n ...,KandaceLoudor_Resume,"statistics, python, numpy, pandas, keras, sql,...",Data Scientist
1,trish mathers entry-level data scientist innov...,TrishMathers_Resume,"sql, statistics, mathematics, calculus, calcul...",Data Scientist
2,terrence coleman tcoleman@email.com (123) 456-...,TerrenceColeman_Resume,"python, testing, python, python, sql, mathemat...",Data Scientist
3,j a c k s o n m a c a r t h u r w e b d e v e...,JacksonMacarthur_Resume,"javascript, html, css, node.js, mongodb, aws, ...",web developer
4,w i n s t o n r o s e n b e r g w e b d e v...,WinstonRosenberg_Resume,"accessibility, javascript, html, css, python, ...",web developer
5,sunshine smith web developer contact s_smith@e...,SunshineSmith_Resume,"angular, css",web developer
6,thalassa voss etl tester contact t.voss@email....,ThalassaVoss_Resume,"tester, sql, git, tableau, tester, sql, git, s...",tester
7,lysander hale penetration tester l.hale@email....,LysanderHale_Resume,"tester, tester, security, security, security, ...",tester
8,em ery harrison qa tester contact e.harrison@e...,EmeryHarrison_Resume,"tester, zephyr, appium, java, gherkin, bugzill...",tester
9,career objective a future-driven and methodic...,TylerRussell_Resume,"sql, pandas, spark, aws, agile, python",Data Scientist


# **Prepare Labels (y)**

In [None]:
from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()

y = labelencoder.fit_transform(df['Designation'])
print("encoded label for y :",y)

encoded label for y : [0 0 0 2 2 2 1 1 1 0 0 2 2 1]


# **Model** **Building**

In [None]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Training Features Shape:", x_train.shape)
print("Testing Features Shape:", x_test.shape)


Training Features Shape: (11, 39)
Testing Features Shape: (3, 39)


In [None]:
from sklearn.linear_model import LogisticRegression

# Initialize the model
model = LogisticRegression(max_iter=1000, random_state=42)

# Train the model
model.fit(x_train, y_train)

In [None]:
# Predict on the test set
y_pred = model.predict(x_test)

In [None]:
y_pred

array([2, 2, 0])

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Calculate Accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Classification Report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Confusion Matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.6666666666666666

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.50      0.67         2
           2       0.50      1.00      0.67         1

    accuracy                           0.67         3
   macro avg       0.75      0.75      0.67         3
weighted avg       0.83      0.67      0.67         3


Confusion Matrix:
[[1 1]
 [0 1]]


## **Naive Bayes**

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Initialize the model
model_nb = MultinomialNB()

# Train the model
model_nb.fit(x_train, y_train)

# Predict on the test set
y_pred_NB = model_nb.predict(x_test)

In [None]:
y_pred_NB

array([2, 2, 0])

In [None]:
# Accuracy
accuracy = accuracy_score(y_test, y_pred_NB)
print("Accuracy:", accuracy)

# Classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred_NB))

# Confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_NB))


Accuracy: 0.6666666666666666

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.50      0.67         2
           2       0.50      1.00      0.67         1

    accuracy                           0.67         3
   macro avg       0.75      0.75      0.67         3
weighted avg       0.83      0.67      0.67         3


Confusion Matrix:
[[1 1]
 [0 1]]


# **Saving The Model And Vectorizer**

In [None]:
import joblib

joblib.dump(model_nb, 'naive_bayes_resume_classifier.pkl')

# Save the TF-IDF vectorizer
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')

print("Model and vectorizer saved successfully!")


Model and vectorizer saved successfully!
