In [1]:
import pandas as pd
import re

# Load the entire dataset
file_path = '/content/sample_file.csv'
data = pd.read_csv(file_path)

In [2]:
# 1. Clean 'Experience' column to retain only the minimum experience number
def extract_min_experience(exp):
    years = re.findall(r'\d+', str(exp))
    if years:
        return int(years[0])  # Take the first (minimum) number found
    return 0  # Default to 0 if no numbers are found

data['Experience'] = data['Experience'].apply(extract_min_experience)

# Display the first few rows to verify
data.head()

Unnamed: 0,Experience,Qualifications,Job Title,Job Description,Skills,Resume
0,2,PhD,Sales Consultant,Sales Trainers develop and deliver training pr...,Sales training Sales coaching Training program...,â¢ Operating Systems: Windows â¢ Others: MS ...
1,3,B.Tech,Environmental Engineer,An Environmental Consultant assesses environme...,Environmental assessments Regulatory complianc...,Skill Set â¢â¢Cisco Certified Network Associ...
2,3,MCA,Java Developer,Java Web Application Developers create web app...,Web application development Java web framework...,"Technical Skills Web Technologies: Angular JS,..."
3,2,M.Com,Technical Writer,Documentation Specialists create and manage do...,Document management Recordkeeping Attention to...,TechnicalProficiencies DB: Oracle 11g Domains:...
4,5,M.Tech,Supply Chain Analyst,A Logistics Analyst analyzes and optimizes log...,Supply chain analysis Data analytics Inventory...,Education Details \r\nMay 2014 Diploma Nutriti...


In [3]:
import string

# 2. Standardize 'Qualifications' column by stripping whitespace, converting to lowercase, and removing punctuation
data['Qualifications'] = data['Qualifications'].str.strip().str.lower().str.replace(f"[{string.punctuation}]", "", regex=True)

# Display the first few rows to verify
data.head(5)

Unnamed: 0,Experience,Qualifications,Job Title,Job Description,Skills,Resume
0,2,phd,Sales Consultant,Sales Trainers develop and deliver training pr...,Sales training Sales coaching Training program...,â¢ Operating Systems: Windows â¢ Others: MS ...
1,3,btech,Environmental Engineer,An Environmental Consultant assesses environme...,Environmental assessments Regulatory complianc...,Skill Set â¢â¢Cisco Certified Network Associ...
2,3,mca,Java Developer,Java Web Application Developers create web app...,Web application development Java web framework...,"Technical Skills Web Technologies: Angular JS,..."
3,2,mcom,Technical Writer,Documentation Specialists create and manage do...,Document management Recordkeeping Attention to...,TechnicalProficiencies DB: Oracle 11g Domains:...
4,5,mtech,Supply Chain Analyst,A Logistics Analyst analyzes and optimizes log...,Supply chain analysis Data analytics Inventory...,Education Details \r\nMay 2014 Diploma Nutriti...


In [4]:
# Function to clean text: remove extra whitespace, URLs, and convert to lowercase
def clean_text(text):
    if pd.isnull(text):  # Handle missing values or NaN
        return ""
    text = re.sub(r'https?://\S+|www\.\S+', '', str(text))  # Remove URLs
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra whitespace
    return text.lower()  # Convert to lowercase

# Apply cleaning to specific columns
data['Job Description'] = data['Job Description'].apply(clean_text)
data['Skills'] = data['Skills'].apply(clean_text)
data['Resume'] = data['Resume'].apply(clean_text)

# Display the first few rows to verify the cleaning
print(data[['Job Description', 'Skills', 'Resume']].head())

                                     Job Description  \
0  sales trainers develop and deliver training pr...   
1  an environmental consultant assesses environme...   
2  java web application developers create web app...   
3  documentation specialists create and manage do...   
4  a logistics analyst analyzes and optimizes log...   

                                              Skills  \
0  sales training sales coaching training program...   
1  environmental assessments regulatory complianc...   
2  web application development java web framework...   
3  document management recordkeeping attention to...   
4  supply chain analysis data analytics inventory...   

                                              Resume  
0  â¢ operating systems: windows â¢ others: ms ...  
1  skill set â¢â¢cisco certified network associ...  
2  technical skills web technologies: angular js,...  
3  technicalproficiencies db: oracle 11g domains:...  
4  education details may 2014 diploma nutrition e..

In [5]:
data['Skills'] = data['Skills'].str.replace(r'<[^>]+>', '', regex=True)  # Remove HTML

In [6]:
pip install spacy



In [7]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m88.4 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [8]:
import spacy
import re

# Load the SpaCy NER model
nlp = spacy.load("en_core_web_sm")

# Define a set of technical terms or keywords that should be preserved as-is
preserve_keywords = {"c++", "c#", "python", "java", "javascript", "sql", "html", "css", "r", "node.js", "react.js",
                     "angular.js", "ux/ui", "ui", "ux", "ms office", "ms excel", "ms power point", "aws", "oracle", "ruby"}

# List of entity types to exclude (e.g., organization names, person names, locations, etc.)
exclude_entities = {'ORG', 'GPE', 'LOC', 'DATE', 'TIME', 'PERSON', 'FAC', 'NORP', 'EVENT'}

# Function to preserve keywords and clean unwanted entities from text
def preserve_and_clean_entities(text):
    # Remove non-ASCII characters globally
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)  # Replace non-ASCII characters with a space
    doc = nlp(text)
    preserved_text = []

    for token in doc:
        # Check if the token is in the preserve_keywords set (case-insensitive check)
        if token.text.lower() in preserve_keywords:
            preserved_text.append(token.text)  # Preserve exactly as-is
        # Exclude entities based on entity types (ORG, GPE, etc.)
        elif token.ent_type_ in exclude_entities:
            continue  # Skip this token if it's an unwanted entity
        # Preserve technical entities identified as programming languages, technologies, etc.
        elif token.ent_type_ in ['PRODUCT', 'LANGUAGE']:
            preserved_text.append(token.text)
        # For other tokens, remove unnecessary punctuation and convert to lowercase
        elif token.is_alpha:
            preserved_text.append(token.text.lower())
        else:
            # Remove punctuation except those in preserve_keywords
            clean_token = re.sub(r'[^\w\s]', '', token.text)  # Removes any punctuation except alphanumerics and spaces
            if clean_token:  # Only add if token is not empty after cleaning
                preserved_text.append(clean_token.lower())

    # Join words and remove extra whitespace
    return " ".join(preserved_text).strip()

# Apply the entity preservation and cleaning function to Job Description, Skills, and Resume columns
data['Cleaned_Job_Description'] = data['Job Description'].apply(lambda x: preserve_and_clean_entities(str(x)))
data['Cleaned_Skills'] = data['Skills'].apply(lambda x: preserve_and_clean_entities(str(x)))
data['Cleaned_Resume'] = data['Resume'].apply(lambda x: preserve_and_clean_entities(str(x)))

# Display the cleaned columns
print(data[['Job Description', 'Cleaned_Job_Description', 'Skills', 'Cleaned_Skills', 'Resume', 'Cleaned_Resume']].head())

                                     Job Description  \
0  sales trainers develop and deliver training pr...   
1  an environmental consultant assesses environme...   
2  java web application developers create web app...   
3  documentation specialists create and manage do...   
4  a logistics analyst analyzes and optimizes log...   

                             Cleaned_Job_Description  \
0  sales trainers develop and deliver training pr...   
1  an environmental consultant assesses environme...   
2  java developers create web applications using ...   
3  documentation specialists create and manage do...   
4  a logistics analyst analyzes and optimizes log...   

                                              Skills  \
0  sales training sales coaching training program...   
1  environmental assessments regulatory complianc...   
2  web application development java web framework...   
3  document management recordkeeping attention to...   
4  supply chain analysis data analytics invent

In [9]:
# Tokenize cleaned columns
data['Tokenized_Job_Description'] = data['Cleaned_Job_Description'].apply(lambda x: x.split())
data['Tokenized_Skills'] = data['Cleaned_Skills'].apply(lambda x: x.split())
data['Tokenized_Resume'] = data['Cleaned_Resume'].apply(lambda x: x.split())

# Convert Qualifications and Job Title into single tokens (list with one item each)
data['Tokenized_Qualifications'] = data['Qualifications'].apply(lambda x: [x.strip()] if pd.notnull(x) else [])
data['Tokenized_Job_Title'] = data['Job Title'].apply(lambda x: [x.strip().lower()] if pd.notnull(x) else [])

# Display the tokenized columns
print(data[['Tokenized_Job_Description', 'Tokenized_Skills', 'Tokenized_Resume', 'Tokenized_Qualifications', 'Tokenized_Job_Title']].head())

                           Tokenized_Job_Description  \
0  [sales, trainers, develop, and, deliver, train...   
1  [an, environmental, consultant, assesses, envi...   
2  [java, developers, create, web, applications, ...   
3  [documentation, specialists, create, and, mana...   
4  [a, logistics, analyst, analyzes, and, optimiz...   

                                    Tokenized_Skills  \
0  [sales, training, sales, coaching, training, p...   
1  [environmental, assessments, regulatory, compl...   
2  [web, application, development, java, web, fra...   
3  [document, management, recordkeeping, attentio...   
4  [supply, chain, analysis, data, analytics, inv...   

                                    Tokenized_Resume Tokenized_Qualifications  \
0  [operating, systems, windows, others, ms, exce...                    [phd]   
1  [skill, set, cisco, certified, ccna, basic, kn...                  [btech]   
2  [technical, skills, web, technologies, angular...                    [mca]   
3 

In [10]:
!pip install inflect



In [11]:
import inflect
from spacy.lang.en.stop_words import STOP_WORDS

# Initialize inflect engine for number to text conversion
p = inflect.engine()

# List of unwanted symbols/words to remove
unwanted_terms = {"&", "eg", "etc", "e.g.", "etc."}

def clean_tokens(tokens):
    cleaned_tokens = []
    for token in tokens:
        # Convert numbers to text
        if token.isdigit():
            token = p.number_to_words(token)
        # Remove unwanted terms and stopwords
        if token.lower() not in STOP_WORDS and token.lower() not in unwanted_terms:
            # Remove any remaining non-alphanumeric characters (except for preserved tokens)
            token = re.sub(r'[^\w\s]', '', token)
            if token:  # Add token if it is not empty after cleaning
                cleaned_tokens.append(token.lower())
    return cleaned_tokens

# Apply the cleaning function to each column
data['Tokenized_Job_Description'] = data['Tokenized_Job_Description'].apply(clean_tokens)
data['Tokenized_Skills'] = data['Tokenized_Skills'].apply(clean_tokens)
data['Tokenized_Resume'] = data['Tokenized_Resume'].apply(clean_tokens)

# Display the cleaned columns
print(data[['Tokenized_Job_Description', 'Tokenized_Skills', 'Tokenized_Resume']].head())

                           Tokenized_Job_Description  \
0  [sales, trainers, develop, deliver, training, ...   
1  [environmental, consultant, assesses, environm...   
2  [java, developers, create, web, applications, ...   
3  [documentation, specialists, create, manage, d...   
4  [logistics, analyst, analyzes, optimizes, logi...   

                                    Tokenized_Skills  \
0  [sales, training, sales, coaching, training, p...   
1  [environmental, assessments, regulatory, compl...   
2  [web, application, development, java, web, fra...   
3  [document, management, recordkeeping, attentio...   
4  [supply, chain, analysis, data, analytics, inv...   

                                    Tokenized_Resume  
0  [operating, systems, windows, ms, excel, ms, o...  
1  [skill, set, cisco, certified, ccna, basic, kn...  
2  [technical, skills, web, technologies, angular...  
3  [technicalproficiencies, oracle, g, domains, i...  
4  [education, details, diploma, nutrition, educa..

In [12]:
import re

# Combine tokens from Job Description, Skills, and Resume columns
all_tokens = set()
for column in ['Tokenized_Job_Description', 'Tokenized_Skills', 'Tokenized_Resume']:
    for tokens in data[column].dropna():
        all_tokens.update(tokens)

# Define a pattern to identify unwanted terms (short words, abbreviations, special characters, etc.)
unwanted_pattern = re.compile(r'\b[a-z]{1,3}\b|&|e\.g\.|etc\.|eg\b|etc\b|^\W+$', re.IGNORECASE)

# Find terms that match the unwanted pattern
unwanted_terms_found = {token for token in all_tokens if unwanted_pattern.match(token)}

# Display the identified unwanted terms
print("Identified unwanted terms:", unwanted_terms_found)

Identified unwanted terms: {'kt', 'amc', 'iii', 'wpl', 'aim', 'two quadrillion sixteen trillion two hundred and one billion four hundred and twenty million one hundred and twentytwo thousand and eight', 'ore', 'wm', 'sk', 'cem', 'csi', 'vip', 'wns', 'hpc', 'adi', 'fg', 'esd', 'slt', 'aht', 'utm', 'o', 'usa', 'd', 'apr', 'mul', 'bca', 'vpn', 'ba', 'ase', 'uit', 'far', 'cit', 'xen', 'xl', 'ide', 'ho', 'yee', 'sr', 'hoc', 'one hundred and fifty', 'php', 'url', 'alm', 'ipa', 'pm', 'br', 'mw', 'gum', 'bmc', 'eco', 'nld', 'ac', 'rac', 'tdd', 'iam', 'qa', 'asa', 'mf', 'xml', 'hpm', 'pia', 'raw', 'day', 'inc', 'ct', 'pot', 'cl', 'bse', 'hmi', 'mca', 'ds', 'two thousand and fifteen', 'pod', 'oat', 'fdm', 'cum', 'dep', 'tid', 'act', 'ecc', 'uom', 'one hundred and twenty', 'cbp', 'dg', 'ner', 'pp', 'bus', 'one hundred and sixtythree', 'wcf', 'one hundred', 'dts', 'vc', 'tax', 'imp', 'san', 'eye', 'cs', 'dbs', 'sri', 'scd', 'eot', 'k', 'tm', 'mr', 'hgs', 'tac', 'pba', 'dr', 'ai', 'mog', 'emc', 'tq

In [16]:
import inflect
from spacy.lang.en.stop_words import STOP_WORDS
import re

# Initialize inflect engine for number to text conversion
p = inflect.engine()

# Define terms to preserve (technical abbreviations)
preserve_keywords = {"c++", "c#", "python", "java", "javascript", "sql", "html", "css", "r", "node.js", "react.js",
                     "angular.js", "ux/ui", "ui", "ux", "api", "aws", "erp", "sas", "crm", "vpn", "gui", "xml"}

# Define terms to remove as noise
unwanted_terms = {"&", "eg", "etc", "e.g.", "etc.", "and", "job",
                  "man", "new", "yes", "won", "low", "skills"}

# Function to clean tokens by removing unwanted terms and symbols, converting numbers to text, and preserving relevant terms
def clean_tokens(tokens):
    cleaned_tokens = []
    for token in tokens:
        # Convert numbers to text, if they are standalone digits
        if token.isdigit():
            token = p.number_to_words(token)

        # Check if token should be preserved or cleaned
        token_lower = token.lower()
        if token_lower in preserve_keywords:
            cleaned_tokens.append(token)  # Preserve as-is
        elif token_lower not in STOP_WORDS and token_lower not in unwanted_terms:
            # Remove any remaining non-alphanumeric characters (except preserved tokens)
            clean_token = re.sub(r'[^\w\s]', '', token)
            if clean_token:  # Only add if token is not empty after cleaning
                cleaned_tokens.append(clean_token.lower())

    return cleaned_tokens

# Apply the cleaning function to each column
data['Tokenized_Job_Description'] = data['Tokenized_Job_Description'].apply(clean_tokens)
data['Tokenized_Skills'] = data['Tokenized_Skills'].apply(clean_tokens)
data['Tokenized_Resume'] = data['Tokenized_Resume'].apply(clean_tokens)

# Display the cleaned columns to verify
print(data[['Tokenized_Job_Description', 'Tokenized_Skills', 'Tokenized_Resume']].head())

                           Tokenized_Job_Description  \
0  [sales, trainers, develop, deliver, training, ...   
1  [environmental, consultant, assesses, environm...   
2  [java, developers, create, web, applications, ...   
3  [documentation, specialists, create, manage, d...   
4  [logistics, analyst, analyzes, optimizes, logi...   

                                    Tokenized_Skills  \
0  [sales, training, sales, coaching, training, p...   
1  [environmental, assessments, regulatory, compl...   
2  [web, application, development, java, web, fra...   
3  [document, management, recordkeeping, attentio...   
4  [supply, chain, analysis, data, analytics, inv...   

                                    Tokenized_Resume  
0  [operating, systems, windows, ms, excel, ms, o...  
1  [skill, set, cisco, certified, ccna, basic, kn...  
2  [technical, web, technologies, angular, js, cs...  
3  [technicalproficiencies, oracle, g, domains, i...  
4  [education, details, diploma, nutrition, educa..

In [17]:
# Correct the spelling of "exprience" to "experience" in the entire DataFrame
data = data.replace(to_replace=r'\bexprience\b', value='experience', regex=True)

In [18]:
# Clean the Job Title column: remove whitespace, convert to lowercase, and tokenize as a single-element list
data['Cleaned_Job_Title'] = data['Job Title'].apply(lambda x: [x.strip().lower()] if pd.notnull(x) else [])

# Display the original and cleaned Job Title columns to verify
print(data[['Job Title', 'Cleaned_Job_Title']].head())

                Job Title         Cleaned_Job_Title
0        Sales Consultant        [sales consultant]
1  Environmental Engineer  [environmental engineer]
2          Java Developer          [java developer]
3        Technical Writer        [technical writer]
4    Supply Chain Analyst    [supply chain analyst]
