In [1]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OrdinalEncoder
import re
import joblib

In [2]:
training_df = pd.read_excel("job_listing_data.xlsx")

In [3]:
training_df = training_df.drop(columns=["Requirements"])

In [4]:
training_df.head()

Unnamed: 0,EDUCATION,EXPERIENCE,SKILLS,NOISE,RESPONSIBILITIES
0,A relevant Bachelor's degree,A minimum of 3 years of relevant experience,"Proficiency in Python for data analysis, machi...","We’re a diverse team of more than 3,000 tech e...","Manage and grow social media accounts, create ..."
1,Bachelor's or Master degree,2-6 years of relevant work experience in data ...,The insight to take ambiguous problems and sol...,"Working across industries and disciplines, we ...","Design and code user interfaces for websites, ..."
2,"Master’s Degree or higher in Computer Science,...","2+ years of experience in data analytics, cons...",Proficiency in at least one programming langua...,"We go beyond what was once thought possible, c...",Establish and enforce quality control standard...
3,Bachelor’s or Master’s degree in Computer Scie...,At least 2 years of deep-learning/NLP experien...,"Very experienced with SQL queries, Basic ETL, ...",Leveraging BCG’s global network and partnershi...,"Design, configure, and optimize wireless netwo..."
4,"Tertiary qualifications in ICT, Computer Scien...",Minimum of 3 years of industry experience in D...,Proficiency in one or more analytics & visuali...,"Together, we strive to create solutions that w...",Specialize in conference and convention planni...


In [5]:
##---Preprocess Into Long Format---##

long_df = training_df.melt(var_name="Category", value_name="Text")

long_df = long_df.dropna().reset_index(drop=True)

long_df

Unnamed: 0,Category,Text
0,EDUCATION,A relevant Bachelor's degree
1,EDUCATION,Bachelor's or Master degree
2,EDUCATION,"Master’s Degree or higher in Computer Science,..."
3,EDUCATION,Bachelor’s or Master’s degree in Computer Scie...
4,EDUCATION,"Tertiary qualifications in ICT, Computer Scien..."
...,...,...
822,RESPONSIBILITIES,Enhance model performance through feature engi...
823,RESPONSIBILITIES,Conduct experiments to test hypotheses and der...
824,RESPONSIBILITIES,Create compelling visualizations to effectivel...
825,RESPONSIBILITIES,Identify new applications for data science to ...


### Vectorize

In [7]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(long_df["Text"])
vocabulary = vectorizer.vocabulary_
vectorizer.get_feature_names_out()

array(['000', '00am', '10', ..., 'zoning', 'zoology', 'zoom'],
      dtype=object)

In [8]:
filename = '../models/job_listing_vectorizer.joblib'

joblib.dump(vectorizer, filename)

['../models/job_listing_vectorizer.joblib']

In [9]:
tfidf_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

In [10]:
long_df = long_df.drop(columns=["Text"])

train_data = pd.concat([tfidf_df, long_df], axis=1)

train_data

Unnamed: 0,000,00am,10,100,12,12d,14,15,1year,20,...,years,you,your,zealand,zero,zone,zoning,zoology,zoom,Category
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,EDUCATION
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,EDUCATION
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,EDUCATION
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,EDUCATION
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,EDUCATION
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
822,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,RESPONSIBILITIES
823,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,RESPONSIBILITIES
824,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,RESPONSIBILITIES
825,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,RESPONSIBILITIES


In [11]:
vocabulary

{'relevant': 1792,
 'bachelor': 227,
 'degree': 602,
 'or': 1493,
 'master': 1337,
 'higher': 1033,
 'in': 1092,
 'computer': 456,
 'science': 1893,
 'data': 579,
 'related': 1787,
 'field': 877,
 'statistics': 2045,
 'mathematics': 1342,
 'artificial': 181,
 'intelligence': 1151,
 'machine': 1306,
 'learning': 1248,
 'tertiary': 2151,
 'qualifications': 1721,
 'ict': 1064,
 'information': 1120,
 'systems': 2119,
 'retrieval': 1843,
 'nlp': 1444,
 'vision': 2313,
 'multimodal': 1419,
 'fields': 878,
 'ai': 100,
 'engineering': 759,
 'formal': 919,
 'education': 716,
 'technical': 2139,
 'physics': 1582,
 'etc': 795,
 'with': 2371,
 'an': 121,
 'emphasis': 739,
 'on': 1471,
 'software': 1989,
 'development': 637,
 'of': 1463,
 'business': 301,
 'administration': 65,
 'masters': 1338,
 'arts': 184,
 'applications': 156,
 'technology': 2143,
 'similar': 1970,
 'commerce': 424,
 'doctor': 679,
 'philosophy': 1578,
 'bs': 292,
 'study': 2076,
 'equivalent': 786,
 'practical': 1627,
 'experi

In [12]:
encoder = OrdinalEncoder()

encoder.fit(train_data[["Category"]])

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,


In [36]:
train_data["Category"] = encoder.transform(train_data[["Category"]])

In [40]:
filename = '../models/job_listing_label_encoder.joblib'

joblib.dump(encoder, filename)

['../models/job_listing_label_encoder.joblib']

In [42]:
train_data["Category"].value_counts()

Category
3.0    325
2.0    255
4.0     95
1.0     89
0.0     63
Name: count, dtype: int64

### Train

In [45]:
from sklearn.ensemble import RandomForestClassifier

In [47]:
X_train = train_data.iloc[:, :-1]
Y_train = train_data.iloc[:, -1]

In [49]:
rf = RandomForestClassifier()

In [50]:
rf.fit(X_train, Y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [52]:
import joblib

filename = '../models/job_listing_rf_classifier.joblib'

joblib.dump(rf, filename)

['../models/job_listing_rf_classifier.joblib']

### Chunk and Classify Job Listing For Testing

In [54]:
job_listing = """
Melbourne, Sydney

Data Scientist - Junior, Mid and Senior Opportunities
Choosing Capgemini means choosing a company where you will be empowered to shape your career in the way you’d like, where you’ll be supported and inspired by a collaborative community of colleagues around the world, and where you’ll be able to reimagine what’s possible. Join us and help the world’s leading organizations unlock the value of technology and build a more sustainable, more inclusive world.

Job Description
About Capgemini:

Capgemini is a diverse collective of more than 350,000 strategic and technological experts based across more than 50 countries, partnering with world-renowned clients to transform and manage their businesses.

We are dedicated to leveraging cloud, data, AI, connectivity, software, digital engineering, and platforms to address the entire breadth of their business needs. This passion drives a powerful commitment - to unlock the true value of technology.

Our business in Australia and New Zealand has over 3,000 team members devoted to helping clients get the future they want.

Now is the time to join our rapidly growing team who are at the forefront of finding new ways technology can help us reimagine what’s possible, collecting unique career experiences with global brands and game-changing tech projects.

About the Service Line:

Capgemini Global Insights & Data business line is a market leader in Data Engineering, Cloud Data Platforms, Data Science, and AI and Advanced Analytics across all sectors including financial services, public sector, consumer products, telecommunication & energy resources. Our offerings include end-2-end data integration to cloud platforms, complete suite of data engineering capabilities and a category of sector-based advanced analytics and AI driven solutions.

This is a unique opportunity within the Insights & Data (I&D) Global Business Line (GBL) to help grow the data, AI and Data Science engagements across all sectors in Australia and New Zealand. You will help shape and deliver complex projects and programs of work for our clients. You will understand our client’s need to bring maximum value, impact, and innovation to the market as you oversee the delivery of client engagements for data and AI initiatives.

Let’s talk about the roles and responsibilities:


Experience working in the field of generative AI applying large language models to real world problems.
Experience working with agentic AI frameworks such as Langgraph, Autogen and CrewAI etc.
Build data science and AI solutions for clients relating to their business problems in areas such as generative AI, NLP, computer vision, machine learning, etc.
Collaborate with sales and engagement teams to build client proposals in response to RFPs.
Experience in designing end-to-end Generative AI/ Data Science solutions.
Experience and Qualifications:
We’re interested in hearing from people across junior, mid and senior levels.
For Mid and senior levels, we are seeking extensive experience in AI and Machine Learning: Proven track record of leading, developing and implementing AI and machine learning solutions across various industries.
Strong Programming Skills: Proficiency in Python programming
Experience in Data Analysis and Visualization: Strong skills in data analysis, visualization, and tools such as Tableau, Power BI, and other data visualization platforms(preferred).
Degree in Data Science or Related Field: Bachelor’s or master’s in data science, Computer Science, Statistics, or a related field.
Due to the nature of the role, Australian citizens are highly desirable.
What You’ll Love About Working Here:
Exposure to transformational programs in AI & Data portfolio.
Career growth through learning platforms, certifications, and global mobility.
Inclusive culture backed by our “Inclusive Future for All” commitment.
Competitive total rewards and recognition programs.
Capgemini is an AI-powered global business and technology transformation partner, delivering tangible business value. We imagine the future of organizations and make it real with AI, technology and people. With our strong heritage of nearly 60 years, we are a responsible and diverse group of 420,000 team members in more than 50 countries. We deliver end-to-end services and solutions with our deep industry expertise and strong partner ecosystem, leveraging our capabilities across strategy, technology, design, engineering and business operations. The Group reported 2024 global revenues of €22.1 billion.
Make it real | www.capgemini.com

Ref. code
403925-en_GB
Posted on
29 Jan 2026
Experience level
Experienced Professionals
Contract type
Permanent
Location
Melbourne, Sydney
Business unit
I and D Global Business Line
Brand
Capgemini
Professional communities
Data & AI

"""

In [56]:
def chunk_text(text: str) -> list[str]:
    regex = re.compile(r"[\n:.]")
    chunks = re.split(regex, text)
    return chunks

In [57]:
def clean_chunks(chunks: list[str]) -> list[str]:
    cleaned_chunks = []
    for chunk in chunks:
        chunk = chunk.strip()
        if chunk:
            cleaned_chunks.append(chunk)
        
    return cleaned_chunks

In [58]:
def vectorize_chunks(chunks: list[str], vectorizer: TfidfVectorizer()) -> pd.DataFrame:
    vectorized_chunks = vectorizer.transform([chunk for chunk in chunks])
    vectorized_chunks_df = pd.DataFrame(vectorized_chunks.toarray(), columns=vectorizer.get_feature_names_out())
    return vectorized_chunks_df

In [59]:
def classify_chunks(vectorized_chunks: pd.DataFrame, rf: RandomForestClassifier()) -> list[int]:
    results = rf.predict(vectorized_chunks)
    return results

In [61]:
chunks = chunk_text(text=job_listing)

In [62]:
chunks = clean_chunks(chunks=chunks)

In [64]:
job_listing_vectorized_chunks_df = vectorize_chunks(chunks=chunks, vectorizer=vectorizer)

In [67]:
results = classify_chunks(vectorized_chunks=job_listing_vectorized_chunks_df, rf=rf)

In [70]:
decoded = encoder.inverse_transform(results.reshape(-1, 1))
decoded

array([['NOISE'],
       ['NOISE'],
       ['NOISE'],
       ['NOISE'],
       ['NOISE'],
       ['NOISE'],
       ['RESPONSIBILITIES'],
       ['NOISE'],
       ['NOISE'],
       ['NOISE'],
       ['NOISE'],
       ['NOISE'],
       ['NOISE'],
       ['NOISE'],
       ['NOISE'],
       ['NOISE'],
       ['NOISE'],
       ['NOISE'],
       ['EXPERIENCE'],
       ['EXPERIENCE'],
       ['NOISE'],
       ['RESPONSIBILITIES'],
       ['EXPERIENCE'],
       ['NOISE'],
       ['EDUCATION'],
       ['NOISE'],
       ['NOISE'],
       ['SKILLS'],
       ['SKILLS'],
       ['EXPERIENCE'],
       ['SKILLS'],
       ['EDUCATION'],
       ['EDUCATION'],
       ['NOISE'],
       ['NOISE'],
       ['NOISE'],
       ['NOISE'],
       ['NOISE'],
       ['NOISE'],
       ['NOISE'],
       ['NOISE'],
       ['NOISE'],
       ['NOISE'],
       ['NOISE'],
       ['NOISE'],
       ['NOISE'],
       ['NOISE'],
       ['NOISE'],
       ['NOISE'],
       ['NOISE'],
       ['NOISE'],
       ['NOISE'],
       