In [2]:
# Location of the raw Naukri export, relative to the raw-data directory.
naukri_path = DATA_RAW.joinpath("naukri", "naukri_data_scientist.csv")

# Read only the three columns this notebook works with.
# NOTE(review): jobId is left to pandas' default type inference; if the CSV stores
# IDs with leading zeros they would be lost when parsed as integers -- TODO confirm.
skills_demand_raw = pd.read_csv(naukri_path, usecols=["jobId", "createdDate", "tagsAndSkills"])

# (rows, columns) of the loaded frame.
skills_demand_raw.shape

(13691, 3)

In [1]:
from pathlib import Path
import pandas as pd

# Paths are relative to the directory the notebook is run from: the project root is
# taken to be its parent directory, and raw inputs live under <root>/data/raw.
PROJECT_ROOT = Path("..")
DATA_RAW = PROJECT_ROOT.joinpath("data", "raw")

In [3]:
# Peek at the first three rows of the raw frame.
skills_demand_raw.iloc[:3]

Unnamed: 0,createdDate,jobId,tagsAndSkills
0,2024-12-23 06:51:32,10421006391,"Natural Language Processing,Phd,Deep Learning,..."
1,2024-12-23 06:33:58,231224903546,"Data Science,Tensorflow,StatModel,SAS,Go,Rando..."
2,2024-12-23 09:29:16,231224006929,"cuda,python,github,natural language processing..."


In [4]:
# Parse the posting timestamp; anything unparseable becomes NaT instead of raising.
skills_demand_raw["createdDate"] = pd.to_datetime(skills_demand_raw["createdDate"], errors="coerce")

# Nullable integer year, so rows with a NaT timestamp would carry <NA> rather than a float NaN.
skills_demand_raw["year"] = skills_demand_raw["createdDate"].dt.year.astype("Int64")

# Sanity check: (number of missing years, earliest year, latest year).
(
    skills_demand_raw["year"].isna().sum(),
    skills_demand_raw["year"].min(),
    skills_demand_raw["year"].max(),
)

(np.int64(0), np.int64(1970), np.int64(2024))

In [5]:
# Show the first ten rows whose parsed year is 1970.
skills_demand_raw.loc[skills_demand_raw["year"].eq(1970)].head(10)

Unnamed: 0,createdDate,jobId,tagsAndSkills,year
30,1970-01-01,111224502998,,1970
153,1970-01-01,101224500962,,1970
724,1970-01-01,170624502294,,1970
922,1970-01-01,111224503907,,1970
924,1970-01-01,121224502905,,1970
1381,1970-01-01,280324501556,,1970
1393,1970-01-01,81117500277,,1970
1456,1970-01-01,211019501420,,1970
1467,1970-01-01,140519500097,,1970
1470,1970-01-01,30624502769,,1970


In [6]:
# Boolean selector for the rows dated 1970.
mask_1970 = skills_demand_raw["year"].eq(1970)

# (number of 1970 rows, how many of those have no tagsAndSkills value).
(
    mask_1970.sum(),
    skills_demand_raw.loc[mask_1970, "tagsAndSkills"].isna().sum(),
)

(np.int64(56), np.int64(56))

In [7]:
# Keep only postings that actually list skills; copy so that later column
# assignments do not touch skills_demand_raw.
skills_to_explode = skills_demand_raw.loc[skills_demand_raw["tagsAndSkills"].notna()].copy()

# (rows, columns) after removing postings without skills.
skills_to_explode.shape

(13635, 4)

In [8]:
# First five comma-separated skill strings, to see the format before splitting.
skills_to_explode["tagsAndSkills"].iloc[:5]

0    Natural Language Processing,Phd,Deep Learning,...
1    Data Science,Tensorflow,StatModel,SAS,Go,Rando...
2    cuda,python,github,natural language processing...
3    Tensorflow,Pytorch,Python,Pandas,EDA Tools,Ker...
4         Data science,data scientist,LLM,Science,Data
Name: tagsAndSkills, dtype: object

In [9]:
# One row per (posting, skill token): split the comma-separated string into a list,
# then explode the list so each element gets its own row. The original row index is
# repeated for every token that came from the same posting.
skills_exploded = (
    skills_to_explode[["jobId", "year", "tagsAndSkills"]]
    .assign(skill=lambda frame: frame["tagsAndSkills"].str.split(","))
    .drop(columns="tagsAndSkills")
    .explode("skill")
)

# (rows, columns) of the long-format frame.
skills_exploded.shape

(105427, 3)

In [10]:
# First twenty exploded tokens, still in their raw spelling.
skills_exploded["skill"].iloc[:20]

0    Natural Language Processing
0                            Phd
0                  Deep Learning
0                   Data Science
0                      Languages
0                           Data
0                     Processing
0                        Science
1                   Data Science
1                     Tensorflow
1                      StatModel
1                            SAS
1                             Go
1                 Random Forests
1                            SVM
1                         Gensim
2                           cuda
2                         python
2                         github
2    natural language processing
Name: skill, dtype: object

In [11]:
# Reproducible random sample of twenty tokens (fixed seed) to eyeball casing variants.
skills_exploded["skill"].sample(n=20, random_state=42)

2978                       spark
4468             data governance
7207                      Python
4507           Business analysis
242                    Analytics
185                   Deployment
4745              data cleansing
4641                      Coding
6891             Version control
5056                      Django
2482      information technology
7550                gap analysis
9437              Data migration
224               ML Development
11635            Data processing
12959       frontend development
1340                         LLM
6718                        Data
7365     Diversity and Inclusion
6105                         SQL
Name: skill, dtype: object

In [12]:
# Normalise every exploded token so spelling variants of the same skill share one key:
#   1. cast to pandas' nullable string dtype,
#   2. strip leading/trailing whitespace,
#   3. collapse any run of internal whitespace (double spaces, tabs, ...) to one space,
#      so "machine  learning" and "machine learning" are not counted separately,
#   4. lowercase.
skills_exploded["skill_norm"] = (
    skills_exploded["skill"]
    .astype("string")
    .str.strip()
    .str.replace(r"\s+", " ", regex=True)
    .str.lower()
)

# Side-by-side view of raw vs. normalised tokens.
skills_exploded[["skill", "skill_norm"]].head(10)

Unnamed: 0,skill,skill_norm
0,Natural Language Processing,natural language processing
0,Phd,phd
0,Deep Learning,deep learning
0,Data Science,data science
0,Languages,languages
0,Data,data
0,Processing,processing
0,Science,science
1,Data Science,data science
1,Tensorflow,tensorflow


In [13]:
# (number of distinct normalised skills, the twenty most frequent with their counts).
(
    skills_exploded["skill_norm"].nunique(),
    skills_exploded["skill_norm"].value_counts().iloc[:20],
)

(6761,
 skill_norm
 python                2571
 machine learning      2564
 sql                   2283
 data analysis         2091
 analytical            1661
 agile                 1248
 analytics             1226
 data modeling         1221
 computer science      1087
 data                  1072
 business analysis     1043
 data engineering      1013
 project management     962
 automation             954
 data quality           944
 business analyst       917
 data science           911
 data management        782
 aws                    775
 data processing        768
 Name: count, dtype: Int64)

In [14]:
# One row per (jobId, year, normalised skill).
# A tagsAndSkills string with a doubled or trailing comma ("a,,b", "a,b,") explodes
# into a blank token; drop blank/missing tokens first so an empty string is never
# recorded as a skill, then remove repeats of the same skill within a posting.
skills_unique = (
    skills_exploded
    .loc[lambda frame: frame["skill_norm"].fillna("").str.strip().ne("")]
    .drop_duplicates(subset=["jobId", "year", "skill_norm"])
    .loc[:, ["jobId", "year", "skill_norm"]]
)

# (rows, columns) after filtering and de-duplication.
skills_unique.shape

(105320, 3)

In [15]:
# Final table: order rows by year, then posting, then skill, renumber the index
# from 0, and expose the normalised column under the plain name "skill".
skills_demand = (
    skills_unique
    .sort_values(by=["year", "jobId", "skill_norm"], ignore_index=True)
    .rename(columns={"skill_norm": "skill"})
)

# Shape plus the first five rows of the final table.
(skills_demand.shape, skills_demand.head(5))

((105320, 3),
           jobId  year              skill
 0   10615503452  2015     data scientist
 1  100915500152  2015  analytical skills
 2  100915500152  2015            bidding
 3  100915500152  2015   business analyst
 4  100915500152  2015            english)

In [16]:
# Destination for the processed table; create the directory on first run.
out_path = PROJECT_ROOT.joinpath("data", "processed", "skills_demand.parquet")
out_path.parent.mkdir(exist_ok=True, parents=True)

# Write without the index, which is only a 0..n-1 row counter.
skills_demand.to_parquet(out_path, index=False)

# Echo the path that was written.
out_path

WindowsPath('../data/processed/skills_demand.parquet')