In [116]:
import pandas as pd
import gc
import numpy as np
import re
import torch
from tqdm import tqdm
import time

from transformers import BertModel, BertTokenizer

In [2]:
postings = pd.read_csv('Linkedin_job_postings/postings.csv')

companies = pd.read_csv('Linkedin_job_postings/companies/companies.csv')
company_industries = pd.read_csv('Linkedin_job_postings/companies/company_industries.csv')
company_specialities = pd.read_csv('Linkedin_job_postings/companies/company_specialities.csv')

job_industries = pd.read_csv('Linkedin_job_postings/jobs/job_industries.csv')
job_skills = pd.read_csv('Linkedin_job_postings/jobs/job_skills.csv')

industry_mapping = pd.read_csv('Linkedin_job_postings/mappings/industries.csv')
skills_mapping = pd.read_csv('Linkedin_job_postings/mappings/skills.csv')



Optimizing postings dataframe


In [3]:
# Postings contains the information of all job openings, which will need later in this exercise
postings.head(2) 

Unnamed: 0,job_id,company_name,title,description,max_salary,pay_period,location,company_id,views,med_salary,...,expiry,closed_time,formatted_experience_level,skills_desc,listed_time,posting_domain,sponsored,work_type,currency,compensation_type
0,921716,Corcoran Sawyer Smith,Marketing Coordinator,Job descriptionA leading real estate firm in N...,20.0,HOURLY,"Princeton, NJ",2774458.0,20.0,,...,1715990000000.0,,,Requirements: \n\nWe are seeking a College or ...,1713398000000.0,,0,FULL_TIME,USD,BASE_SALARY
1,1829192,,Mental Health Therapist/Counselor,"At Aspen Therapy and Wellness , we are committ...",50.0,HOURLY,"Fort Collins, CO",,1.0,,...,1715450000000.0,,,,1712858000000.0,,0,FULL_TIME,USD,BASE_SALARY


In [4]:
# Removing irrelevant columns before merging with other dataframes
postings_df = postings[['job_id','company_name','title','description','skills_desc']]

In [5]:
postings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 123849 entries, 0 to 123848
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   job_id        123849 non-null  int64 
 1   company_name  122130 non-null  object
 2   title         123849 non-null  object
 3   description   123842 non-null  object
 4   skills_desc   2439 non-null    object
dtypes: int64(1), object(4)
memory usage: 4.7+ MB


In [6]:
postings_df.size

619245

In [7]:
postings_df.job_id = postings_df.job_id.astype('int32')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  postings_df.job_id = postings_df.job_id.astype('int32')


In [9]:
postings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 123849 entries, 0 to 123848
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   job_id        123849 non-null  int32 
 1   company_name  122130 non-null  object
 2   title         123849 non-null  object
 3   description   123842 non-null  object
 4   skills_desc   2439 non-null    object
dtypes: int32(1), object(4)
memory usage: 4.3+ MB


Optimising companies dataframe

In [10]:
companies.head(2)

Unnamed: 0,company_id,name,description,company_size,state,country,city,zip_code,address,url
0,1009,IBM,"At IBM, we do more than work. We create. We cr...",7.0,NY,US,"Armonk, New York",10504,International Business Machines Corp.,https://www.linkedin.com/company/ibm
1,1016,GE HealthCare,Every day millions of people feel the impact o...,7.0,0,US,Chicago,0,-,https://www.linkedin.com/company/gehealthcare


In [11]:
companies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24473 entries, 0 to 24472
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   company_id    24473 non-null  int64  
 1   name          24472 non-null  object 
 2   description   24176 non-null  object 
 3   company_size  21699 non-null  float64
 4   state         24451 non-null  object 
 5   country       24473 non-null  object 
 6   city          24472 non-null  object 
 7   zip_code      24445 non-null  object 
 8   address       24451 non-null  object 
 9   url           24473 non-null  object 
dtypes: float64(1), int64(1), object(8)
memory usage: 1.9+ MB


In [12]:
companies = companies[['company_id','name', 'description']]

In [13]:
companies.company_id.max()

103472979

In [14]:
companies.company_id = companies.company_id.astype('int32')

In [15]:
companies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24473 entries, 0 to 24472
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   company_id   24473 non-null  int32 
 1   name         24472 non-null  object
 2   description  24176 non-null  object
dtypes: int32(1), object(2)
memory usage: 478.1+ KB


Optimizing company_industries and company_specialities

In [16]:
company_industries.head(2)

Unnamed: 0,company_id,industry
0,391906,Book and Periodical Publishing
1,22292832,Construction


In [17]:
company_industries.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24375 entries, 0 to 24374
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   company_id  24375 non-null  int64 
 1   industry    24375 non-null  object
dtypes: int64(1), object(1)
memory usage: 381.0+ KB


In [19]:
company_industries.company_id.max()

103472979

In [20]:
company_industries.company_id = company_industries.company_id.astype('int32')

In [21]:
company_industries.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24375 entries, 0 to 24374
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   company_id  24375 non-null  int32 
 1   industry    24375 non-null  object
dtypes: int32(1), object(1)
memory usage: 285.8+ KB


Merging companies and company_industries

In [22]:
company_industries = companies.merge(company_industries,how='left',on='company_id')

In [23]:
del companies
gc.collect()

0

In [24]:
company_industries.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24483 entries, 0 to 24482
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   company_id   24483 non-null  int32 
 1   name         24482 non-null  object
 2   description  24186 non-null  object
 3   industry     24375 non-null  object
dtypes: int32(1), object(3)
memory usage: 669.6+ KB


In [25]:
company_specialities.head(2)

Unnamed: 0,company_id,speciality
0,22292832,window replacement
1,22292832,patio door replacement


In [26]:
company_specialities.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 169387 entries, 0 to 169386
Data columns (total 2 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   company_id  169387 non-null  int64 
 1   speciality  169387 non-null  object
dtypes: int64(1), object(1)
memory usage: 2.6+ MB


In [27]:
company_specialities.company_id.max()

103458790

In [28]:
company_specialities.company_id = company_specialities.company_id.astype('int32')

In [29]:
company_specialities.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 169387 entries, 0 to 169386
Data columns (total 2 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   company_id  169387 non-null  int32 
 1   speciality  169387 non-null  object
dtypes: int32(1), object(1)
memory usage: 1.9+ MB


Merging company_industries and company_specialities

In [30]:
company_merged = company_industries.merge(company_specialities, how= 'left', on='company_id')

In [31]:
del company_industries, company_specialities
gc.collect()

0

In [32]:
company_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 176210 entries, 0 to 176209
Data columns (total 5 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   company_id   176210 non-null  int32 
 1   name         176209 non-null  object
 2   description  175907 non-null  object
 3   industry     176102 non-null  object
 4   speciality   169516 non-null  object
dtypes: int32(1), object(4)
memory usage: 6.0+ MB


Optimizing job_industries and job_skills

In [33]:
job_industries.head(2)

Unnamed: 0,job_id,industry_id
0,3884428798,82
1,3887473071,48


In [34]:
job_industries.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 164808 entries, 0 to 164807
Data columns (total 2 columns):
 #   Column       Non-Null Count   Dtype
---  ------       --------------   -----
 0   job_id       164808 non-null  int64
 1   industry_id  164808 non-null  int64
dtypes: int64(2)
memory usage: 2.5 MB


In [35]:
job_industries.job_id.max(),job_industries.industry_id.max()

(3906267224, 3253)

In [36]:
job_industries.job_id = job_industries.job_id.astype('int32')
job_industries.industry_id = job_industries.industry_id.astype('int32')

In [38]:
job_skills.head(2)

Unnamed: 0,job_id,skill_abr
0,3884428798,MRKT
1,3884428798,PR


In [39]:
job_skills.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 213768 entries, 0 to 213767
Data columns (total 2 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   job_id     213768 non-null  int64 
 1   skill_abr  213768 non-null  object
dtypes: int64(1), object(1)
memory usage: 3.3+ MB


In [40]:
job_skills.job_id.max()

3906267224

In [41]:
job_skills.job_id = job_skills.job_id.astype('int32')

In [42]:
job_skills.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 213768 entries, 0 to 213767
Data columns (total 2 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   job_id     213768 non-null  int32 
 1   skill_abr  213768 non-null  object
dtypes: int32(1), object(1)
memory usage: 2.4+ MB


Merging job_industries and job_skills

In [43]:
#Include only job_ids with skill_description
job_merged = job_industries.merge(job_skills,how='inner',on='job_id')

In [44]:
del job_industries, job_skills
gc.collect()

0

In [110]:
job_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 286885 entries, 0 to 286884
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   job_id       286885 non-null  int32 
 1   industry_id  286885 non-null  int32 
 2   skill_abr    286885 non-null  string
dtypes: int32(2), string(1)
memory usage: 4.4 MB


Optimizing industry_mapping  and skills_mapping dataframes

In [45]:
industry_mapping.head(2)

Unnamed: 0,industry_id,industry_name
0,1,Defense and Space Manufacturing
1,3,Computer Hardware Manufacturing


In [46]:
industry_mapping.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 422 entries, 0 to 421
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   industry_id    422 non-null    int64 
 1   industry_name  388 non-null    object
dtypes: int64(1), object(1)
memory usage: 6.7+ KB


In [47]:
industry_mapping.describe()

Unnamed: 0,industry_id
count,422.0
mean,1342.305687
std,1212.022551
min,1.0
25%,108.25
50%,1161.5
75%,2279.5
max,3253.0


In [48]:
industry_mapping.industry_id = industry_mapping.industry_id.astype('int32')

In [49]:
industry_mapping.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 422 entries, 0 to 421
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   industry_id    422 non-null    int32 
 1   industry_name  388 non-null    object
dtypes: int32(1), object(1)
memory usage: 5.1+ KB


In [50]:
skills_mapping.head(2)

Unnamed: 0,skill_abr,skill_name
0,ART,Art/Creative
1,DSGN,Design


In [51]:
skills_mapping.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35 entries, 0 to 34
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   skill_abr   35 non-null     object
 1   skill_name  35 non-null     object
dtypes: object(2)
memory usage: 688.0+ bytes


In [52]:
job_merged = job_merged.merge(industry_mapping,how='inner',on='industry_id')

In [53]:
job_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 286885 entries, 0 to 286884
Data columns (total 4 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   job_id         286885 non-null  int32 
 1   industry_id    286885 non-null  int32 
 2   skill_abr      286885 non-null  object
 3   industry_name  286724 non-null  object
dtypes: int32(2), object(2)
memory usage: 6.6+ MB


In [54]:
postings_df = postings_df.merge(job_merged,how='left',on='job_id')


In [58]:
del job_merged
gc.collect()

0

In [56]:
postings_df = postings_df.drop(['skill_abr','industry_id'],axis=1)

In [57]:
postings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 279762 entries, 0 to 279761
Data columns (total 6 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   job_id         279762 non-null  int32 
 1   company_name   277271 non-null  object
 2   title          279762 non-null  object
 3   description    279750 non-null  object
 4   skills_desc    4504 non-null    object
 5   industry_name  276626 non-null  object
dtypes: int32(1), object(5)
memory usage: 11.7+ MB


In [130]:
gc.collect()

0

In [59]:
company_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 176210 entries, 0 to 176209
Data columns (total 5 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   company_id   176210 non-null  int32 
 1   name         176209 non-null  object
 2   description  175907 non-null  object
 3   industry     176102 non-null  object
 4   speciality   169516 non-null  object
dtypes: int32(1), object(4)
memory usage: 6.0+ MB


In [61]:
company_merged.industry = company_merged.industry.replace({'<NA>': float('nan'), pd.NA: float('nan')})
company_merged.name = company_merged.name.replace({'<NA>': float('nan'), pd.NA: float('nan')})

In [62]:
postings_df.head(10)

Unnamed: 0,job_id,company_name,title,description,skills_desc,industry_name
0,921716,Corcoran Sawyer Smith,Marketing Coordinator,Job descriptionA leading real estate firm in N...,Requirements: \n\nWe are seeking a College or ...,Real Estate
1,921716,Corcoran Sawyer Smith,Marketing Coordinator,Job descriptionA leading real estate firm in N...,Requirements: \n\nWe are seeking a College or ...,Real Estate
2,1829192,,Mental Health Therapist/Counselor,"At Aspen Therapy and Wellness , we are committ...",,
3,10998357,The National Exemplar,Assitant Restaurant Manager,The National Exemplar is accepting application...,We are currently accepting resumes for FOH - A...,Restaurants
4,10998357,The National Exemplar,Assitant Restaurant Manager,The National Exemplar is accepting application...,We are currently accepting resumes for FOH - A...,Restaurants
5,23221523,"Abrams Fensterman, LLP",Senior Elder Law / Trusts and Estates Associat...,Senior Associate Attorney - Elder Law / Trusts...,This position requires a baseline understandin...,Law Practice
6,35982263,,Service Technician,Looking for HVAC service tech with experience ...,,Facilities Services
7,91700727,Downtown Raleigh Alliance,Economic Development and Planning Intern,Job summary:The Economic Development & Plannin...,,Non-profit Organization Management
8,103254301,Raw Cereal,Producer,Company DescriptionRaw Cereal is a creative de...,,Design Services
9,103254301,Raw Cereal,Producer,Company DescriptionRaw Cereal is a creative de...,,Design Services


In [63]:

postings_df.company_name = postings_df.company_name.replace({'<NA>': float('nan'), pd.NA: float('nan')})
postings_df.industry_name = postings_df.industry_name.replace({'<NA>': float('nan'), pd.NA: float('nan')})


In [64]:
postings_df.title= postings_df.title.fillna('') 
postings_df.description = postings_df.description.fillna('') 
postings_df.skills_desc = postings_df.skills_desc.fillna('') 

In [65]:
postings_df['job_features'] = postings_df[['title', 'description', 'skills_desc']].agg('_'.join, axis=1)

In [66]:
postings_df = postings_df[['job_id','company_name','industry_name','job_features']]

In [68]:
postings_df.head()

Unnamed: 0,job_id,company_name,industry_name,job_features
0,921716,Corcoran Sawyer Smith,Real Estate,Marketing Coordinator_Job descriptionA leading...
1,921716,Corcoran Sawyer Smith,Real Estate,Marketing Coordinator_Job descriptionA leading...
2,1829192,,,Mental Health Therapist/Counselor_At Aspen The...
3,10998357,The National Exemplar,Restaurants,Assitant Restaurant Manager_The National Exemp...
4,10998357,The National Exemplar,Restaurants,Assitant Restaurant Manager_The National Exemp...


In [72]:
company_merged = company_merged.drop(['company_id'],axis=1)

In [73]:
company_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 176210 entries, 0 to 176209
Data columns (total 4 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   name         176209 non-null  object
 1   description  175907 non-null  object
 2   industry     176102 non-null  object
 3   speciality   169516 non-null  object
dtypes: object(4)
memory usage: 5.4+ MB


In [74]:
company_merged.description = company_merged.description.fillna('') 
company_merged.speciality = company_merged.speciality.fillna('') 


company_merged['company_features'] = company_merged[['description', 'speciality']].agg('_'.join, axis=1)

In [77]:
company_merged = company_merged.drop(['description', 'speciality'],axis=1)
company_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 176210 entries, 0 to 176209
Data columns (total 3 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   name              176209 non-null  object
 1   industry          176102 non-null  object
 2   company_features  176210 non-null  object
dtypes: object(3)
memory usage: 4.0+ MB


In [86]:
company_merged.name = company_merged.name.str.lower()
company_merged.industry = company_merged.industry.str.lower()

postings_df.company_name = postings_df.company_name.str.lower()
postings_df.industry_name = postings_df.industry_name.str.lower()

In [92]:
def preprocess_key(key):
    if pd.isna(key):
        return np.nan
    # Convert to string, strip spaces, convert to lower case, and remove special characters
    key = str(key).strip().lower()
    key = re.sub(r'[^a-zA-Z0-9]', '', key)  # Remove non-alphanumeric characters
    return key


company_merged.name = company_merged.name.apply(preprocess_key)
company_merged.industry = company_merged.industry.apply(preprocess_key)
postings_df.name = postings_df.name.apply(preprocess_key)
company_merged.name = company_merged.name.apply(preprocess_key)

In [93]:
import sqlite3

In [94]:
conn = sqlite3.connect('job_postings.db')

In [95]:
postings_df.to_sql('jobs', conn, index=False, if_exists='replace')
company_merged.to_sql('companies', conn, index=False, if_exists='replace')

176210

In [99]:
postings_df.info(),company_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 279762 entries, 0 to 279761
Data columns (total 4 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   job_id         279762 non-null  int32 
 1   company_name   277271 non-null  object
 2   industry_name  276626 non-null  object
 3   job_features   279762 non-null  object
dtypes: int32(1), object(3)
memory usage: 7.5+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 176210 entries, 0 to 176209
Data columns (total 3 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   name              176209 non-null  object
 1   industry          176209 non-null  object
 2   company_features  176210 non-null  object
dtypes: object(3)
memory usage: 4.0+ MB


(None, None)

In [100]:
query = '''
SELECT jobs.job_id, jobs.job_features,companies.name, companies.industry,companies.company_features
FROM jobs
LEFT JOIN companies ON jobs.industry_name = companies.industry
and jobs.company_name = companies.name
'''

In [101]:
merged_df = pd.read_sql_query(query, conn)

In [102]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 279762 entries, 0 to 279761
Data columns (total 5 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   job_id            279762 non-null  int64 
 1   job_features      279762 non-null  object
 2   name              1 non-null       object
 3   industry          1 non-null       object
 4   company_features  1 non-null       object
dtypes: int64(1), object(4)
memory usage: 10.7+ MB


In [104]:
merged_df['company_features'] = merged_df['name'] + merged_df['industry']
merged_df = merged_df.drop(['name','industry'],axis=1)

In [106]:
merged_df['features'] = merged_df['job_features'] + merged_df['company_features']
merged_df = merged_df[['job_id','features']]

In [107]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 279762 entries, 0 to 279761
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   job_id    279762 non-null  int64 
 1   features  1 non-null       object
dtypes: int64(1), object(1)
memory usage: 4.3+ MB


In [110]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.41.2-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.8/43.8 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2024.5.15-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.9/40.9 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.20,>=0.19 (from transformers)
  Using cached tokenizers-0.19.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting safetensors>=0.4.1 (from transformers)
  Using cached safetensors-0.4.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Downloading transformers-4.41.2-py3-none-any.whl (9.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[

In [112]:
model = BertModel.from_pretrained('bert-base-uncased',output_hidden_states = True,)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [113]:
def get_bert_embeddings(text):
    # Tokenize the text and get input IDs and attention masks
    inputs = tokenizer(text, return_tensors='pt', max_length=512, truncation=True, padding='max_length')
    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']
    
    # Get the hidden states from BERT model
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
    
    # Use the last hidden state as the embedding
    last_hidden_state = outputs.last_hidden_state
    embeddings = last_hidden_state.mean(dim=1)  # Mean pooling to get fixed-size representation
    
    return embeddings.squeeze().numpy()

In [120]:
merged_df['features'] = merged_df['features'].astype('str')
merged_df['features'] = merged_df['features'].apply(lambda x: get_bert_embeddings(x))

AttributeError: 'Series' object has no attribute 'itertuples'

In [None]:
sample_string = "machine learning engineer"
sample_embedding = get_bert_embeddings(sample_string)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:

def calculate_similarity(embedding1, embedding2):
    embedding1 = embedding1.reshape(1, -1)
    embedding2 = embedding2.reshape(1, -1)
    return cosine_similarity(embedding1, embedding2)[0][0]

# Calculate similarity scores
merged_df['similarity_score'] = merged_df['features'].apply(lambda x: calculate_similarity(x, sample_embedding))


In [None]:
merged_df[['job_id', 'similarity_score']].sort_values(by=['similarity_score'],ascending=False).head(10)