# Python Libraries

In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the job postings dataset from a CSV file in the working directory.
job_data = pd.read_csv("job_recommendation_dataset.csv")

In [3]:
# Preview the first five rows.
job_data.head(5)

Unnamed: 0,Job Title,Company,Location,Experience Level,Salary,Industry,Required Skills
0,Early years teacher,Richardson Ltd,Sydney,Senior Level,87000.0,Healthcare,Pharmaceuticals
1,Counselling psychologist,"Ramos, Santiago and Stewart",San Francisco,Mid Level,50000.0,Marketing,"Google Ads, SEO, Content Writing"
2,Radio broadcast assistant,Franco Group,New York,Mid Level,77000.0,Healthcare,"Patient Care, Nursing, Medical Research, Pharm..."
3,"Designer, exhibition/display",Collins Inc,Berlin,Senior Level,90000.0,Software,Machine Learning
4,"Psychotherapist, dance movement",Barker Group,Sydney,Entry Level,112000.0,Healthcare,"Nursing, Medical Research, Pharmaceuticals"


In [4]:
# Spot-check one randomly drawn row (no seed is passed, so the row differs between runs).
job_data.sample(1)

Unnamed: 0,Job Title,Company,Location,Experience Level,Salary,Industry,Required Skills
897,Volunteer coordinator,Smith-Martinez,Sydney,Senior Level,42000.0,Software,"AWS, React, C++, Python"


# Basic Checks

In [5]:
# Column names, non-null counts and dtypes of the raw frame.
job_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Job Title         50000 non-null  object 
 1   Company           50000 non-null  object 
 2   Location          50000 non-null  object 
 3   Experience Level  50000 non-null  object 
 4   Salary            50000 non-null  float64
 5   Industry          50000 non-null  object 
 6   Required Skills   50000 non-null  object 
dtypes: float64(1), object(6)
memory usage: 2.7+ MB


In [6]:
# Summary statistics for the numeric columns.
job_data.describe()

Unnamed: 0,Salary
count,50000.0
mean,95145.1
std,31782.635648
min,40000.0
25%,68000.0
50%,95000.0
75%,123000.0
max,150000.0


In [7]:
# Number of missing values per column.
job_data.isna().sum()

Job Title           0
Company             0
Location            0
Experience Level    0
Salary              0
Industry            0
Required Skills     0
dtype: int64

In [8]:
# Inspect the raw job titles before any cleaning.
job_data['Job Title']

0                             Early years teacher
1                        Counselling psychologist
2                       Radio broadcast assistant
3                    Designer, exhibition/display
4                 Psychotherapist, dance movement
                           ...                   
49995                          Veterinary surgeon
49996                         Mental health nurse
49997                        Field trials officer
49998    Geographical information systems officer
49999                      Psychotherapist, child
Name: Job Title, Length: 50000, dtype: object

In [9]:
# Row count per industry.
job_data['Industry'].value_counts()

Industry
Software         7302
Manufacturing    7169
Marketing        7158
Education        7144
Retail           7106
Healthcare       7104
Finance          7017
Name: count, dtype: int64

In [10]:
# Row count per experience level.
job_data['Experience Level'].value_counts()

Experience Level
Mid Level       16739
Senior Level    16658
Entry Level     16603
Name: count, dtype: int64

In [11]:
# Frequency of each distinct skills string. The whole string is counted as-is,
# so the same skills listed in a different order are counted as separate values.
job_data['Required Skills'].value_counts()

Required Skills
Python                                                679
Sales, Merchandising, Customer Service                640
Merchandising, Sales, Customer Service                637
Supply Chain, Production Planning, Quality Control    619
Quality Control, Supply Chain, Production Planning    618
                                                     ... 
React, C++, AWS, Java                                   1
Java, AWS, Machine Learning, React                      1
React, C++, Java, Machine Learning                      1
Python, Java, React, AWS                                1
Machine Learning, Python, C++, AWS                      1
Name: count, Length: 1559, dtype: int64

In [12]:
# Row count per company name.
job_data['Company'].value_counts()

Company
Smith PLC                        70
Smith Inc                        69
Smith and Sons                   65
Smith Group                      64
Smith Ltd                        54
                                 ..
Sampson Ltd                       1
Garcia, Frederick and Chase       1
Schultz, Hutchinson and Scott     1
Thompson, Jarvis and Campbell     1
Reilly, Anderson and Marsh        1
Name: count, Length: 37022, dtype: int64

In [13]:
# Row count per location.
job_data['Location'].value_counts()

Location
Toronto          7229
London           7223
New York         7167
Sydney           7161
San Francisco    7120
Bangalore        7052
Berlin           7048
Name: count, dtype: int64

# Preprocessing Data

In [14]:
import re

In [15]:
# Replace every character that is neither a word character nor whitespace with "_".
# NOTE(review): test_space() performs the same substitution and is applied to
# "Job Title" further down, so this step looks redundant — confirm before removing.
job_data["Job Title"] = job_data["Job Title"].str.replace(r'[^\w\s]', "_", regex=True)

In [16]:
def test_space(text):
    text = str(text).strip()
    text = re.sub(r"[^\w\s]", "_", text)
    text = re.sub(r"\s+", "_", text)
    return text

In [17]:
# Normalise every free-text column with the same helper. One loop replaces
# four copy-pasted cells, so the list below is the single place to edit when
# a column is added or removed.
for text_column in ['Company', 'Location', 'Job Title', 'Required Skills']:
    job_data[text_column] = job_data[text_column].apply(test_space)

In [21]:
# Check the text columns after cleaning.
job_data.head()

Unnamed: 0,Job Title,Company,Location,Experience Level,Salary,Industry,Required Skills
0,Early_years_teacher,Richardson_Ltd,Sydney,Senior Level,87000.0,Healthcare,Pharmaceuticals
1,Counselling_psychologist,Ramos__Santiago_and_Stewart,San_Francisco,Mid Level,50000.0,Marketing,Google_Ads__SEO__Content_Writing
2,Radio_broadcast_assistant,Franco_Group,New_York,Mid Level,77000.0,Healthcare,Patient_Care__Nursing__Medical_Research__Pharm...
3,Designer__exhibition_display,Collins_Inc,Berlin,Senior Level,90000.0,Software,Machine_Learning
4,Psychotherapist__dance_movement,Barker_Group,Sydney,Entry Level,112000.0,Healthcare,Nursing__Medical_Research__Pharmaceuticals


In [22]:
# Re-check column dtypes and non-null counts after cleaning.
job_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Job Title         50000 non-null  object 
 1   Company           50000 non-null  object 
 2   Location          50000 non-null  object 
 3   Experience Level  50000 non-null  object 
 4   Salary            50000 non-null  float64
 5   Industry          50000 non-null  object 
 6   Required Skills   50000 non-null  object 
dtypes: float64(1), object(6)
memory usage: 2.7+ MB


# Train/Test Split

In [23]:
from sklearn.model_selection import train_test_split

# Salary is the target; every other column is a feature. 20% of the rows are
# held out for evaluation. A fixed random_state keeps the split identical
# across kernel restarts, so results computed from it are reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    job_data.drop(columns=['Salary']),
    job_data['Salary'],
    test_size=0.2,
    random_state=42,
)

In [24]:
# Inspect the training features.
X_train

Unnamed: 0,Job Title,Company,Location,Experience Level,Industry,Required Skills
23086,Scientist__clinical__histocompatibility_and_im...,Fry_Porter,Bangalore,Senior Level,Marketing,SEO
7719,Metallurgist,Hall_LLC,New_York,Mid Level,Education,EdTech__Curriculum_Design__Research
25795,Loss_adjuster__chartered,Roy_and_Sons,New_York,Entry Level,Finance,SQL__Risk_Analysis
7953,Clinical_scientist__histocompatibility_and_imm...,Simmons_Morgan,New_York,Entry Level,Software,React
8178,Facilities_manager,Silva_Ltd,London,Entry Level,Finance,Financial_Modeling__SQL__Python__Excel
...,...,...,...,...,...,...
4703,Education_officer__environmental,Barrett_LLC,Berlin,Entry Level,Education,EdTech__Teaching
43759,Astronomer,Acevedo_Inc,Sydney,Mid Level,Healthcare,Nursing
25869,Teacher__English_as_a_foreign_language,Heath_Willis,Berlin,Senior Level,Education,EdTech__Curriculum_Design
11913,Health_and_safety_adviser,Freeman__Thompson_and_Lee,Toronto,Senior Level,Manufacturing,Supply_Chain__Quality_Control__Production_Plan...


In [25]:
# Inspect the training target (Salary).
y_train

23086     62000.0
7719     109000.0
25795    132000.0
7953     145000.0
8178      75000.0
           ...   
4703      91000.0
43759    126000.0
25869     48000.0
11913    142000.0
8671      64000.0
Name: Salary, Length: 40000, dtype: float64

# Encoding

In [34]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OrdinalEncoder

# Job Title

In [27]:
# TODO: define trf1 — presumably the encoder for the Job Title column (not implemented yet).

# Company

In [28]:
# TODO: define trf2 — presumably the encoder for the Company column (not implemented yet).

# One-hot encoding for Location and Industry

In [29]:
# One-hot encode the columns at positions 2 and 4, which are Location and
# Industry in X_train's column order; all other columns pass through unchanged.
# handle_unknown='ignore' encodes categories unseen during fit as all zeros
# instead of raising at transform time.
# NOTE(review): the positional indices are only valid while the input keeps
# X_train's column layout — confirm if this step is placed after another
# transformer that reorders columns.
trf3 = ColumnTransformer([
    ('ohe_location_industry', OneHotEncoder(sparse_output=True, handle_unknown='ignore'), [2, 4])
], remainder='passthrough')

# Ordinal encoding for Experience Level

In [35]:
# Encode Experience Level as ordered integers using the explicit order
# Entry Level < Mid Level < Senior Level; every other column passes through.
trf4 = ColumnTransformer(
    transformers=[
        (
            'tnf2',
            OrdinalEncoder(categories=[['Entry Level', 'Mid Level', 'Senior Level']]),
            ['Experience Level'],
        ),
    ],
    remainder='passthrough',
)

# Required Skills

In [None]:
# TODO: define trf5 — presumably the encoder for the Required Skills column (not implemented yet).