In [1]:
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error

In [2]:
!pip install pycountry

Defaulting to user installation because normal site-packages is not writeable



[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: C:\Python313\python.exe -m pip install --upgrade pip


In [3]:
datadf = pd.read_csv("ai-jobs-net-salaries.csv")


In [4]:
datadf.head(10)

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2025,MI,FT,Research Engineer,55000,EUR,57894,BE,50,BE,L
1,2025,EN,FT,Data Analyst,147000,USD,147000,US,0,US,M
2,2025,EN,FT,Data Analyst,60900,USD,60900,US,0,US,M
3,2025,SE,FT,Software Engineer,303500,USD,303500,US,0,US,M
4,2025,SE,FT,Software Engineer,159800,USD,159800,US,0,US,M
5,2025,SE,FT,Developer,135000,USD,135000,US,0,US,M
6,2025,SE,FT,Developer,95000,USD,95000,US,0,US,M
7,2025,SE,FT,Engineer,300000,USD,300000,US,0,US,M
8,2025,SE,FT,Engineer,70000,USD,70000,US,0,US,M
9,2025,MI,FT,Machine Learning Engineer,355000,USD,355000,US,0,US,M


In [5]:
datadf.columns

Index(['work_year', 'experience_level', 'employment_type', 'job_title',
       'salary', 'salary_currency', 'salary_in_usd', 'employee_residence',
       'remote_ratio', 'company_location', 'company_size'],
      dtype='object')

In [6]:
#replacing the values to expand for better understanding
datadf['experience_level'] = datadf['experience_level'].replace({
    'EN': 'Entry-level',
    'MI': 'Mid-level',
    'SE': 'Senior-level',
    'EX': 'Executive-level'
})

In [7]:
datadf['experience_level'].unique()

array(['Mid-level', 'Entry-level', 'Senior-level', 'Executive-level'],
      dtype=object)

In [8]:

datadf['employment_type'].unique()


array(['FT', 'CT', 'PT', 'FL'], dtype=object)

In [9]:
#replacing the values to expand for better understanding
datadf['employment_type'] = datadf['employment_type'].replace({
    'FT': 'Full-Time',
    'CT': 'Contract',
    'PT': 'Part-Time',
    'FL': 'Freelance'
})

In [10]:
datadf['employment_type'].unique()


array(['Full-Time', 'Contract', 'Part-Time', 'Freelance'], dtype=object)

In [11]:
datadf['experience_level'].unique()

array(['Mid-level', 'Entry-level', 'Senior-level', 'Executive-level'],
      dtype=object)

In [12]:

datadf['remote_ratio'].unique()

array([ 50,   0, 100], dtype=int64)

In [13]:
datadf = datadf.rename(columns={'remote_ratio': 'work_setting'})


In [14]:
#maping the work setting labels
datadf['work_setting'] = datadf['work_setting'].map({
    0: 'In-person',
    50: 'Hybrid',
    100: 'Remote'
})

In [15]:
datadf['company_location'].unique()


array(['BE', 'US', 'CA', 'GB', 'ZA', 'AT', 'PL', 'NO', 'FI', 'LT', 'DE',
       'NL', 'PT', 'EE', 'AU', 'IE', 'SK', 'PH', 'FR', 'EG', 'MX', 'JO',
       'ES', 'BR', 'CY', 'LV', 'NZ', 'AR', 'CO', 'UA', 'CH', 'TH', 'JM',
       'JP', 'MT', 'MK', 'SI', 'HK', 'LS', 'IN', 'PE', 'SG', 'IT', 'HU',
       'RO', 'PA', 'LU', 'DZ', 'CL', 'GR', 'KE', 'CD', 'SE', 'KR', 'TW',
       'CZ', 'TR', 'DK', 'AE', 'BG', 'ID', 'RS', 'PR', 'SV', 'EC', 'DO',
       'MY', 'XK', 'CR', 'ZM', 'AM', 'IL', 'LB', 'NG', 'HR', 'PK', 'HN',
       'VE', 'AS', 'SA', 'OM', 'BA', 'VN', 'GI', 'MU', 'RU', 'QA', 'GH',
       'AD', 'CF', 'IR', 'BS', 'IQ', 'CN', 'MD'], dtype=object)

In [26]:
import pycountry

#creating a function to capture full country names from the 2 letter 

def get_countryNames(code):
    try:
        return pycountry.countries.get(alpha_2=code).name
    except:
        return code


cleanup_mapping = {
    'USA': 'United States of America',
    'UK': 'United Kingdom',
    'Venezuela, Bolivarian Republic of': 'Venezuela',
    'Iran, Islamic Republic of': 'Iran',
    'Bolivia, Plurinational State of': 'Bolivia',
    'Korea, Republic of': 'South Korea',
    'Russian Federation': 'Russia',
    'Taiwan, Province of China': 'Taiwan',
    'United States': 'United States of America',
    'United Kingdom': 'United Kingdom',
    'Viet Nam': 'Vietnam',
    'Syrian Arab Republic': 'Syria',
    'Czechia': 'Czech Republic',
    'XX': 'Unknown'
    # Add more as needed
}


datadf['company_location'] = datadf['company_location'].apply(get_countryNames)

datadf['company_location'] = datadf['company_location'].replace(cleanup_mapping)

In [27]:
datadf['company_location'].unique()

array(['Belgium', 'United States of America', 'Canada', 'United Kingdom',
       'South Africa', 'Austria', 'Poland', 'Norway', 'Finland',
       'Lithuania', 'Germany', 'Netherlands', 'Portugal', 'Estonia',
       'Australia', 'Ireland', 'Slovakia', 'Philippines', 'France',
       'Egypt', 'Mexico', 'Jordan', 'Spain', 'Brazil', 'Cyprus', 'Latvia',
       'New Zealand', 'Argentina', 'Colombia', 'Ukraine', 'Switzerland',
       'Thailand', 'Jamaica', 'Japan', 'Malta', 'North Macedonia',
       'Slovenia', 'Hong Kong', 'Lesotho', 'India', 'Peru', 'Singapore',
       'Italy', 'Hungary', 'Romania', 'Panama', 'Luxembourg', 'Algeria',
       'Chile', 'Greece', 'Kenya',
       'Congo, The Democratic Republic of the', 'Sweden', 'South Korea',
       'Taiwan', 'Czech Republic', 'Türkiye', 'Denmark',
       'United Arab Emirates', 'Bulgaria', 'Indonesia', 'Serbia',
       'Puerto Rico', 'El Salvador', 'Ecuador', 'Dominican Republic',
       'Malaysia', 'XK', 'Costa Rica', 'Zambia', 'Armenia', 'I

In [28]:
datadf['employee_residence'].unique()

array(['Belgium', 'USA', 'Canada', 'UK', 'South Africa', 'Austria',
       'Poland', 'Norway', 'Finland', 'Lithuania', 'Germany',
       'Netherlands', 'Portugal', 'Estonia', 'Australia', 'Ireland',
       'Slovakia', 'Philippines', 'France', 'Egypt', 'Mexico', 'Spain',
       'Jordan', 'Brazil', 'Cyprus', 'Latvia', 'New Zealand', 'Argentina',
       'Colombia', 'Ukraine', 'Switzerland', 'Thailand', 'Jamaica',
       'Japan', 'Malta', 'Italy', 'North Macedonia', 'Slovenia',
       'Hong Kong', 'Lesotho', 'India', 'Indonesia', 'Peru', 'Singapore',
       'Hungary', 'Romania', 'Panama', 'Luxembourg', 'Algeria', 'Chile',
       'Greece', 'Kenya', 'Congo, The Democratic Republic of the',
       'Sweden', 'South Korea', 'Taiwan', 'Czech Republic', 'Türkiye',
       'Nigeria', 'Denmark', 'United Arab Emirates', 'Bulgaria', 'Serbia',
       'Puerto Rico', 'El Salvador', 'Ecuador', 'Dominican Republic',
       'Malaysia', 'XK', 'Costa Rica', 'Zambia', 'Armenia', 'Rwanda',
       'Israel', 'Leb

In [29]:
datadf['employee_residence'] = datadf['employee_residence'].apply(get_countryNames)
datadf['employee_residence'] = datadf['employee_residence'].replace(cleanup_mapping)

In [30]:
datadf['employee_residence'].unique()


array(['Belgium', 'United States of America', 'Canada', 'United Kingdom',
       'South Africa', 'Austria', 'Poland', 'Norway', 'Finland',
       'Lithuania', 'Germany', 'Netherlands', 'Portugal', 'Estonia',
       'Australia', 'Ireland', 'Slovakia', 'Philippines', 'France',
       'Egypt', 'Mexico', 'Spain', 'Jordan', 'Brazil', 'Cyprus', 'Latvia',
       'New Zealand', 'Argentina', 'Colombia', 'Ukraine', 'Switzerland',
       'Thailand', 'Jamaica', 'Japan', 'Malta', 'Italy',
       'North Macedonia', 'Slovenia', 'Hong Kong', 'Lesotho', 'India',
       'Indonesia', 'Peru', 'Singapore', 'Hungary', 'Romania', 'Panama',
       'Luxembourg', 'Algeria', 'Chile', 'Greece', 'Kenya',
       'Congo, The Democratic Republic of the', 'Sweden', 'South Korea',
       'Taiwan', 'Czech Republic', 'Türkiye', 'Nigeria', 'Denmark',
       'United Arab Emirates', 'Bulgaria', 'Serbia', 'Puerto Rico',
       'El Salvador', 'Ecuador', 'Dominican Republic', 'Malaysia', 'XK',
       'Costa Rica', 'Zambia', 'A

In [32]:
datadf['company_size'].unique()

array(['L', 'M', 'S'], dtype=object)

In [33]:
#replacing the values to expand for better understanding
datadf['company_size'] = datadf['company_size'].replace({
    'L': 'Large',
    'M': 'Medium',
    'S': 'Small'
})

In [34]:
datadf.head()

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,work_setting,company_location,company_size
0,2025,Mid-level,Full-Time,Research Engineer,55000,EUR,57894,Belgium,Hybrid,Belgium,Large
1,2025,Entry-level,Full-Time,Data Analyst,147000,USD,147000,United States of America,In-person,United States of America,Medium
2,2025,Entry-level,Full-Time,Data Analyst,60900,USD,60900,United States of America,In-person,United States of America,Medium
3,2025,Senior-level,Full-Time,Software Engineer,303500,USD,303500,United States of America,In-person,United States of America,Medium
4,2025,Senior-level,Full-Time,Software Engineer,159800,USD,159800,United States of America,In-person,United States of America,Medium
