In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score


In [4]:
# Path of the salaries CSV inside the Kaggle input directory.
DATA_PATH = '/kaggle/input/data-science-job-salaries/ds_salaries.csv'

# Read the whole file into a DataFrame and preview the first rows.
# The file carries an unnamed leading column, which pandas labels 'Unnamed: 0'.
df = pd.read_csv(DATA_PATH)
df.head()


Unnamed: 0.1,Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,0,2020,MI,FT,Data Scientist,70000,EUR,79833,DE,0,DE,L
1,1,2020,SE,FT,Machine Learning Scientist,260000,USD,260000,JP,0,JP,S
2,2,2020,SE,FT,Big Data Engineer,85000,GBP,109024,GB,50,GB,M
3,3,2020,MI,FT,Product Data Analyst,20000,USD,20000,HN,0,HN,S
4,4,2020,SE,FT,Machine Learning Engineer,150000,USD,150000,US,50,US,L


In [6]:
# Count the missing values in each column.
df.isna().sum()


Unnamed: 0            0
work_year             0
experience_level      0
employment_type       0
job_title             0
salary                0
salary_currency       0
salary_in_usd         0
employee_residence    0
remote_ratio          0
company_location      0
company_size          0
dtype: int64

In [9]:
# Turn the categorical columns into integer codes. LabelEncoder is used for
# simplicity; one fitted encoder is kept per column so the same mapping can be
# reused on new inputs later.
# NOTE(review): LabelEncoder numbers classes in sorted order, so the codes for
# experience_level do not follow seniority.
le_exp, le_job, le_size, le_loc = (LabelEncoder() for _ in range(4))

# (source column, encoded column, encoder) for every categorical feature.
encoding_plan = (
    ('experience_level', 'experience_encoded', le_exp),
    ('job_title', 'job_title_encoded', le_job),
    ('company_size', 'company_size_encoded', le_size),
    ('company_location', 'company_location_encoded', le_loc),
)

for raw_column, encoded_column, encoder in encoding_plan:
    df[encoded_column] = encoder.fit_transform(df[raw_column])


In [10]:
# Model inputs are the four encoded columns; the target is the salary in USD.
feature_columns = [
    'experience_encoded',
    'job_title_encoded',
    'company_size_encoded',
    'company_location_encoded',
]
X = df[feature_columns]
y = df['salary_in_usd']


In [12]:
# Split the data into training and test sets: 20% of the rows are held out,
# and the fixed seed keeps the split reproducible.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts



In [13]:
# Train a random forest model (100 trees, fixed seed) on the training split.
forest_params = {'n_estimators': 100, 'random_state': 42}
model = RandomForestRegressor(**forest_params)
model.fit(X_train, y_train)


In [14]:
# Predict on the held-out rows and report the error metrics.
y_pred = model.predict(X_test)

test_mse = mean_squared_error(y_test, y_pred)
test_r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", test_mse)
print("R2 Score:", test_r2)


Mean Squared Error: 2015279847.5208519
R2 Score: 0.4741700006782966


In [24]:
def predict_salary_from_input():
    """Prompt for job attributes and print the model's salary estimate.

    Reads experience level, job title, company size and company location with
    ``input()``, encodes each value with the matching fitted encoder
    (``le_exp``, ``le_job``, ``le_size``, ``le_loc``) and passes the resulting
    one-row frame to ``model.predict``. The prediction is printed rounded to
    two decimals.

    Experience level, company size and company location are upper-cased before
    encoding. Job titles are matched exactly first and then case-insensitively,
    so "data scientist" resolves to the encoder's own spelling of that title.

    A ``ValueError`` (for example a value the encoders were not fitted on) is
    caught and reported on stdout instead of being raised.

    Returns:
        None. All results are printed.
    """
    # Take input from user
    exp_input = input("Enter Experience Level (e.g., EN, MI, SE, EX): ").strip().upper()
    job_input = input("Enter Job Title (e.g., Data Scientist, ML Engineer): ").strip()
    size_input = input("Enter Company Size (S, M, L): ").strip().upper()
    loc_input = input("Enter Company Location (e.g., US, DE, IN): ").strip().upper()

    # Job titles are mixed-case, so they cannot simply be upper-cased like the
    # other fields. Keep an exact match as is; otherwise fall back to a
    # case-insensitive lookup that yields the encoder's canonical spelling.
    known_titles = list(le_job.classes_)
    if job_input not in known_titles:
        titles_by_lower = {str(title).lower(): title for title in known_titles}
        job_input = titles_by_lower.get(job_input.lower(), job_input)

    # Transform inputs
    try:
        input_data = pd.DataFrame([{
            'experience_encoded': le_exp.transform([exp_input])[0],
            'job_title_encoded': le_job.transform([job_input])[0],
            'company_size_encoded': le_size.transform([size_input])[0],
            'company_location_encoded': le_loc.transform([loc_input])[0]
        }])

        # Predict salary
        predicted_salary = model.predict(input_data)
        print("\n Predicted Salary in USD:", round(predicted_salary[0], 2))

    except ValueError as e:
        print("Invalid input. Please ensure values match the dataset.")
        print("Error:", e)


In [28]:
# Run the interactive prompt once: it reads four values from stdin and prints
# the predicted salary (or an "Invalid input" message).
predict_salary_from_input()


Enter Experience Level (e.g., EN, MI, SE, EX):  MI
Enter Job Title (e.g., Data Scientist, ML Engineer):  ML Engineer 
Enter Company Size (S, M, L):  S
Enter Company Location (e.g., US, DE, IN):  US



 Predicted Salary in USD: 91558.88


In [30]:
import joblib

# Persist the trained regressor.
joblib.dump(model, 'salary_model.pkl')

# The model was trained on the integer codes produced by the fitted
# LabelEncoders, so it cannot score raw categories without them. Store the
# encoders next to the model, keyed by the source column each one encodes.
joblib.dump(
    {
        'experience_level': le_exp,
        'job_title': le_job,
        'company_size': le_size,
        'company_location': le_loc,
    },
    'salary_encoders.pkl',
)
print("Model saved!")


Model saved!
