In [4]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import pickle

# Columns that are integer-encoded below; declared once so the NaN filter and
# the encoding loop cannot drift apart.
categorical_cols = [
    "Job_Title",
    "Industry",
    "Company_Size",
    "Location",
    "AI_Adoption_Level",
    "Automation_Risk",
    "Required_Skills",
    "Remote_Friendly",
    "Job_Growth_Projection",
]

# Read the raw dataset.
df = pd.read_csv("/content/ai_job_market_insights.csv")

# Discard every row that lacks the target (Salary_USD) or any of the
# categorical columns listed above.
df = df.dropna(subset=["Salary_USD", *categorical_cols])

# One LabelEncoder per categorical column. Each column is replaced by its
# integer codes, and the fitted encoder is kept so the same mapping can be
# applied to new inputs later.
label_encoders = {name: LabelEncoder() for name in categorical_cols}
for name, encoder in label_encoders.items():
    df[name] = encoder.fit_transform(df[name])

# Target is the salary; everything else in the frame is used as a feature.
y = df["Salary_USD"]
X = df.drop(columns="Salary_USD")

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Fit the random-forest regressor on the training portion.
model = RandomForestRegressor(random_state=0).fit(X_train, y_train)

# Bundle the model together with its encoders and persist both in one pickle.
data = dict(model=model, label_encoders=label_encoders)

with open("trained_model.pkl", "wb") as fh:
    pickle.dump(data, fh)

print("Model and label encoders saved successfully.")

# Restore the bundle written above. pickle executes arbitrary code while
# loading, so only open files that come from a trusted source.
with open("trained_model.pkl", "rb") as fh:
    loaded_data = pickle.load(fh)

# Unpack the two entries stored in the bundle.
loaded_model, loaded_label_encoders = (
    loaded_data[key] for key in ("model", "label_encoders")
)

# Example of making a prediction with the loaded model
def preprocess_input(example_data, encoders):
    """Integer-encode the categorical columns of ``example_data`` in place.

    For every column of ``example_data`` that has an entry in ``encoders``,
    each value is replaced by its position in that encoder's ``classes_``
    array (for a fitted ``LabelEncoder`` this is the code ``transform``
    returns). Values that are not among ``classes_`` are mapped to ``-1``
    instead of raising. Columns without an encoder are left untouched.

    Args:
        example_data: DataFrame whose columns are to be encoded. It is
            modified in place.
        encoders: Mapping from column name to a fitted encoder exposing a
            ``classes_`` sequence (e.g. ``sklearn.preprocessing.LabelEncoder``).

    Returns:
        The same ``example_data`` object, with the encoded columns.
    """
    for col in example_data.columns:
        if col not in encoders:
            continue
        # Build the label -> code table once per column. This replaces a
        # linear scan of ``classes_`` plus a separate ``transform`` call for
        # every single cell with one dictionary lookup per cell.
        code_of = {
            label: code for code, label in enumerate(encoders[col].classes_)
        }
        # Unseen labels fall back to -1 so prediction can still proceed.
        example_data[col] = [
            code_of.get(value, -1) for value in example_data[col]
        ]
    return example_data

# A single hand-written job posting used to smoke-test the reloaded model.
example_record = {
    "Job_Title": "Data Scientist",
    "Industry": "Tech",
    "Company_Size": "Large",
    "Location": "New York",
    "AI_Adoption_Level": "High",
    "Automation_Risk": "Low",
    "Required_Skills": "Python, Machine Learning",
    "Remote_Friendly": "Yes",
    "Job_Growth_Projection": "High",
}
example_data = pd.DataFrame([example_record])

# Encode the categorical values with the reloaded encoders, then cast the
# whole frame to float before handing it to the model.
example_data = preprocess_input(example_data, loaded_label_encoders).astype(float)

# Predict with the reloaded model and report the single result.
prediction = loaded_model.predict(example_data)
print(f"Predicted Salary: ${prediction[0]:,.2f}")


Model and label encoders saved successfully.
Predicted Salary: $95,134.75
