## 1. Load and Preprocess the Data

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score, classification_report

## Load Data


In [None]:
# Read the "Form Responses 1" worksheet of the survey workbook into a DataFrame.
# Point `survey_workbook` at wherever your copy of the file lives.
survey_workbook = "Ask A Manager Salary Survey 2021 (Responses).xlsx"
df = pd.read_excel(survey_workbook, sheet_name="Form Responses 1")

## Clean and Prepare Data

In [None]:
# Select relevant columns. The explicit .copy() makes `df` an independent frame,
# so the column assignments below never write into a slice of the raw data
# (this is what triggers pandas' SettingWithCopyWarning).
df = df[['industry', 'job title', 'annual salary', 'country', 'highest level of education completed', 'overall years of professional experience']].copy()

# Convert salary to categorical bins. pd.cut intervals are right-closed, so:
#   (0, 50000]      -> 'low'
#   (50000, 100000] -> 'medium'
#   (100000, inf]   -> 'high'
# A salary of 0 or below falls outside every bin and becomes NaN.
# NOTE(review): salaries are binned as raw numbers; if respondents from
# different countries report in different currencies, the bins are not
# comparable across rows -- confirm against the data.
bins = [0, 50000, 100000, float('inf')]
labels = ['low', 'medium', 'high']
df['salary_category'] = pd.cut(df['annual salary'], bins=bins, labels=labels)

# Drop rows with a missing value in any selected column; this also removes the
# rows whose salary fell outside the bins above.
df = df.dropna()

# Simplify job titles (example grouping): strip seniority/level markers so that
# e.g. "Senior Software Engineer II" and "Software Engineer" share one group.
#  * \b...\b restricts the match to whole words; without it the bare "I"
#    alternative deletes every capital I ("IT Support" -> "T Support") and
#    "Lead" eats the start of "Leader".
#  * Longer numerals are listed first purely for readability; with word
#    boundaries the alternation order no longer affects the result.
#  * Removing a word from the middle of a title leaves a double space, so runs
#    of whitespace are collapsed to keep otherwise-equal titles in one group.
level_marker_pattern = r'\b(?:III|II|I|Senior|Junior|Lead|Manager)\b'
df['job_title_grouped'] = (
    df['job title']
    .str.replace(level_marker_pattern, '', regex=True)
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)

## 2. Feature Engineering

In [None]:
# Initialize OneHotEncoder. handle_unknown='ignore' means a category that was
# not seen during fit is encoded as all zeros at transform time instead of
# raising an error.
encoder = OneHotEncoder(handle_unknown='ignore')

# Encode the categorical features as one-hot columns.
# NOTE(review): the encoder is fitted on the full frame, before the train/test
# split below, so categories that occur only in the test rows still get their
# own column. Fit on the training rows only if strict separation is required.
categorical_columns = ['industry', 'country', 'highest level of education completed', 'job_title_grouped']
encoded_features = encoder.fit_transform(df[categorical_columns])

# The encoder was fitted on a DataFrame, so it already recorded the column
# names. Passing a different list of names to get_feature_names_out() raises
# "ValueError: input_features is not equal to feature_names_in_", so call it
# without arguments. The resulting names are "<column name>_<category>".
feature_names = encoder.get_feature_names_out()

## 3. Split Data into Train/Test Sets

In [None]:
# Features are the one-hot matrix; the target is the binned salary label.
X, y = encoded_features, df['salary_category']

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

## 4. Train the Model

In [None]:
# Multinomial Naive Bayes with alpha=1.0 (Laplace smoothing), fitted on the
# training split. fit() returns the estimator itself, so the calls are chained.
model = MultinomialNB(alpha=1.0).fit(X_train, y_train)

## 5. Evaluate Performance

In [None]:
# Predict salary categories for the held-out rows.
y_pred = model.predict(X_test)

# Overall fraction of correctly classified rows.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Per-class precision, recall, F1 and support.
report = classification_report(y_test, y_pred)
print("\nClassification Report:\n", report)