## 1. Load and Preprocess the Data

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score, classification_report

## Load Data


In [None]:
# Read the survey responses from the Excel workbook.
# Point `survey_path` at your own copy of the file if it lives elsewhere.
survey_path = "Ask A Manager Salary Survey 2021 (Responses).xlsx"
df = pd.read_excel(survey_path, sheet_name="Form Responses 1")

## Clean and Prepare Data

In [None]:
# Columns used for modelling (the features plus the salary the target is derived from).
selected_columns = [
    'industry',
    'job title',
    'annual salary',
    'country',
    'highest level of education completed',
    'overall years of professional experience',
]

# Salary bin edges and their names. pd.cut uses right-closed intervals, so a
# salary of exactly 0 falls outside the first bin, becomes NaN and is dropped below.
bins = [0, 50000, 100000, float('inf')]
labels = ['low', 'medium', 'high']

# Seniority/level tokens removed from job titles. The \b word boundaries make sure
# only whole words are stripped: without them every capital "I" (e.g. in "IT Support")
# and any embedded "Lead"/"Manager" substring would be deleted. Longer numerals are
# listed first so that "III" is matched as one token.
level_pattern = r'\b(?:III|II|I|Senior|Junior|Lead|Manager)\b'

# Build the cleaned frame as one chain. Every step returns a new DataFrame, so no
# column is ever assigned on a slice of the loaded data (avoids SettingWithCopyWarning)
# and the cell can be re-run safely.
df = (
    df[selected_columns]
    # Convert salary to categorical bins
    .assign(salary_category=lambda d: pd.cut(d['annual salary'], bins=bins, labels=labels))
    # Drop rows with missing values (including salaries that fell outside the bins)
    .dropna()
    # Simplify job titles: strip level tokens, collapse leftover whitespace, trim
    .assign(
        job_title_grouped=lambda d: (
            d['job title']
            .str.replace(level_pattern, '', regex=True)
            .str.replace(r'\s+', ' ', regex=True)
            .str.strip()
        )
    )
)

## 2. Feature Engineering

In [None]:
# Categorical columns fed to the model, in the order the encoder will see them.
feature_columns = ['industry', 'country', 'highest level of education completed', 'job_title_grouped']

# Initialize OneHotEncoder; categories not seen during fit are encoded as all zeros
# at transform time instead of raising.
encoder = OneHotEncoder(handle_unknown='ignore')

# Encode features
encoded_features = encoder.fit_transform(df[feature_columns])

# Human-readable names for the encoded columns, using short prefixes.
# The names are built from encoder.categories_ rather than
# encoder.get_feature_names_out([...short names...]): because the encoder was fitted on a
# DataFrame, scikit-learn >= 1.0 raises ValueError when the names passed in differ from
# the column names seen during fit.
short_prefixes = ['industry', 'country', 'education', 'job_title']
feature_names = [
    f"{prefix}_{category}"
    for prefix, categories in zip(short_prefixes, encoder.categories_)
    for category in categories
]

## 3. Split Data into Train/Test Sets

In [None]:
# Feature matrix and target vector
X, y = encoded_features, df['salary_category']

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.3,
)

## 4. Train the Model

In [None]:
# Multinomial Naive Bayes with alpha=1.0 (Laplace smoothing); fit() returns the
# fitted estimator, so construction and training are chained.
model = MultinomialNB(alpha=1.0).fit(X_train, y_train)

## 5. Evaluate Performance

In [None]:
# Predict the held-out rows, then summarise overall and per-class performance.
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("\nClassification Report:\n", report)

## 6. Visualize the Results

### Import Visualization Libraries


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

### Salary Category Distribution

In [None]:
# Bar chart of how many rows fall into each salary category
fig, ax = plt.subplots(figsize=(8, 4))
sns.countplot(data=df, x='salary_category', palette='viridis', ax=ax)
ax.set_title("Salary Category Distribution")
ax.set_xlabel("Salary Category")
ax.set_ylabel("Count")
plt.show()

### Confusion Matrix Heatmap

In [None]:
# Generate confusion matrix.
# The class order is taken from the fitted model and passed explicitly, so the matrix
# rows/columns and the tick labels are guaranteed to agree (even if some class is
# missing from the test split). A separate name is used so the `labels` list that
# defines the salary bins is not overwritten with a different ordering.
class_labels = list(model.classes_)
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

# Plot heatmap: rows are the actual class, columns the predicted class
plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

## 7. Test the Model with Sample Predictions

### Create Test Cases


In [None]:
# Sample test cases (replace with your own examples), one tuple per person.
# NOTE(review): the encoder is created with handle_unknown='ignore', so any value here
# that does not exactly match a category seen during fitting (including casing) is
# encoded as all zeros — confirm these spellings against the dataset.
test_cases = pd.DataFrame(
    [
        ('computing or tech', 'usa', "master's degree", 'software engineer'),
        ('nonprofits', 'canada', 'college degree', 'program manager'),
        ('engineering or manufacturing', 'uk', 'phd', 'research engineer'),
    ],
    columns=[
        'industry',
        'country',
        'highest level of education completed',
        'job_title_grouped',
    ],
)

### Encode Test Cases

In [None]:
# Use the same encoder from training.
# Only the columns the encoder was fitted on are passed, in the fitted order. This keeps
# the cell re-runnable after the prediction cell has added 'predicted_salary' to
# test_cases (an extra column would otherwise make transform() fail).
encoded_test = encoder.transform(test_cases[list(encoder.feature_names_in_)])

### Predict Salary Categories

In [None]:
# Run the trained classifier on the encoded samples
predictions = model.predict(encoded_test)

# Attach the predicted category next to the inputs it was produced from
test_cases = test_cases.assign(predicted_salary=predictions)

# Display results
print(test_cases)

## 8. Feature Importance Analysis


### Plot Top Influential Features

In [None]:
# Per-class log-probabilities of each encoded feature, log P(feature | class):
# one row per class, one column per one-hot feature.
feature_log_probs = model.feature_log_prob_

# Readable feature names with short prefixes, built from the encoder's learned categories.
# (Passing these short names to encoder.get_feature_names_out() raises ValueError on
# scikit-learn >= 1.0 because they differ from the column names seen during fit.)
short_prefixes = ['industry', 'country', 'education', 'job_title']
feature_names = [
    f"{prefix}_{category}"
    for prefix, categories in zip(short_prefixes, encoder.categories_)
    for category in categories
]

# Rows of feature_log_prob_ follow model.classes_, which scikit-learn keeps in sorted
# order (for these labels: high, low, medium) — NOT low/medium/high. Look each row up
# by its label instead of assuming a position.
class_row = {label: row for row, label in enumerate(model.classes_)}

# Create a DataFrame for visualization
importance_df = pd.DataFrame({
    'feature': feature_names,
    'high_salary_prob': feature_log_probs[class_row['high']],
    'medium_salary_prob': feature_log_probs[class_row['medium']],
    'low_salary_prob': feature_log_probs[class_row['low']],
})

# Top 10 features for "high" salary.
# Note: this ranks features by how likely they are within the high class, so values
# that are simply common overall will also rank highly.
top_high = importance_df.nlargest(10, 'high_salary_prob')

# Plot
plt.figure(figsize=(10, 6))
sns.barplot(data=top_high, x='high_salary_prob', y='feature', palette='rocket')
plt.title("Top Features Predicting High Salary")
plt.xlabel("Log Probability (High Salary)")
plt.ylabel("Feature")
plt.show()