# In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Location of the salary CSV, then read it into a DataFrame
file_path = 'data_engineer_salary_2024.csv'  # Update with your actual file name
data = pd.read_csv(filepath_or_buffer=file_path)

# Preview the leading rows of the loaded table
data.head(n=5)

# Data Engineer Salary Analysis in 2024
"""
This notebook analyzes the salary data for entry-level data engineering positions in the US for 2024. 
The primary goals are to identify keywords for job searches and determine the median salary for various job titles.
"""

# Report column dtypes and non-null counts
data.info()

# Show summary statistics for the dataset
data.describe()

# Build one boolean mask per criterion, then keep the rows matching all three:
# entry-level ('EN') records from 2024 whose employee resides in the US
is_entry_level = data['experience_level'].eq('EN')
is_2024 = data['work_year'].eq(2024)
is_us_resident = data['employee_residence'].eq('US')
entry_level_data = data[is_entry_level & is_2024 & is_us_resident]

# Median USD salary per job title, ordered from highest to lowest
salary_by_title = entry_level_data.groupby('job_title')['salary_in_usd']
median_salaries = salary_by_title.median().sort_values(ascending=False)

# Show the resulting per-title medians
median_salaries

# Draw the per-title medians as a bar chart on an explicitly created Axes
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=median_salaries.index, y=median_salaries.values, ax=ax)

# Title and axis labels are applied after plotting so they replace any
# labels seaborn set on its own
ax.set_xlabel('Job Title')
ax.set_ylabel('Median Salary (USD)')
ax.set_title('Median Salary for Entry-Level Job Titles in 2024')

# Turn the x tick labels vertical, tighten the layout and render the figure
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()

# Discussion on salary distribution
"""
The bar chart above illustrates the median salaries for various entry-level job titles in 2024. 
This visualization provides insight into which roles may offer higher compensation.
"""

# Linear Model Analysis
"""
Now, let's build a linear model to predict salaries based on various features.
"""

# Build the modelling table.
#
# LinearRegression needs an all-numeric feature matrix, so every remaining
# non-numeric column is one-hot encoded -- not only 'experience_level' and
# 'job_title'. 'employee_residence', for instance, holds strings such as 'US'
# and would make model.fit() raise a ValueError if it were left untouched.
target_column = 'salary_in_usd'

# NOTE(review): assumes that a raw 'salary' column (amount in the original
# currency), if the dataset has one, is the target in another unit and would
# leak it into the features -- confirm against the CSV schema.
# errors='ignore' makes this a no-op when the column is absent.
model_data = entry_level_data.drop(columns=['salary'], errors='ignore')

# Drop columns that hold a single value: they carry no signal. The filter that
# produced entry_level_data fixes 'experience_level', 'work_year' and
# 'employee_residence' to one value each, so they are removed here.
is_informative = model_data.nunique(dropna=False) > 1
is_informative[target_column] = True  # never drop the target itself
model_data = model_data.loc[:, is_informative]

# One-hot encode every non-numeric feature. drop_first=True removes one level
# per feature (redundant once the model has an intercept); dtype=float keeps
# the whole matrix numeric instead of mixing bool and numeric columns.
categorical_columns = (
    model_data.drop(columns=[target_column])
    .select_dtypes(exclude=['number', 'bool'])
    .columns.tolist()
)
data_encoded = pd.get_dummies(
    model_data, columns=categorical_columns, drop_first=True, dtype=float
)

# Define features (X) and target (y)
X = data_encoded.drop(columns=[target_column])
y = data_encoded[target_column]

# Hold out 20% of the rows for evaluation; random_state fixes the shuffle so
# the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a linear regression model and fit it on the training rows only
model = LinearRegression()
model.fit(X_train, y_train)

# Predict salaries for the held-out rows
y_pred = model.predict(X_test)

# Score the predictions against the held-out salaries
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R² Score: {r2}')

# Discussion on linear model performance
"""
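The linear regression model was trained to predict salaries based on various features.
A lower Mean Squared Error (MSE) means the predictions are closer to the actual salaries; because MSE is
measured in squared USD, its square root (RMSE) is easier to compare against typical salary values.
An R² score closer to 1 means the model explains more of the variance in salary, while a score near 0
(or below 0) means it does no better than always predicting the mean salary.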
"""
