### Import Libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Apply matplotlib's 'ggplot' style sheet so every figure drawn below shares one look.
plt.style.use('ggplot')

AttributeError: module 'numpy' has no attribute '__version__'

### Loading and Understanding the Dataset

In [None]:
# Read the raw salary CSV (path is relative to the notebook's working directory)
# into `df`; later cells derive `df_clean` from it.
df = pd.read_csv(filepath_or_buffer='Salary_Data.csv')

In [None]:
# Preview the first five rows of the raw dataset
df.head(n=5)

In [None]:
# Dataset dimensions as a (rows, columns) tuple

df.shape

In [None]:
# Column data types and non-null counts (info() prints its report directly)

df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns

df.describe()

### Data Cleaning and Preprocessing

In [None]:
# Count the missing values in each column of the raw data
df.isna().sum()

##### Since we don't have too many missing values, we will remove the rows with missing data from the dataset.

In [None]:
# Keep only fully populated rows; work on an independent copy so the raw
# frame `df` stays untouched by the cleaning steps below.
df_clean = df.dropna(axis=0, how='any').copy()

# Sanity check: every per-column missing count should now be zero
df_clean.isna().sum()

##### We will also convert 'Years of Experience' and 'Age' to int, to make outputs cleaner.

In [None]:
# Cast the two numeric columns to plain integers, in the same order as before.
# NOTE(review): astype(int) discards any fractional part - confirm the data
# holds whole years only, otherwise values such as 1.5 become 1.
for int_column in ('Years of Experience', 'Age'):
    df_clean[int_column] = df_clean[int_column].astype(int)

# Confirm the new dtypes
df_clean.info()

##### Rename columns to replace spaces with underscores

In [None]:
# Replace the spaces in multi-word column names with underscores
column_renames = {
    'Education Level': 'Education_Level',
    'Job Title': 'Job_Title',
    'Years of Experience': 'Years_of_Experience',
}
df_clean = df_clean.rename(columns=column_renames)
df_clean.head()

In [None]:
# List the distinct education labels to spot inconsistent spellings before normalising
df_clean['Education_Level'].unique()

In [None]:
# Normalise the education labels: trim surrounding whitespace, then lowercase,
# so that spelling variants differing only by case or padding collapse together.
education_labels = df_clean['Education_Level']
df_clean['Education_Level'] = education_labels.str.strip().str.lower()

# Inspect the labels that remain after normalisation
df_clean['Education_Level'].unique()

In [None]:
# Canonical display label for each known (lowercased, stripped) spelling of an
# education level. Several spellings map onto the same canonical label.
edu_map = {
    "high school": "High School",
    "bachelor's degree": "Bachelor's Degree",
    "bachelor's": "Bachelor's Degree",
    "master's degree": "Master's Degree",
    "master's": "Master's Degree",
    "phd": "PhD"
}

# Normalise inside this cell as well, so that re-running it is safe: after a
# first run the column already holds canonical labels ("High School"), which
# would not match the lowercase keys and would all turn into NaN.
edu_normalized = df_clean['Education_Level'].str.strip().str.lower()

# Series.map yields NaN for labels missing from edu_map; fall back to the
# existing label instead of silently discarding it.
df_clean['Education_Level'] = edu_normalized.map(edu_map).fillna(df_clean['Education_Level'])
df_clean['Education_Level'].unique()

# Exploratory Data Analysis

In [None]:
# Histogram of salaries, drawn on an explicitly created axes
fig, ax = plt.subplots(figsize=(8, 6), dpi=80)
sns.histplot(data=df_clean, x='Salary', ax=ax)
ax.set_title('Salary Distribution')
plt.show()

In [None]:
# Histogram of years of experience, drawn on an explicitly created axes
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(data=df_clean, x='Years_of_Experience', ax=ax)
ax.set_title('Distribution of Years of Experience')
plt.show()

As expected, we get a right-skewed histogram: a lot of employees have mid-range experience (5-15 years) and fewer have extremely high experience (20+ years), which produces a long tail on the right.

In [None]:
# Scatter plot of salary (x-axis) against years of experience (y-axis).
# Both axes are labelled so the orientation is unambiguous when the figure is
# read on its own.
plt.figure(figsize=(8, 6), dpi=80)

plt.scatter(df_clean['Salary'], df_clean['Years_of_Experience'])
plt.xlabel('Salary')
plt.ylabel('Years of Experience')
plt.title('Experience vs Salary')
plt.show()

We can observe a clear positive trend: generally, as years of experience increase, salary tends to increase.

In [None]:
# Pairwise correlations between the three numeric columns
numeric_columns = ['Age', 'Years_of_Experience', 'Salary']
corr_matrix = df_clean[numeric_columns].corr()

# Annotated heatmap of the correlation matrix
plt.figure(figsize=(4, 4))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.show()

* We can see a strong positive relationship between Years of Experience and Salary, as we also saw in the scatter plot.
* Age and Years of Experience are highly correlated as well, since more experienced employees tend to be older.
* The correlation between Age and Salary is a bit lower, but still positive. Age influences salary partly because it is a proxy for experience, but age alone isn't as strong a predictor as actual experience.
* These correlations suggest that, among the numeric features examined here, Years of Experience is the strongest single predictor of salary.

# Building Linear Regression Model

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split