### Import Libraries

In [27]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import Lasso

### Import Dataset

In [28]:
# Read the salary records from the CSV file (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer="Salary_Data.csv")

In [29]:
# Preview the first five rows to check the column names and value formats.
first_rows = df.head(n=5)
print(first_rows)

    Age  Gender Education Level          Job Title  Years of Experience  \
0  32.0    Male      Bachelor's  Software Engineer                  5.0   
1  28.0  Female        Master's       Data Analyst                  3.0   
2  45.0    Male             PhD     Senior Manager                 15.0   
3  36.0  Female      Bachelor's    Sales Associate                  7.0   
4  52.0    Male        Master's           Director                 20.0   

     Salary  
0   90000.0  
1   65000.0  
2  150000.0  
3   60000.0  
4  200000.0  


### Check for null values

In [30]:
# Build the missing-value mask once, then report (a) whether anything is
# missing at all and (b) how many entries are missing in each column.
missing_mask = df.isna()
print(missing_mask.to_numpy().any())
print(missing_mask.sum())

True
Age                    2
Gender                 2
Education Level        3
Job Title              2
Years of Experience    3
Salary                 5
dtype: int64


### Imputing

In [31]:
# Column groups used for imputation (cat_cols is reused further down for encoding).
num_cols = ["Age","Years of Experience","Salary"]
cat_cols = ["Gender", "Education Level", "Job Title"]

# Salary is the value being predicted. A row without it carries no label, and
# filling it with the column mean would make the model train and be scored on
# fabricated targets. Drop those rows instead of imputing them. `.copy()`
# yields an independent frame so the column assignments below act on it directly.
df = df.dropna(subset=["Salary"]).copy()

# Numeric gaps -> column mean. Salary has no gaps left at this point, so the
# imputer leaves it unchanged; it stays in num_cols so that list keeps its contents.
imputer = SimpleImputer(strategy="mean")
df[num_cols] = imputer.fit_transform(df[num_cols])

# Categorical gaps -> most frequent label of each column.
imputer = SimpleImputer(strategy="most_frequent")
df[cat_cols] = imputer.fit_transform(df[cat_cols])

# NOTE(review): both imputers are fitted on the complete frame. If a train/test
# split follows, fit them on the training part only and reuse the fitted
# imputers on the test part, otherwise test-set statistics leak into training.

In [32]:
# Repeat the missing-value report after the imputation step: first the overall
# flag, then the per-column counts.
missing_mask = df.isna()
print(missing_mask.to_numpy().any())
print(missing_mask.sum())

False
Age                    0
Gender                 0
Education Level        0
Job Title              0
Years of Experience    0
Salary                 0
dtype: int64


### Split Features from the Dependent Variable (Target)

In [33]:
# The last column (Salary in the preview above) is the target; every column
# before it is a feature.
# NOTE(review): X keeps the categorical columns as raw strings. The one-hot
# encoding further down is applied to df, not to X - confirm which frame the
# model is meant to consume.
last_col = df.shape[1] - 1
X = df.iloc[:, :last_col]
y = df.iloc[:, last_col]

### Encoding Categorical Data

In [34]:
# One-hot encode the categorical columns listed in cat_cols. drop_first=True
# keeps k-1 indicator columns for a column with k levels (the first level is
# the implicit baseline). Numeric columns, including Salary, pass through as-is.
df_encoded = pd.get_dummies(data=df, columns=cat_cols, drop_first=True)

In [35]:
# Show the encoded frame. pandas abbreviates the printed rows ("...") and wraps
# the columns into several chunks, so this is a preview rather than a full dump.
print(df_encoded)

       Age  Years of Experience    Salary  Gender_Male  Gender_Other  \
0     32.0                  5.0   90000.0         True         False   
1     28.0                  3.0   65000.0        False         False   
2     45.0                 15.0  150000.0         True         False   
3     36.0                  7.0   60000.0        False         False   
4     52.0                 20.0  200000.0         True         False   
...    ...                  ...       ...          ...           ...   
6699  49.0                 20.0  200000.0        False         False   
6700  32.0                  3.0   50000.0         True         False   
6701  30.0                  4.0   55000.0        False         False   
6702  46.0                 14.0  140000.0         True         False   
6703  26.0                  1.0   35000.0        False         False   

      Education Level_Bachelor's Degree  Education Level_High School  \
0                                 False                        