### Import Libraries

In [81]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler

### Import Dataset

In [82]:
df = pd.read_csv("Salary_Data.csv")

In [83]:
print(df.head())

    Age  Gender Education Level          Job Title  Years of Experience  \
0  32.0    Male      Bachelor's  Software Engineer                  5.0   
1  28.0  Female        Master's       Data Analyst                  3.0   
2  45.0    Male             PhD     Senior Manager                 15.0   
3  36.0  Female      Bachelor's    Sales Associate                  7.0   
4  52.0    Male        Master's           Director                 20.0   

     Salary  
0   90000.0  
1   65000.0  
2  150000.0  
3   60000.0  
4  200000.0  


### Check for null values

In [84]:
print(df.isnull().values.any())
print(df.isnull().sum())

True
Age                    2
Gender                 2
Education Level        3
Job Title              2
Years of Experience    3
Salary                 5
dtype: int64


### Imputing

In [85]:
num_cols = ["Age","Years of Experience","Salary"]
cat_cols = ["Gender", "Education Level", "Job Title"]

imputer = SimpleImputer(strategy="mean")
df[num_cols] = imputer.fit_transform(df[num_cols])
imputer = SimpleImputer(strategy="most_frequent")
df[cat_cols] = imputer.fit_transform(df[cat_cols])

In [86]:
print(df.isnull().values.any())
print(df.isnull().sum())

False
Age                    0
Gender                 0
Education Level        0
Job Title              0
Years of Experience    0
Salary                 0
dtype: int64


### Split Features from the Independent Variable

In [87]:
X = df.iloc[:, :-1]
y = df.iloc[:,-1]

### Encoding Categorical Data

In [88]:
X = pd.get_dummies(X, columns=cat_cols, drop_first=True)

In [89]:
print(X)

       Age  Years of Experience  Gender_Male  Gender_Other  \
0     32.0                  5.0         True         False   
1     28.0                  3.0        False         False   
2     45.0                 15.0         True         False   
3     36.0                  7.0        False         False   
4     52.0                 20.0         True         False   
...    ...                  ...          ...           ...   
6699  49.0                 20.0        False         False   
6700  32.0                  3.0         True         False   
6701  30.0                  4.0        False         False   
6702  46.0                 14.0         True         False   
6703  26.0                  1.0        False         False   

      Education Level_Bachelor's Degree  Education Level_High School  \
0                                 False                        False   
1                                 False                        False   
2                                 False

### Splitting the Training Set and Test Set

In [90]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=56)

In [91]:
print(X_train)
print(X_test)
print(y_train)
print(y_test)

       Age  Years of Experience  Gender_Male  Gender_Other  \
4770  26.0                  2.0         True         False   
3644  29.0                  5.0        False         False   
1900  47.0                 16.0         True         False   
4634  27.0                  1.0        False         False   
2971  49.0                 22.0        False         False   
...    ...                  ...          ...           ...   
1259  54.0                 17.0         True         False   
5538  24.0                  0.0        False         False   
3264  26.0                  2.0         True         False   
399   29.0                  4.0         True         False   
2532  38.0                 13.0        False         False   

      Education Level_Bachelor's Degree  Education Level_High School  \
4770                              False                         True   
3644                              False                        False   
1900                              False

### Feature Scaling

In [94]:
scaler = StandardScaler()
columns_to_scale = ["Age", "Years of Experience"]
X_train[columns_to_scale] = scaler.fit_transform(X_train[columns_to_scale])
X_test[columns_to_scale] = scaler.transform(X_test[columns_to_scale])

KeyError: ('Age', 'Years of Experience')