# Artificial Intelligence Final Project

In [8]:
import pandas as pd 
import numpy as np 
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Read the salary dataset from the working directory and preview it.
SALARY_CSV_PATH = 'Salary.csv'
df = pd.read_csv(SALARY_CSV_PATH)
df

Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience,Salary,Country,Race,Senior
0,32.0,Male,1,Software Engineer,5.0,90000.0,UK,White,0
1,28.0,Female,2,Data Analyst,3.0,65000.0,USA,Hispanic,0
2,45.0,Male,3,Manager,15.0,150000.0,Canada,White,1
3,36.0,Female,1,Sales Associate,7.0,60000.0,USA,Hispanic,0
4,52.0,Male,2,Director,20.0,200000.0,USA,Asian,0
...,...,...,...,...,...,...,...,...,...
6679,49.0,Female,3,Director of Marketing,20.0,200000.0,UK,Mixed,0
6680,32.0,Male,0,Sales Associate,3.0,50000.0,Australia,Australian,0
6681,30.0,Female,1,Financial Manager,4.0,55000.0,China,Chinese,0
6682,46.0,Male,2,Marketing Manager,14.0,140000.0,China,Korean,0


## Checking for null values

In [9]:
# Number of missing entries in each column (isna is the canonical spelling of isnull).
df.isna().sum()

Age                    0
Gender                 0
Education Level        0
Job Title              0
Years of Experience    0
Salary                 0
Country                0
Race                   0
Senior                 0
dtype: int64

### Using Label Encoder to Convert Categorical Columns to Numerical

In [10]:
# Label-encode the selected columns and append them to `df` as `<name>_Encoded`.
#
# Changes compared with the naive "one encoder + concat" approach:
#   * a separate LabelEncoder is fitted per column and kept in `encoders`, so
#     every mapping can be inverted or reused on new data later;
#   * `encoded_df` shares `df`'s index, so the concat cannot misalign rows;
#   * previously added `_Encoded` columns are dropped before concatenating, so
#     re-running this cell does not create duplicate columns.

# Columns to encode.
# NOTE(review): 'Years of Experience' is already numeric (e.g. 5.0, 15.0 in the
# preview) and the original column is still kept as a feature later on, so its
# encoded copy looks redundant -- consider removing it from this list.
columns_to_encode = ['Gender', 'Job Title', 'Years of Experience', 'Country', 'Race']

# Fitted encoder for each source column, keyed by the original column name.
encoders = {}

# Encoded values for each new column, keyed by the new column name.
encoded_columns = {}

for col in columns_to_encode:
    encoder = LabelEncoder()
    encoded_columns[col + '_Encoded'] = encoder.fit_transform(df[col])
    encoders[col] = encoder

# Build the encoded frame on df's own index so rows line up by construction.
encoded_df = pd.DataFrame(encoded_columns, index=df.index)

# Replace any encoded columns left over from an earlier run, then append the fresh ones.
df = pd.concat(
    [df.drop(columns=encoded_df.columns, errors='ignore'), encoded_df],
    axis=1,
)
df.head()

Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience,Salary,Country,Race,Senior,Gender_Encoded,Job Title_Encoded,Years of Experience_Encoded,Country_Encoded,Race_Encoded
0,32.0,Male,1,Software Engineer,5.0,90000.0,UK,White,0,1,112,7,3,9
1,28.0,Female,2,Data Analyst,3.0,65000.0,USA,Hispanic,0,0,24,5,4,5
2,45.0,Male,3,Manager,15.0,150000.0,Canada,White,1,1,72,17,1,9
3,36.0,Female,1,Sales Associate,7.0,60000.0,USA,Hispanic,0,0,100,9,4,5
4,52.0,Male,2,Director,20.0,200000.0,USA,Asian,0,1,34,22,4,1


### Splitting Target and Features

In [11]:
# The target is the salary; the features are everything else except the raw
# text columns, which are represented by their `_Encoded` counterparts.
target_column = 'Salary'
raw_text_columns = ['Gender', 'Job Title', 'Country', 'Race']

y = df[target_column]
X = df.drop(columns=raw_text_columns + [target_column])
X

Unnamed: 0,Age,Education Level,Years of Experience,Senior,Gender_Encoded,Job Title_Encoded,Years of Experience_Encoded,Country_Encoded,Race_Encoded
0,32.0,1,5.0,0,1,112,7,3,9
1,28.0,2,3.0,0,0,24,5,4,5
2,45.0,3,15.0,1,1,72,17,1,9
3,36.0,1,7.0,0,0,100,9,4,5
4,52.0,2,20.0,0,1,34,22,4,1
...,...,...,...,...,...,...,...,...,...
6679,49.0,3,20.0,0,0,42,22,3,7
6680,32.0,0,3.0,0,1,100,5,0,2
6681,30.0,1,4.0,0,0,51,6,2,4
6682,46.0,2,14.0,0,1,76,16,2,6


### Splitting into Testing and Training

In [12]:
# Hold out 20% of the rows for testing; random_state is fixed at 0.
split_parts = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = split_parts