# Artificial Intelligence Final Project

## Importing Libraries

In [140]:
import pandas as pd 
import numpy as np 
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from dmba import regressionSummary

## Importing Dataset

In [141]:
# Read the salary records from the CSV file that sits next to the notebook,
# then echo the frame so the reader can see the raw columns.
df = pd.read_csv(filepath_or_buffer='Salary.csv')
df

Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience,Salary,Country,Race,Senior
0,32.0,Male,1,Software Engineer,5.0,90000.0,UK,White,0
1,28.0,Female,2,Data Analyst,3.0,65000.0,USA,Hispanic,0
2,45.0,Male,3,Manager,15.0,150000.0,Canada,White,1
3,36.0,Female,1,Sales Associate,7.0,60000.0,USA,Hispanic,0
4,52.0,Male,2,Director,20.0,200000.0,USA,Asian,0
...,...,...,...,...,...,...,...,...,...
6679,49.0,Female,3,Director of Marketing,20.0,200000.0,UK,Mixed,0
6680,32.0,Male,0,Sales Associate,3.0,50000.0,Australia,Australian,0
6681,30.0,Female,1,Financial Manager,4.0,55000.0,China,Chinese,0
6682,46.0,Male,2,Marketing Manager,14.0,140000.0,China,Korean,0


## Checking for Null Values

In [142]:
# Per-column count of missing entries (`isna` is the canonical spelling that
# `isnull` aliases); all zeros means no imputation is required.
df.isna().sum()

Age                    0
Gender                 0
Education Level        0
Job Title              0
Years of Experience    0
Salary                 0
Country                0
Race                   0
Senior                 0
dtype: int64

## Using Label Encoder to Change Categorical Columns to Numerical

In [143]:
# Categorical columns to encode
columns_to_encode = ['Gender', 'Job Title', 'Country', 'Race']

# One fitted encoder per column, kept around so the integer codes can later be
# mapped back to their labels with `encoders[col].inverse_transform(...)`.
# (Refitting a single shared encoder would discard every mapping but the last.)
encoders = {col: LabelEncoder() for col in columns_to_encode}

# Encoded values, one "<column>_Encoded" column per categorical feature.
# The frame reuses df's index so the concat below lines rows up by label even
# when df does not carry a default 0..n-1 index.
encoded_df = pd.DataFrame(
    {col + '_Encoded': encoders[col].fit_transform(df[col]) for col in columns_to_encode},
    index=df.index,
)

# Concatenating the original DataFrame and the encoded DataFrame created above.
# Any "<column>_Encoded" columns left over from a previous run of this cell are
# dropped first, so re-running the cell never produces duplicate columns.
df = pd.concat(
    [df.drop(columns=encoded_df.columns, errors='ignore'), encoded_df],
    axis=1,
)
df.head()

Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience,Salary,Country,Race,Senior,Gender_Encoded,Job Title_Encoded,Country_Encoded,Race_Encoded
0,32.0,Male,1,Software Engineer,5.0,90000.0,UK,White,0,1,112,3,9
1,28.0,Female,2,Data Analyst,3.0,65000.0,USA,Hispanic,0,0,24,4,5
2,45.0,Male,3,Manager,15.0,150000.0,Canada,White,1,1,72,1,9
3,36.0,Female,1,Sales Associate,7.0,60000.0,USA,Hispanic,0,0,100,4,5
4,52.0,Male,2,Director,20.0,200000.0,USA,Asian,0,1,34,4,1


## Splitting Target and Features

In [144]:
# Columns kept out of the feature matrix: the response itself and the raw text
# categoricals (their "<column>_Encoded" counterparts stay in).
excluded_columns = ['Gender', 'Job Title', 'Country', 'Race', 'Salary']

# Response variable
y = df['Salary']

# Feature matrix: every remaining column, i.e. the numeric fields plus the
# label-encoded categoricals
X = df.drop(columns=excluded_columns)
X

Unnamed: 0,Age,Education Level,Years of Experience,Senior,Gender_Encoded,Job Title_Encoded,Country_Encoded,Race_Encoded
0,32.0,1,5.0,0,1,112,3,9
1,28.0,2,3.0,0,0,24,4,5
2,45.0,3,15.0,1,1,72,1,9
3,36.0,1,7.0,0,0,100,4,5
4,52.0,2,20.0,0,1,34,4,1
...,...,...,...,...,...,...,...,...
6679,49.0,3,20.0,0,0,42,3,7
6680,32.0,0,3.0,0,1,100,0,2
6681,30.0,1,4.0,0,0,51,2,4
6682,46.0,2,14.0,0,1,76,2,6


## Splitting into Testing and Training Data

In [145]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

## Defining and Fitting the Model

In [146]:
# Multiple linear regression (ordinary least squares).
# The following cell reports `model.intercept_` and `model.coef_`; those
# attributes exist on linear estimators only, so fitting a
# RandomForestRegressor here makes that cell fail with AttributeError when the
# notebook is run top to bottom.
model = LinearRegression()
model.fit(X_train, y_train)

## Running Model and Displaying Multiple Linear Regression Equation

In [147]:
# Predicting values based on testing data
y_pred = model.predict(X_test)

# Pair each coefficient with the feature it was fitted on. The names come from
# the training columns (same order as model.coef_) instead of hard-coded
# positions, so the labels stay correct if the feature set changes. The
# "_Encoded" suffix is stripped purely for display.
feature_names = [name.replace('_Encoded', '') for name in X_train.columns]
coefficients = list(zip(feature_names, model.coef_))

# Finding the Intercept
print(f'Intercept: {model.intercept_}')

# Finding the coefficients for each feature
coefficient_lines = '\n'.join(f'{name}: {value}' for name, value in coefficients)
print(f'\nModel Coeffiecients:\n{coefficient_lines}')

# Full Model Equation - includes the intercept, without which the printed
# expression would not reproduce the model's predictions
equation_terms = ' + '.join(f'{value}({name})' for name, value in coefficients)
print(f'\nModel Equation:\nSalary = {model.intercept_} + {equation_terms}')

Intercept: 96610.76311745375

Model Coeffiecients:
Age: -2093.1634153667123
Education Level: 15107.399327109197
Years of Experience: 8265.251174500352
Senior: -5899.575267475045
Gender: 6769.941945053545
Job Title: -59.135948169720905
Country: -295.0833229924154
Race: -35.2037342906259

Model Equation:
-2093.1634153667123(Age) + 15107.399327109197(Education Level) + 8265.251174500352(Years of Experience) + -5899.575267475045(Senior) + 6769.941945053545(Gender) + -59.135948169720905(Job Title) + -295.0833229924154(Country) + -35.2037342906259(Race)


## Assessing Performance and Results of the Model

In [148]:
# Side-by-side view of the held-out rows: feature values, the true salary, the
# prediction (rounded to whole units) and the absolute error between the two.
y_pred = np.round(y_pred, 0)

# Feature columns in display order, mapped to the shorter headings used in the table
display_names = {
    'Age': 'Age',
    'Education Level': 'Education Level',
    'Years of Experience': 'Years of Experience',
    'Senior': 'Senior',
    'Gender_Encoded': 'Gender',
    'Job Title_Encoded': 'Job Title',
    'Country_Encoded': 'Country',
    'Race_Encoded': 'Race',
}

df2 = (
    X_test[list(display_names)]
    .rename(columns=display_names)
    .assign(**{
        'Actual Salary': y_test,
        'Predicted Salary': y_pred,
        'Difference': (y_test - y_pred).abs(),
    })
)
df2

Unnamed: 0,Age,Education Level,Years of Experience,Senior,Gender,Job Title,Country,Race,Actual Salary,Predicted Salary,Difference
6220,32.0,1,7.0,0,1,88,0,1,120000.0,104124.0,15876.0
3746,27.0,1,3.0,0,1,112,3,1,60000.0,79225.0,19225.0
3559,30.0,0,2.0,0,1,56,3,7,33000.0,52673.0,19673.0
729,29.0,2,6.0,0,0,27,0,1,180000.0,114083.0,65917.0
6682,46.0,2,14.0,0,1,76,2,6,140000.0,147728.0,7728.0
...,...,...,...,...,...,...,...,...,...,...,...
2346,30.0,1,5.0,0,0,112,0,1,90000.0,83591.0,6409.0
326,38.0,1,10.0,1,0,6,2,9,110000.0,107669.0,2331.0
1524,28.0,1,2.0,0,1,53,3,7,70000.0,72144.0,2144.0
5845,26.0,1,2.0,0,1,86,3,9,55000.0,74309.0,19309.0


In [149]:
# Presenting regression statistics for the held-out set: the call prints
# ME, RMSE, MAE, MPE and MAPE (see the output below).
# Note: y_pred was rounded to whole units in the previous cell, so the metrics
# are computed on the rounded predictions.
regressionSummary(y_test, y_pred)


Regression statistics

                      Mean Error (ME) : -175.0845
       Root Mean Squared Error (RMSE) : 27969.9491
            Mean Absolute Error (MAE) : 21965.9850
          Mean Percentage Error (MPE) : -21.3839
Mean Absolute Percentage Error (MAPE) : 35.2633
