# Artificial Intelligence Final Project

## Importing Libraries

In [14]:
import pandas as pd 
import numpy as np 
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from dmba import regressionSummary
import pickle as pk
# Shared LabelEncoder instance; later cells re-fit it once per categorical column
encoder = LabelEncoder()

## Importing Dataset

In [15]:
# Load the salary dataset from a CSV file located relative to the working directory
df = pd.read_csv('Salary.csv')
# Display the loaded frame
df

Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience,Salary,Country,Race,Senior
0,32.0,Male,1,Software Engineer,5.0,90000.0,UK,White,0
1,28.0,Female,2,Data Analyst,3.0,65000.0,USA,Hispanic,0
2,45.0,Male,3,Manager,15.0,150000.0,Canada,White,1
3,36.0,Female,1,Sales Associate,7.0,60000.0,USA,Hispanic,0
4,52.0,Male,2,Director,20.0,200000.0,USA,Asian,0
5,29.0,Male,1,Marketing Analyst,2.0,55000.0,USA,Hispanic,0
6,42.0,Female,2,Product Manager,12.0,120000.0,USA,Asian,0
7,31.0,Male,1,Sales Manager,4.0,80000.0,China,Korean,0
8,26.0,Female,1,Marketing Coordinator,1.0,45000.0,China,Chinese,0
9,38.0,Male,3,Scientist,10.0,110000.0,Australia,Australian,1


## Checking for Null Values

In [16]:
# Count the missing entries in every column
df.isna().sum()

Age                    0
Gender                 0
Education Level        0
Job Title              0
Years of Experience    0
Salary                 0
Country                0
Race                   0
Senior                 0
dtype: int64

## Printing the Mapping from Categorical Values to Their Label-Encoded Integers

In [17]:
# Columns to encode
columns_to_encode = ['Gender', 'Job Title', 'Country', 'Race']

# Original values side by side with the integer the label encoder assigns to them
encoded_values_df = pd.DataFrame()
for column in columns_to_encode:
    encoded_values_df[column] = df[column]
    encoded_values_df[f'{column}_Encoded'] = encoder.fit_transform(df[column])

# One table per column listing each distinct (original value, encoded value) pair.
# Grouping by both columns collapses the rows to the unique combinations; the group
# key (`group.name`) is that pair.
encoding_tables = {
    column: encoded_values_df
    .groupby([column, f'{column}_Encoded'])
    .apply(lambda group: group.name)
    for column in columns_to_encode
}

# Individual names kept so each table can be inspected on its own
gender_encoded = encoding_tables['Gender']
job_title_encoded = encoding_tables['Job Title']
country__encoded = encoding_tables['Country']
race_encoded = encoding_tables['Race']

encoded_values = gender_encoded, job_title_encoded, country__encoded, race_encoded

# Printing encoding results.
# The row limit is lifted only inside this block, so the global pandas display
# option is restored automatically (even if printing raises).
with pd.option_context('display.max_rows', None):
    for value in encoded_values:
        print(f'{value}\n')


Gender  Gender_Encoded
Female  0                 (Female, 0)
Male    1                   (Male, 1)
dtype: object

Job Title                         Job Title_Encoded
Account Executive                 0                                    (Account Executive, 0)
Account Manager                   1                                      (Account Manager, 1)
Accountant                        2                                           (Accountant, 2)
Administrative Assistant          3                             (Administrative Assistant, 3)
Advertising Coordinator           4                              (Advertising Coordinator, 4)
Back end Developer                5                                   (Back end Developer, 5)
Business Analyst                  6                                     (Business Analyst, 6)
Business Development Associate    7                       (Business Development Associate, 7)
Business Development Manager      8                         (Business Development 

## Using Label Encoder to Change Categorical Columns to Numerical

In [18]:
pd.reset_option('display.max_rows')

# Categorical columns to encode
columns_to_encode = ['Gender', 'Job Title', 'Country', 'Race']
encoded_column_names = [col + '_Encoded' for col in columns_to_encode]

# {column name: {original label: integer code}}; serialized in a later cell
label_encoder_map = {}

# Start from the frame without any previously appended "*_Encoded" columns so that
# re-running this cell does not duplicate them
df_copy = df.drop(columns=encoded_column_names, errors='ignore')

# Same shape as df_copy, but the categorical columns hold their integer codes
df_encoded_labels = df_copy.copy()

# Holds only the new "*_Encoded" columns; shares df_copy's index so the concat below aligns
encoded_df = pd.DataFrame(index=df_copy.index)

for col in columns_to_encode:
    encoded_values = encoder.fit_transform(df_copy[col])

    # Build the label -> code map from the encoder that produced the values, so the
    # serialized map always matches the features the model is trained on.
    # A label's code is its position in `classes_`.
    label_encoder_map[col] = {
        label: code for code, label in enumerate(encoder.classes_.tolist())
    }

    # Overwrite the categorical column with its codes (no in-place replace needed)
    df_encoded_labels[col] = encoded_values

    # Creating the new "encoded" column
    encoded_df[col + '_Encoded'] = encoded_values

# Concatenating the original DataFrame and the encoded DataFrame created above
df = pd.concat([df_copy, encoded_df], axis=1)
df.head()

Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience,Salary,Country,Race,Senior,Gender_Encoded,Job Title_Encoded,Country_Encoded,Race_Encoded
0,32.0,Male,1,Software Engineer,5.0,90000.0,UK,White,0,1,112,3,9
1,28.0,Female,2,Data Analyst,3.0,65000.0,USA,Hispanic,0,0,24,4,5
2,45.0,Male,3,Manager,15.0,150000.0,Canada,White,1,1,72,1,9
3,36.0,Female,1,Sales Associate,7.0,60000.0,USA,Hispanic,0,0,100,4,5
4,52.0,Male,2,Director,20.0,200000.0,USA,Asian,0,1,34,4,1


In [19]:
# Display the copy of the data whose categorical columns were replaced by their integer codes
df_encoded_labels

Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience,Salary,Country,Race,Senior
0,32.0,1,1,112,5.0,90000.0,3,9,0
1,28.0,0,2,24,3.0,65000.0,4,5,0
2,45.0,1,3,72,15.0,150000.0,1,9,1
3,36.0,0,1,100,7.0,60000.0,4,5,0
4,52.0,1,2,34,20.0,200000.0,4,1,0
...,...,...,...,...,...,...,...,...,...
6679,49.0,0,3,42,20.0,200000.0,3,7,0
6680,32.0,1,0,100,3.0,50000.0,0,2,0
6681,30.0,0,1,51,4.0,55000.0,2,4,0
6682,46.0,1,2,76,14.0,140000.0,2,6,0


## Pickle (Serialize) the Label Encoder Map

In [20]:
# Serialize the {column: {label: code}} map.
# The context manager closes the file handle even if dump() raises.
with open("label_encoder_map.pkl", "wb") as map_file:
    pk.dump(label_encoder_map, map_file)

## Splitting Target and Features

In [21]:
# Target: the salary column
y = df['Salary']

# Features: everything except the target and the raw (un-encoded) categorical columns
raw_categorical = ['Gender', 'Job Title', 'Country', 'Race']
X = df.drop(columns=raw_categorical + ['Salary'])
X

Unnamed: 0,Age,Education Level,Years of Experience,Senior,Gender_Encoded,Job Title_Encoded,Country_Encoded,Race_Encoded
0,32.0,1,5.0,0,1,112,3,9
1,28.0,2,3.0,0,0,24,4,5
2,45.0,3,15.0,1,1,72,1,9
3,36.0,1,7.0,0,0,100,4,5
4,52.0,2,20.0,0,1,34,4,1
...,...,...,...,...,...,...,...,...
6679,49.0,3,20.0,0,0,42,3,7
6680,32.0,0,3.0,0,1,100,0,2
6681,30.0,1,4.0,0,0,51,2,4
6682,46.0,2,14.0,0,1,76,2,6


## Splitting into Testing and Training Data

In [22]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

## Defining and Fitting the Model

In [23]:
# Fixed seed so the fitted forest, its feature importances and the metrics computed
# from its predictions are the same on every run
model = RandomForestRegressor(random_state=0)
model.fit(X_train, y_train)

## Running Model

In [24]:
# Predicting values based on testing data
y_pred = model.predict(X_test)

# Pair every importance with the column it belongs to instead of relying on
# hard-coded positions, so the labels stay correct if the feature set changes.
# The "_Encoded" suffix is dropped to show the plain feature name.
importance_lines = [
    f"{feature.replace('_Encoded', '')}: {importance}"
    for feature, importance in zip(X_test.columns, model.feature_importances_)
]
print('\nFeature Importances:\n' + '\n'.join(importance_lines))


Feature Importances:
Age: 0.06003850888680796
Education Level: 0.02978450210282784
Years of Experience: 0.7483440107327428
Senior: 0.006146641676005909
Gender: 0.006725072085747871
Job Title: 0.14087389608664463
Country: 0.0034277183400539964
Race: 0.004659650089169078


## Assessing Performance and Results of the Model

In [25]:
# Round the predictions to whole numbers before comparing them with the actual salaries
y_pred = np.round(y_pred, 0)

# Displayed column name -> feature column in X_test
feature_sources = {
    'Age': 'Age',
    'Education Level': 'Education Level',
    'Years of Experience': 'Years of Experience',
    'Senior': 'Senior',
    'Gender': 'Gender_Encoded',
    'Job Title': 'Job Title_Encoded',
    'Country': 'Country_Encoded',
    'Race': 'Race_Encoded',
}

# Test-set features first, then actual vs. predicted salary and their absolute gap
comparison = {label: X_test[source] for label, source in feature_sources.items()}
comparison['Actual Salary'] = y_test
comparison['Predicted Salary'] = y_pred
comparison['Difference'] = abs(y_test - y_pred)

df2 = pd.DataFrame(comparison)
df2

Unnamed: 0,Age,Education Level,Years of Experience,Senior,Gender,Job Title,Country,Race,Actual Salary,Predicted Salary,Difference
6220,32.0,1,7.0,0,1,88,0,1,120000.0,120000.0,0.0
3746,27.0,1,3.0,0,1,112,3,1,60000.0,82441.0,22441.0
3559,30.0,0,2.0,0,1,56,3,7,33000.0,33080.0,80.0
729,29.0,2,6.0,0,0,27,0,1,180000.0,175120.0,4880.0
6682,46.0,2,14.0,0,1,76,2,6,140000.0,145154.0,5154.0
...,...,...,...,...,...,...,...,...,...,...,...
2346,30.0,1,5.0,0,0,112,0,1,90000.0,90340.0,340.0
326,38.0,1,10.0,1,0,6,2,9,110000.0,112653.0,2653.0
1524,28.0,1,2.0,0,1,53,3,7,70000.0,69950.0,50.0
5845,26.0,1,2.0,0,1,86,3,9,55000.0,51950.0,3050.0


In [26]:
# Presenting regression statistics for the test set
# (y_pred holds the predictions that were rounded in the previous cell)
regressionSummary(y_test, y_pred)


Regression statistics

                      Mean Error (ME) : -681.5849
       Root Mean Squared Error (RMSE) : 8017.5761
            Mean Absolute Error (MAE) : 3604.9476
          Mean Percentage Error (MPE) : -9.1246
Mean Absolute Percentage Error (MAPE) : 11.4208
