In [12]:
import pandas as pd
import numpy as np
import seaborn as sns
import os
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler

In [4]:
salary1_data_path = os.getenv("HOME") + "/programming/AI/AIFFEL/AIFFEL_quest_cr/Python/data/salary_1.csv"
salary2_data_path = os.getenv("HOME") + "/programming/AI/AIFFEL/AIFFEL_quest_cr/Python/data/salary_2.csv"
cpi_path = os.getenv("HOME") + "/programming/AI/AIFFEL/AIFFEL_quest_cr/Python/data/cpi.csv"
job_path = os.getenv("HOME") + "/programming/AI/AIFFEL/AIFFEL_quest_cr/Python/data/job.csv"

In [18]:
salary1_data = pd.read_csv(salary1_data_path)
salary2_data = pd.read_csv(salary2_data_path)
cpi_data = pd.read_csv(cpi_path)
job_data = pd.read_csv(job_path)

In [19]:
salary_df = pd.concat([salary1_data, salary2_data])
salary_df.reset_index(drop = True, inplace = True)

In [20]:
salary_df = salary_df.merge(job_data, on = 'Job Title', how = 'left')
salary_df.drop('Job Title', axis=1, inplace=True)
salary_df = pd.get_dummies(salary_df, columns=['Gender', 'Country', 'Race', 'Jobs'], drop_first=True)
salary_df

Unnamed: 0,Age,Education Level,Years of Experience,Salary,Senior,Gender_Male,Country_Canada,Country_China,Country_UK,Country_USA,...,Jobs_Social Media,Jobs_Software,Jobs_Supply Chain,Jobs_Supply Chain Manager,Jobs_Technical Support Specialist,Jobs_Technical Writer,Jobs_Training Specialist,Jobs_UX,Jobs_Web Designer,Jobs_Web Developer
0,32.0,1,5.0,90000,0,True,False,False,True,False,...,False,True,False,False,False,False,False,False,False,False
1,28.0,2,3.0,65000,0,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
2,45.0,3,15.0,150000,1,True,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,36.0,1,7.0,60000,0,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
4,52.0,2,20.0,200000,0,True,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6679,49.0,3,20.0,200000,0,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
6680,32.0,0,3.0,50000,0,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
6681,30.0,1,4.0,55000,0,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
6682,46.0,2,14.0,140000,0,True,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False


In [7]:
salary_df['Age'].mean()

np.float64(33.61152694610779)

In [8]:
salary_df['Age'].std()

np.float64(7.595505724139926)

In [9]:
# Standard Scaling
(salary_df['Age'] - salary_df['Age'].mean()) / salary_df['Age'].std()

0      -0.212168
1      -0.738796
2       1.499370
3       0.314459
4       2.420968
          ...   
6679    2.025997
6680   -0.212168
6681   -0.475482
6682    1.631027
6683   -1.002109
Name: Age, Length: 6684, dtype: float64

In [10]:
# Robust Scaling
Q1 = salary_df['Age'].quantile(0.25)
Q2 = salary_df['Age'].quantile(0.5)
Q3 = salary_df['Age'].quantile(0.75)
(salary_df['Age'] - Q2) / (Q3 - Q1)

0       0.0
1      -0.4
2       1.3
3       0.4
4       2.0
       ... 
6679    1.7
6680    0.0
6681   -0.2
6682    1.4
6683   -0.6
Name: Age, Length: 6684, dtype: float64

In [11]:
# Min-Max Scaling
min = salary_df['Age'].min()
max = salary_df['Age'].max()
(salary_df['Age'] - min) / (max - min)

0       0.268293
1       0.170732
2       0.585366
3       0.365854
4       0.756098
          ...   
6679    0.682927
6680    0.268293
6681    0.219512
6682    0.609756
6683    0.121951
Name: Age, Length: 6684, dtype: float64

In [14]:
ss = StandardScaler()
rs = RobustScaler()
mm = MinMaxScaler()

In [21]:
ss.fit(salary_df)

In [24]:
salary_df.columns

Index(['Age', 'Education Level', 'Years of Experience', 'Salary', 'Senior',
       'Gender_Male', 'Country_Canada', 'Country_China', 'Country_UK',
       'Country_USA', 'Race_Asian', 'Race_Australian', 'Race_Black',
       'Race_Chinese', 'Race_Hispanic', 'Race_Korean', 'Race_Mixed',
       'Race_Welsh', 'Race_White', 'Jobs_Accountant', 'Jobs_Administrative',
       'Jobs_Advertising Coordinator', 'Jobs_Business Analyst',
       'Jobs_Business Development', 'Jobs_Business Development ',
       'Jobs_Business Development Manager', 'Jobs_CEO', 'Jobs_CTO',
       'Jobs_Consultant', 'Jobs_Copywriter', 'Jobs_Creative Director',
       'Jobs_Customer Service', 'Jobs_Customer Success', 'Jobs_Data Analyst',
       'Jobs_Data Engineer', 'Jobs_Data Entry Clerk', 'Jobs_Data Scientist',
       'Jobs_Delivery Driver', 'Jobs_Designer',
       'Jobs_Digital Content Producer', 'Jobs_Director', 'Jobs_Engineer',
       'Jobs_Engineering', 'Jobs_Event Coordinator', 'Jobs_Financial',
       'Jobs_Graphic 

In [27]:
ss_df = pd.DataFrame(ss.transform(salary_df), columns=salary_df.columns)
ss_df

Unnamed: 0,Age,Education Level,Years of Experience,Salary,Senior,Gender_Male,Country_Canada,Country_China,Country_UK,Country_USA,...,Jobs_Social Media,Jobs_Software,Jobs_Supply Chain,Jobs_Supply Chain Manager,Jobs_Technical Support Specialist,Jobs_Technical Writer,Jobs_Training Specialist,Jobs_UX,Jobs_Web Designer,Jobs_Web Developer
0,-0.212184,-0.706924,-0.505793,-0.479277,-0.409281,0.905957,-0.496538,-0.500514,2.004499,-0.504484,...,-0.051964,1.447715,-0.012232,-0.012232,-0.02119,-0.012232,-0.017301,-0.029975,-0.012232,-0.140284
1,-0.738851,0.428913,-0.833803,-0.952736,-0.409281,-1.103806,-0.496538,-0.500514,-0.498878,1.982222,...,-0.051964,-0.690744,-0.012232,-0.012232,-0.02119,-0.012232,-0.017301,-0.029975,-0.012232,-0.140284
2,1.499482,1.564750,1.134259,0.657025,2.443309,0.905957,2.013945,-0.500514,-0.498878,-0.504484,...,-0.051964,-0.690744,-0.012232,-0.012232,-0.02119,-0.012232,-0.017301,-0.029975,-0.012232,-0.140284
3,0.314482,-0.706924,-0.177783,-1.047428,-0.409281,-1.103806,-0.496538,-0.500514,-0.498878,1.982222,...,-0.051964,-0.690744,-0.012232,-0.012232,-0.02119,-0.012232,-0.017301,-0.029975,-0.012232,-0.140284
4,2.421149,0.428913,1.954285,1.603944,-0.409281,0.905957,-0.496538,-0.500514,-0.498878,1.982222,...,-0.051964,-0.690744,-0.012232,-0.012232,-0.02119,-0.012232,-0.017301,-0.029975,-0.012232,-0.140284
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6679,2.026149,1.564750,1.954285,1.603944,-0.409281,-1.103806,-0.496538,-0.500514,2.004499,-0.504484,...,-0.051964,-0.690744,-0.012232,-0.012232,-0.02119,-0.012232,-0.017301,-0.029975,-0.012232,-0.140284
6680,-0.212184,-1.842761,-0.833803,-1.236811,-0.409281,0.905957,-0.496538,-0.500514,-0.498878,-0.504484,...,-0.051964,-0.690744,-0.012232,-0.012232,-0.02119,-0.012232,-0.017301,-0.029975,-0.012232,-0.140284
6681,-0.475518,-0.706924,-0.669798,-1.142119,-0.409281,-1.103806,-0.496538,1.997945,-0.498878,-0.504484,...,-0.051964,-0.690744,-0.012232,-0.012232,-0.02119,-0.012232,-0.017301,-0.029975,-0.012232,-0.140284
6682,1.631149,0.428913,0.970254,0.467642,-0.409281,0.905957,-0.496538,1.997945,-0.498878,-0.504484,...,-0.051964,-0.690744,-0.012232,-0.012232,-0.02119,-0.012232,-0.017301,-0.029975,-0.012232,-0.140284


In [28]:
rs.fit(salary_df)

In [31]:
mm.fit(salary_df)

In [32]:
rs_df = pd.DataFrame(rs.transform(salary_df), columns=salary_df.columns)
mm_df = pd.DataFrame(mm.transform(salary_df), columns=salary_df.columns)


In [33]:
rs_df

Unnamed: 0,Age,Education Level,Years of Experience,Salary,Senior,Gender_Male,Country_Canada,Country_China,Country_UK,Country_USA,...,Jobs_Social Media,Jobs_Software,Jobs_Supply Chain,Jobs_Supply Chain Manager,Jobs_Technical Support Specialist,Jobs_Technical Writer,Jobs_Training Specialist,Jobs_UX,Jobs_Web Designer,Jobs_Web Developer
0,0.0,0.0,-0.222222,-0.277778,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.4,1.0,-0.444444,-0.555556,0.0,-1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.3,2.0,0.888889,0.388889,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.4,0.0,0.000000,-0.611111,0.0,-1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2.0,1.0,1.444444,0.944444,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6679,1.7,2.0,1.444444,0.944444,0.0,-1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6680,0.0,-1.0,-0.444444,-0.722222,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6681,-0.2,0.0,-0.333333,-0.666667,0.0,-1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6682,1.4,1.0,0.777778,0.277778,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [34]:
mm_df

Unnamed: 0,Age,Education Level,Years of Experience,Salary,Senior,Gender_Male,Country_Canada,Country_China,Country_UK,Country_USA,...,Jobs_Social Media,Jobs_Software,Jobs_Supply Chain,Jobs_Supply Chain Manager,Jobs_Technical Support Specialist,Jobs_Technical Writer,Jobs_Training Specialist,Jobs_UX,Jobs_Web Designer,Jobs_Web Developer
0,0.268293,0.333333,0.072289,0.359103,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.170732,0.666667,0.048193,0.258963,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.585366,1.000000,0.192771,0.599439,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.365854,0.333333,0.096386,0.238935,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.756098,0.666667,0.253012,0.799720,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6679,0.682927,1.000000,0.253012,0.799720,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6680,0.268293,0.000000,0.048193,0.198878,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6681,0.219512,0.333333,0.060241,0.218906,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6682,0.609756,0.666667,0.180723,0.559383,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [35]:
mm_df.describe()

Unnamed: 0,Age,Education Level,Years of Experience,Salary,Senior,Gender_Male,Country_Canada,Country_China,Country_UK,Country_USA,...,Jobs_Social Media,Jobs_Software,Jobs_Supply Chain,Jobs_Supply Chain Manager,Jobs_Technical Support Specialist,Jobs_Technical Writer,Jobs_Training Specialist,Jobs_UX,Jobs_Web Designer,Jobs_Web Developer
count,6680.0,6684.0,6684.0,6684.0,6684.0,6684.0,6684.0,6684.0,6684.0,6684.0,...,6684.0,6684.0,6684.0,6684.0,6684.0,6684.0,6684.0,6684.0,6684.0,6684.0
mean,0.307598,0.540794,0.109446,0.460473,0.143477,0.549222,0.197786,0.200329,0.199282,0.202873,...,0.002693,0.32301,0.00015,0.00015,0.000449,0.00015,0.000299,0.000898,0.00015,0.0193
std,0.185256,0.293491,0.073468,0.211523,0.350585,0.497609,0.39836,0.400277,0.39949,0.402168,...,0.051828,0.467662,0.012232,0.012232,0.021183,0.012232,0.017297,0.02995,0.012232,0.137587
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.170732,0.333333,0.048193,0.278991,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.268293,0.333333,0.096386,0.459243,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.414634,0.666667,0.156627,0.639495,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [39]:
ss_df = pd.DataFrame(ss.fit_transform(salary_df), columns=salary_df.columns)
ss_df

Unnamed: 0,Age,Education Level,Years of Experience,Salary,Senior,Gender_Male,Country_Canada,Country_China,Country_UK,Country_USA,...,Jobs_Social Media,Jobs_Software,Jobs_Supply Chain,Jobs_Supply Chain Manager,Jobs_Technical Support Specialist,Jobs_Technical Writer,Jobs_Training Specialist,Jobs_UX,Jobs_Web Designer,Jobs_Web Developer
0,-0.212184,-0.706924,-0.505793,-0.479277,-0.409281,0.905957,-0.496538,-0.500514,2.004499,-0.504484,...,-0.051964,1.447715,-0.012232,-0.012232,-0.02119,-0.012232,-0.017301,-0.029975,-0.012232,-0.140284
1,-0.738851,0.428913,-0.833803,-0.952736,-0.409281,-1.103806,-0.496538,-0.500514,-0.498878,1.982222,...,-0.051964,-0.690744,-0.012232,-0.012232,-0.02119,-0.012232,-0.017301,-0.029975,-0.012232,-0.140284
2,1.499482,1.564750,1.134259,0.657025,2.443309,0.905957,2.013945,-0.500514,-0.498878,-0.504484,...,-0.051964,-0.690744,-0.012232,-0.012232,-0.02119,-0.012232,-0.017301,-0.029975,-0.012232,-0.140284
3,0.314482,-0.706924,-0.177783,-1.047428,-0.409281,-1.103806,-0.496538,-0.500514,-0.498878,1.982222,...,-0.051964,-0.690744,-0.012232,-0.012232,-0.02119,-0.012232,-0.017301,-0.029975,-0.012232,-0.140284
4,2.421149,0.428913,1.954285,1.603944,-0.409281,0.905957,-0.496538,-0.500514,-0.498878,1.982222,...,-0.051964,-0.690744,-0.012232,-0.012232,-0.02119,-0.012232,-0.017301,-0.029975,-0.012232,-0.140284
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6679,2.026149,1.564750,1.954285,1.603944,-0.409281,-1.103806,-0.496538,-0.500514,2.004499,-0.504484,...,-0.051964,-0.690744,-0.012232,-0.012232,-0.02119,-0.012232,-0.017301,-0.029975,-0.012232,-0.140284
6680,-0.212184,-1.842761,-0.833803,-1.236811,-0.409281,0.905957,-0.496538,-0.500514,-0.498878,-0.504484,...,-0.051964,-0.690744,-0.012232,-0.012232,-0.02119,-0.012232,-0.017301,-0.029975,-0.012232,-0.140284
6681,-0.475518,-0.706924,-0.669798,-1.142119,-0.409281,-1.103806,-0.496538,1.997945,-0.498878,-0.504484,...,-0.051964,-0.690744,-0.012232,-0.012232,-0.02119,-0.012232,-0.017301,-0.029975,-0.012232,-0.140284
6682,1.631149,0.428913,0.970254,0.467642,-0.409281,0.905957,-0.496538,1.997945,-0.498878,-0.504484,...,-0.051964,-0.690744,-0.012232,-0.012232,-0.02119,-0.012232,-0.017301,-0.029975,-0.012232,-0.140284


In [40]:
mm_df = pd.DataFrame(mm.fit_transform(salary_df), columns=salary_df.columns)
mm_df

Unnamed: 0,Age,Education Level,Years of Experience,Salary,Senior,Gender_Male,Country_Canada,Country_China,Country_UK,Country_USA,...,Jobs_Social Media,Jobs_Software,Jobs_Supply Chain,Jobs_Supply Chain Manager,Jobs_Technical Support Specialist,Jobs_Technical Writer,Jobs_Training Specialist,Jobs_UX,Jobs_Web Designer,Jobs_Web Developer
0,0.268293,0.333333,0.072289,0.359103,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.170732,0.666667,0.048193,0.258963,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.585366,1.000000,0.192771,0.599439,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.365854,0.333333,0.096386,0.238935,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.756098,0.666667,0.253012,0.799720,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6679,0.682927,1.000000,0.253012,0.799720,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6680,0.268293,0.000000,0.048193,0.198878,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6681,0.219512,0.333333,0.060241,0.218906,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6682,0.609756,0.666667,0.180723,0.559383,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
