In [3]:
import pandas as pd
import numpy as np

In [4]:
# Load the scraped postings. The first CSV column is read as the index and then
# discarded in favour of a fresh default integer index.
df = pd.read_csv("../glassdoor_jobs.csv", index_col=0).reset_index(drop=True)
df.shape

(7984, 12)

In [5]:
# drop duplicates
df = df.drop_duplicates()

# Drop every row whose target column (Salary Estimate) is missing: either a
# real NaN or the '-1' placeholder string. The explicit .copy() makes the
# filtered frame independent of the original, so the column assignments below
# do not raise SettingWithCopyWarning.
has_salary = df['Salary Estimate'].notna() & (df['Salary Estimate'] != '-1')
df = df[has_salary].copy()
print(df.shape)

# Indicator columns (1/0) derived from a case-insensitive substring match on
# the raw salary text.
salary_text = df['Salary Estimate'].str.lower()
df['hourly'] = salary_text.str.contains('per hour', regex=False).astype(int)
df['employer_provided'] = salary_text.str.contains('employer provided', regex=False).astype(int)

# parse salary range: keep the text before the first '(' and drop '$' / 'K'
salary = (
    df['Salary Estimate']
    .str.split('(').str[0]
    .str.replace('$', '', regex=False)
    .str.replace('K', '', regex=False)
)
# ...then remove the textual qualifiers so only "<min> - <max>" (or a single
# number) is left for the integer parsing done further down.
salary_range = (
    salary.str.lower()
    .str.replace('per hour', '', regex=False)
    .str.replace('employer provided salary:', '', regex=False)
)

def get_max_salary(s):
    """Return the upper bound of a cleaned salary-range string as an int.

    Parameters
    ----------
    s : str
        Salary text with currency symbols and qualifiers already removed,
        e.g. ``"74 - 139 "`` or a single figure such as ``"50"``.

    Returns
    -------
    int
        The number after the first ``'-'``. When there is no second part, or
        it is not a valid integer, the first part is used instead.

    Raises
    ------
    ValueError
        If the fallback (first) part is not a valid integer either.
    """
    bounds = s.split('-')
    try:
        return int(bounds[1])
    except (IndexError, ValueError):
        # No usable upper bound: treat the single figure as both min and max.
        return int(bounds[0])

def get_location(s):
    """Extract the state portion of a ``"City, ST"`` location string.

    Parameters
    ----------
    s : object
        Location value; normally a string such as ``"Chevy Chase, MD"`` or
        ``"Remote"``.

    Returns
    -------
    object
        * ``'Remote'`` when the value is "remote" in any letter case.
        * The second comma-separated field with surrounding whitespace
          removed (``"Tampa, FL"`` -> ``"FL"``).
        * The input unchanged when it is not a string or has no comma.
    """
    if not isinstance(s, str):
        # e.g. NaN from a missing Location: pass it through untouched.
        return s
    # Compare lower-case to lower-case; the previous check against 'Remote'
    # could never succeed.
    if s.strip().lower() == 'remote':
        return 'Remote'
    parts = s.split(',')
    if len(parts) < 2:
        return s
    # Strip the space that follows the comma so "MD" and " MD" do not become
    # two different categories.
    return parts[1].strip()
    
def company_age(y, current_year=2021):
    """Return a company's age in years from its founding year.

    Parameters
    ----------
    y : object
        Founding year as an int or numeric string (e.g. ``'1982'``). The
        placeholder ``-1`` / ``'-1'``, NaN, and any non-numeric text all
        count as "unknown".
    current_year : int, optional
        Year the age is measured against. Defaults to 2021, the value that
        was previously hard-coded.

    Returns
    -------
    int
        ``current_year - founding_year``, or ``-1`` when the founding year is
        unknown, non-positive, or later than ``current_year``.
    """
    try:
        founded = int(y)
    except (TypeError, ValueError, OverflowError):
        # Non-numeric text, None, NaN or infinity.
        return -1
    # Reject the -1 placeholder in numeric form too; checking only against the
    # string '-1' would turn an integer -1 into an age of current_year + 1.
    if founded <= 0 or founded > current_year:
        return -1
    return current_year - founded
    

# Salary bounds and their midpoint, taken from the cleaned range text.
# NOTE(review): rows flagged `hourly` appear to keep per-hour figures while the
# others are in thousands per year -- confirm the units before modelling.
df['min_salary'] = salary_range.apply(lambda rng: int(rng.split('-')[0]))
df['max_salary'] = salary_range.apply(get_max_salary)
df['avg_salary'] = df['min_salary'].add(df['max_salary']).div(2)

# State (second comma-separated field) of the job location.
df['job_state'] = df['Location'].apply(get_location)

# Company age derived from the Founded column (-1 when it cannot be computed).
df['age'] = df['Founded'].apply(company_age)
df

(4917, 12)


Unnamed: 0,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Size,Founded,Type of ownership,Industry,Sector,Revenue,hourly,employer_provided,min_salary,max_salary,avg_salary,job_state
0,Data Scientist,$74K - $139K,Intro (Use Font Arial 12):\nAs a Data Scientis...,4.4,Applied Information Sciences,"Chevy Chase, MD",501 to 1000 Employees,1982,1982,IT Services,Information Technology,$50 to $100 million (USD),0,0,74,139,106.5,MD
2,Web Developer,$41K - $83K,We are the fastest growing employer of emergin...,4.0,Revature,"Tampa, FL",1001 to 5000 Employees,2003,2003,IT Services,Information Technology,$100 to $500 million (USD),0,0,41,83,62.0,FL
5,Data Analyst,$43K - $88K,"HR Data Analyst - Direct Hire - Buffalo, NY\nO...",4.1,Systems Personnel,"Niagara Falls, NY",1 to 50 Employees,1996,1996,Staffing & Outsourcing,Business Services,$5 to $10 million (USD),0,0,43,88,65.5,NY
7,Database Administrator,$68K - $136K,"Secure our Nation, Ignite your Future\nBecome ...",4.2,ManTech International Corporation,"Fort Meade, MD",5001 to 10000 Employees,1968,1968,Research & Development,Business Services,$1 to $2 billion (USD),0,0,68,136,102.0,MD
9,iOS Developer,$72K - $104K,Are you an iOS developer who loves to create m...,4.1,Garmin,"Olathe, KS",10000+ Employees,1989,1989,Consumer Products Manufacturing,Manufacturing,$2 to $5 billion (USD),0,0,72,104,88.0,KS
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7974,Front End Developer,$58K - $117K,US Citizenship is required\nFront end Develope...,-1.0,EDR Technology,"Ashburn, VA",-1,-1,-1,-1,-1,-1,0,0,58,117,87.5,VA
7975,Sr Software Quality Engineer,$78K - $147K,PRA Health Sciences is seeking the best and br...,4.0,PRA Health Sciences,"San Diego, CA",10000+ Employees,1976,1976,Biotech & Pharmaceuticals,Biotech & Pharmaceuticals,$2 to $5 billion (USD),0,0,78,147,112.5,CA
7977,Lead Mobile Engineer,$79K - $162K,About Us\nWe are a fintech startup with a miss...,-1.0,Treecard,"Austin, TX",-1,-1,-1,-1,-1,-1,0,0,79,162,120.5,TX
7978,Cloud Infrastructure Engineer (SRE/Java),$61K - $132K,"Every day, Global Payments makes it possible f...",3.5,Global Payments,"Columbus, GA",10000+ Employees,1967,1967,Financial Transaction Processing,Finance,$5 to $10 billion (USD),0,0,61,132,96.5,GA


In [9]:
# df.job_state.value_counts()



        


---- 1982 ------
39
---- 2003 ------
18
---- 1996 ------
25
---- 1968 ------
53
---- 1989 ------
32
---- 1994 ------
27
---- 1992 ------
29
---- Company - Private ------
-1
---- 1850 ------
171
---- 1850 ------
171
---- 1956 ------
65
---- 1850 ------
171
---- 1942 ------
79
---- 1942 ------
79
---- 1850 ------
171
---- 1977 ------
44
---- 1850 ------
171
---- 2002 ------
19
---- Government ------
-1
---- 1939 ------
82
---- 1937 ------
84
---- 1970 ------
51
---- 1985 ------
36
---- 1980 ------
41
---- 1850 ------
171
---- 1850 ------
171
---- 1987 ------
34
---- 1980 ------
41
---- 1963 ------
58
---- 1946 ------
75
---- Company - Public ------
-1
---- -1 ------
-1
---- -1 ------
-1
---- Company - Private ------
-1
---- 2003 ------
18
---- 1998 ------
23
---- 1980 ------
41
---- 1850 ------
171
---- 2000 ------
21
---- 1842 ------
179
---- 1975 ------
46
---- 1912 ------
109
---- 1994 ------
27
---- 1980 ------
41
---- 1942 ------
79
---- Company - Private ------
-1
---- Company - Pr

---- 1966 ------
55
---- 1985 ------
36
---- -1 ------
-1
---- 1911 ------
110
---- 1974 ------
47
---- 2001 ------
20
---- -1 ------
-1
---- -1 ------
-1
---- 2007 ------
14
---- 1911 ------
110
---- 1940 ------
81
---- 1980 ------
41
---- 1978 ------
43
---- 1912 ------
109
---- 2005 ------
16
---- 2001 ------
20
---- 1998 ------
23
---- Company - Private ------
-1
---- Company - Private ------
-1
---- Company - Private ------
-1
---- Company - Public ------
-1
---- 1984 ------
37
---- Company - Private ------
-1
---- -1 ------
-1
---- -1 ------
-1
---- 1946 ------
75
---- 1952 ------
69
---- 1916 ------
105
---- 1963 ------
58
---- Company - Private ------
-1
---- 1916 ------
105
---- 1989 ------
32
---- 1946 ------
75
---- 2005 ------
16
---- Company - Private ------
-1
---- 1984 ------
37
---- Company - Public ------
-1
---- -1 ------
-1
---- -1 ------
-1
---- 1962 ------
59
---- -1 ------
-1
---- 1997 ------
24
---- 1986 ------
35
---- Government ------
-1
---- 1963 ------
58
---

---- Contract ------
-1
---- 2002 ------
19
---- -1 ------
-1
---- 1975 ------
46
---- 2000 ------
21
---- Company - Private ------
-1
---- 1885 ------
136
---- 1885 ------
136
---- -1 ------
-1
---- 2003 ------
18
---- -1 ------
-1
---- 1984 ------
37
---- Company - Public ------
-1
---- Company - Private ------
-1
---- 1878 ------
143
---- 1976 ------
45
---- 2003 ------
18
---- 1966 ------
55
---- -1 ------
-1
---- -1 ------
-1
---- -1 ------
-1
---- 1980 ------
41
---- 1955 ------
66
---- -1 ------
-1
---- Contract ------
-1
---- 1974 ------
47
---- 1999 ------
22
---- 1850 ------
171
---- -1 ------
-1
---- Company - Private ------
-1
---- Company - Private ------
-1
---- 2001 ------
20
---- -1 ------
-1
---- Company - Private ------
-1
---- 1997 ------
24
---- 1979 ------
42
---- Company - Private ------
-1
---- Company - Private ------
-1
---- 1911 ------
110
---- -1 ------
-1
---- -1 ------
-1
---- 1978 ------
43
---- Company - Public ------
-1
---- 1988 ------
33
---- Company -

---- -1 ------
-1
---- 1989 ------
32
---- 2007 ------
14
---- -1 ------
-1
---- Contract ------
-1
---- 2012 ------
9
---- Company - Public ------
-1
---- -1 ------
-1
---- -1 ------
-1
---- Company - Private ------
-1
---- -1 ------
-1
---- 1996 ------
25
---- 1994 ------
27
---- Private Practice / Firm ------
-1
---- Company - Private ------
-1
---- 1999 ------
22
---- -1 ------
-1
---- -1 ------
-1
---- -1 ------
-1
---- 1996 ------
25
---- 2013 ------
8
---- -1 ------
-1
---- 2010 ------
11
---- 1996 ------
25
---- -1 ------
-1
---- Company - Private ------
-1
---- 1984 ------
37
---- 1969 ------
52
---- 2016 ------
5
---- 1996 ------
25
---- -1 ------
-1
---- Unknown ------
-1
---- 2004 ------
17
---- 1990 ------
31
---- -1 ------
-1
---- Contract ------
-1
---- -1 ------
-1
---- -1 ------
-1
---- -1 ------
-1
---- 1962 ------
59
---- 1986 ------
35
---- 2012 ------
9
---- 2009 ------
12
---- 1980 ------
41
---- -1 ------
-1
---- 2009 ------
12
---- 1837 ------
184
---- 1980 ----

---- 1811 ------
210
---- 1886 ------
135
---- 1920 ------
101
---- 1876 ------
145
---- 1999 ------
22
---- 2015 ------
6
---- 2011 ------
10
---- -1 ------
-1
---- 2007 ------
14
---- 2011 ------
10
---- 2016 ------
5
---- Company - Private ------
-1
---- 1962 ------
59
---- 1850 ------
171
---- nan ------
-1
---- Nonprofit Organization ------
-1
---- -1 ------
-1
---- 2003 ------
18
---- 1850 ------
171
---- Company - Public ------
-1
---- Company - Private ------
-1
---- -1 ------
-1
---- 1987 ------
34
---- Nonprofit Organization ------
-1
---- 1888 ------
133
---- 1992 ------
29
---- -1 ------
-1
---- 1968 ------
53
---- Company - Private ------
-1
---- -1 ------
-1
---- 1986 ------
35
---- 1976 ------
45
---- 2005 ------
16
---- 2005 ------
16
---- 1960 ------
61
---- Company - Private ------
-1
---- -1 ------
-1
---- 1955 ------
66
---- 1996 ------
25
---- -1 ------
-1
---- 1911 ------
110
---- 1955 ------
66
---- 1955 ------
66
---- -1 ------
-1
---- 1986 ------
35
---- -1 ---

---- -1 ------
-1
---- 1993 ------
28
---- -1 ------
-1
---- 2008 ------
13
---- -1 ------
-1
---- -1 ------
-1
---- 1980 ------
41
---- 1917 ------
104
---- -1 ------
-1
---- -1 ------
-1
---- 2014 ------
7
---- 1994 ------
27
---- 1986 ------
35
---- -1 ------
-1
---- 1975 ------
46
---- Government ------
-1
---- Government ------
-1
---- 1968 ------
53
---- -1 ------
-1
---- 1874 ------
147
---- 1850 ------
171
---- Company - Private ------
-1
---- 1983 ------
38
---- Self-employed ------
-1
---- nan ------
-1
---- Self-employed ------
-1
---- 2000 ------
21
---- 2000 ------
21
---- 2000 ------
21
---- 2000 ------
21
---- 2000 ------
21
---- 2000 ------
21
---- 2008 ------
13
---- 2000 ------
21
---- 2000 ------
21
---- 2000 ------
21
---- 1956 ------
65
---- 2000 ------
21
---- 2000 ------
21
---- 2000 ------
21
---- 2000 ------
21
---- 2000 ------
21
---- 1977 ------
44
---- 2000 ------
21
---- 2010 ------
11
---- 1997 ------
24
---- 1977 ------
44
---- 1977 ------
44
---- 1977 --

---- Company - Private ------
-1
---- -1 ------
-1
---- Company - Public ------
-1
---- 2001 ------
20
---- 1996 ------
25
---- -1 ------
-1
---- Company - Public ------
-1
---- Company - Private ------
-1
---- Company - Private ------
-1
---- 1962 ------
59
---- 1996 ------
25
---- Company - Private ------
-1
---- Company - Private ------
-1
---- 1957 ------
64
---- -1 ------
-1
---- Company - Private ------
-1
---- 1967 ------
54
---- 2011 ------
10
---- 2012 ------
9
---- Company - Private ------
-1
---- Company - Private ------
-1
---- 1984 ------
37
---- 2015 ------
6
---- 1931 ------
90
---- 1996 ------
25
---- 1946 ------
75
---- 2006 ------
15
---- 1977 ------
44
---- Company - Private ------
-1
---- 1994 ------
27
---- 1998 ------
23
---- Company - Private ------
-1
---- Company - Public ------
-1
---- Company - Private ------
-1
---- -1 ------
-1
---- 2001 ------
20
---- 2007 ------
14
---- 2015 ------
6
---- Company - Private ------
-1
---- 1998 ------
23
---- 2012 ------
9


-1
---- 2013 ------
8
---- -1 ------
-1
---- -1 ------
-1
---- 2015 ------
6
---- Company - Private ------
-1
---- 2001 ------
20
---- Nonprofit Organization ------
-1
---- Company - Public ------
-1
---- 1922 ------
99
---- 2009 ------
12
---- 2019 ------
2
---- Company - Private ------
-1
---- Company - Private ------
-1
---- 1977 ------
44
---- Company - Private ------
-1
---- 2000 ------
21
---- -1 ------
-1
---- 1967 ------
54
---- 1976 ------
45
---- 2004 ------
17
---- 2000 ------
21
---- 2014 ------
7
---- 1989 ------
32
---- Company - Private ------
-1
---- 1923 ------
98
---- -1 ------
-1
---- -1 ------
-1
---- 2009 ------
12
---- Company - Private ------
-1
---- 2015 ------
6
---- Company - Private ------
-1
---- 1968 ------
53
---- 1994 ------
27
---- 1967 ------
54
---- Company - Private ------
-1
---- Company - Private ------
-1
---- 2015 ------
6
---- -1 ------
-1
---- -1 ------
-1
---- 2001 ------
20
---- Company - Public ------
-1
---- Company - Private ------
-1
---- 

---- 1988 ------
33
---- 1988 ------
33
---- 1966 ------
55
---- 1929 ------
92
---- 1980 ------
41
---- 1960 ------
61
---- 2015 ------
6
---- 1982 ------
39
---- 1921 ------
100
---- 2016 ------
5
---- 1994 ------
27
---- Company - Private ------
-1
---- Company - Private ------
-1
---- -1 ------
-1
---- 1955 ------
66
---- -1 ------
-1
---- 1958 ------
63
---- 1988 ------
33
---- Company - Private ------
-1
---- Company - Private ------
-1
---- 1977 ------
44
---- 1929 ------
92
---- 1982 ------
39
---- 1965 ------
56
---- 1958 ------
63
---- 1965 ------
56
---- 1999 ------
22
---- 1977 ------
44
---- 1965 ------
56
---- 2015 ------
6
---- 1924 ------
97
---- nan ------
-1
---- Company - Private ------
-1
---- 1947 ------
74
---- Contract ------
-1
---- -1 ------
-1
---- Company - Public ------
-1
---- -1 ------
-1
---- 1994 ------
27
---- Company - Private ------
-1
---- -1 ------
-1
---- 1976 ------
45
---- -1 ------
-1
---- Company - Private ------
-1
---- Company - Private -----

---- 1998 ------
23
---- 2008 ------
13
---- 1899 ------
122
---- -1 ------
-1
---- 1974 ------
47
---- 1849 ------
172
---- -1 ------
-1
---- -1 ------
-1
---- 2010 ------
11
---- 1937 ------
84
---- 1917 ------
104
---- 2015 ------
6
---- 1986 ------
35
---- 1902 ------
119
---- 1924 ------
97
---- 1946 ------
75
---- 1923 ------
98
---- -1 ------
-1
---- 2016 ------
5
---- Company - Private ------
-1
---- 1966 ------
55
---- 1792 ------
229
---- 1916 ------
105
---- 1984 ------
37
---- 1927 ------
94
---- 1992 ------
29
---- 1933 ------
88
---- 1933 ------
88
---- Franchise ------
-1
---- nan ------
-1
---- nan ------
-1
---- 1998 ------
23
---- 1932 ------
89
---- -1 ------
-1
---- Company - Private ------
-1
---- -1 ------
-1
---- 1991 ------
30
---- Company - Public ------
-1
---- 1994 ------
27
---- -1 ------
-1
---- Company - Private ------
-1
---- -1 ------
-1
---- 1980 ------
41
---- 1792 ------
229
---- 1985 ------
36
---- 1983 ------
38
---- 2005 ------
16
---- 2010 ------


-1
---- -1 ------
-1
---- 1999 ------
22
---- 2000 ------
21
---- 2006 ------
15
---- Company - Private ------
-1
---- 1966 ------
55
---- Company - Private ------
-1
---- Company - Private ------
-1
---- 2012 ------
9
---- 1969 ------
52
---- 2000 ------
21
---- 2003 ------
18
---- Unknown ------
-1
---- 2003 ------
18
---- 1997 ------
24
---- Company - Public ------
-1
---- 2003 ------
18
---- 2003 ------
18
---- Company - Private ------
-1
---- Company - Private ------
-1
---- 1989 ------
32
---- Company - Private ------
-1
---- 1939 ------
82
---- 2020 ------
1
---- Company - Private ------
-1
---- -1 ------
-1
---- 1982 ------
39
---- 2010 ------
11
---- Company - Private ------
-1
---- 1908 ------
113
---- 1999 ------
22
---- Company - Public ------
-1
---- 2018 ------
3
---- 2003 ------
18
---- -1 ------
-1
---- 2001 ------
20
---- 1792 ------
229
---- 1969 ------
52
---- 1911 ------
110
---- Company - Public ------
-1
---- 1946 ------
75
---- -1 ------
-1
---- 1792 ------
229
-

0       None
2       None
5       None
7       None
9       None
        ... 
7974    None
7975    None
7977    None
7978    None
7983    None
Name: Founded, Length: 4917, dtype: object

Unnamed: 0,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Size,Founded,Type of ownership,Industry,Sector,Revenue,hourly,employer_provided,min_salary,max_salary,avg_salary
5617,Assistant Manager $17.50-$18.50/hr Full & Part...,$31K - $59K,Job Summary\nJersey Mike's Subs in Mukilteo is...,3.8,Jersey Mike's Subs - Mukilteo,"Mukilteo, WA",10000+ Employees,1956,1956,Fast-Food & Quick-Service Restaurants,"Restaurants, Bars & Food Services",$5 to $10 million (USD),0,0,31,59,45.0
6291,Senior Credit Analyst,$42K - $48K,SUMMARY\nThe Senior Credit Analyst provides ex...,2.6,The Farmers National Bank of Emlenton,"Reynoldsville, PA",1 to 50 Employees,Subsidiary or Business Segment,Subsidiary or Business Segment,Finance,$25 to $50 million (USD),-1,0,0,42,48,45.0
669,Senior Software Engineer,$79K - $145K,"As a Senior Software Engineer, you will be wor...",-1.0,Oxi Fresh Franchising Co,"Lakewood, CO",-1,-1,-1,-1,-1,-1,0,0,79,145,112.0
3747,Senior Java Developer,$62K - $127K,"Position: Senior Java Developer\nSalary: $90,0...",-1.0,Route1 Inc.,"Boca Raton, FL",-1,-1,-1,-1,-1,-1,0,0,62,127,94.5
2234,Data Analyst,$31K - $70K,FSA TPA POSITION PROFILE\nJOB TITLE: Data Anal...,-1.0,FSA TPA LLC,"Atmore, AL",-1,-1,-1,-1,-1,-1,0,0,31,70,50.5
3987,Android Developer,$76K - $174K,The Android Developer will play a critical rol...,-1.0,ION,"New York, NY",Unknown,Company - Public,Company - Public,-1,-1,-1,0,0,76,174,125.0
5191,Sr. iOS Developer,$70K - $145K,Looking for Sr. IOS developer with 8-10 Years ...,-1.0,"Skytech Services, Inc","Houston, TX",-1,-1,-1,-1,-1,-1,0,0,70,145,107.5
136,Clinical Lab Scientist l (Full-Time),$21 - $34 Per Hour,JOB DESCRIPTION &\nCOMPETENCY EVALUATION\nPOSI...,3.3,Oak Valley Hospital District,"Oakdale, CA",201 to 500 Employees,Hospital,Hospital,Health Care,$50 to $100 million (USD),-1,1,0,21,34,27.5
2692,Business Intelligence Analyst,$55K - $112K,The Business Intelligence Analyst reports dire...,-1.0,Ovation Financial Services,"San Antonio, TX",-1,-1,-1,-1,-1,-1,0,0,55,112,83.5
5949,Subway Sandwich Artist - $12/hr,$20K - $30K,"*$200 Bonus after 90 Days!!\nTo APPLY Text ""Pi...",3.4,Pilot Flying J,"Tremonton, UT",10000+ Employees,1958,1958,Gas Stations,Retail,$10+ billion (USD),0,0,20,30,25.0


Unnamed: 0,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Size,Founded,Type of ownership,Industry,Sector,Revenue,hourly,employer_provided,min_salary,max_salary
0,Data Scientist,$74K - $139K,Intro (Use Font Arial 12):\nAs a Data Scientis...,4.4,Applied Information Sciences,"Chevy Chase, MD",501 to 1000 Employees,1982,1982,IT Services,Information Technology,$50 to $100 million (USD),0,0,74000,139000
2,Web Developer,$41K - $83K,We are the fastest growing employer of emergin...,4.0,Revature,"Tampa, FL",1001 to 5000 Employees,2003,2003,IT Services,Information Technology,$100 to $500 million (USD),0,0,41000,83000
5,Data Analyst,$43K - $88K,"HR Data Analyst - Direct Hire - Buffalo, NY\nO...",4.1,Systems Personnel,"Niagara Falls, NY",1 to 50 Employees,1996,1996,Staffing & Outsourcing,Business Services,$5 to $10 million (USD),0,0,43000,88000
7,Database Administrator,$68K - $136K,"Secure our Nation, Ignite your Future\nBecome ...",4.2,ManTech International Corporation,"Fort Meade, MD",5001 to 10000 Employees,1968,1968,Research & Development,Business Services,$1 to $2 billion (USD),0,0,68000,136000
9,iOS Developer,$72K - $104K,Are you an iOS developer who loves to create m...,4.1,Garmin,"Olathe, KS",10000+ Employees,1989,1989,Consumer Products Manufacturing,Manufacturing,$2 to $5 billion (USD),0,0,72000,104000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7977,Lead Mobile Engineer,$79K - $162K,About Us\nWe are a fintech startup with a miss...,-1.0,Treecard,"Austin, TX",-1,-1,-1,-1,-1,-1,0,0,79000,162000
7978,Cloud Infrastructure Engineer (SRE/Java),$61K - $132K,"Every day, Global Payments makes it possible f...",3.5,Global Payments,"Columbus, GA",10000+ Employees,1967,1967,Financial Transaction Processing,Finance,$5 to $10 billion (USD),0,0,61000,132000
7980,Research Scientist/Engineer – Cyber Security,$69K - $135K,IAI is looking for self-motivated Research Sci...,3.8,Intelligent Automation,"Rockville, MD",51 to 200 Employees,1987,1987,"Health, Beauty, & Fitness",Consumer Services,$25 to $50 million (USD),0,0,69000,135000
7982,PEGA Systems Analyst,$50K - $96K,"Secure our Nation, Ignite your Future\nBecome ...",4.2,ManTech International Corporation,"Clarksburg, WV",5001 to 10000 Employees,1968,1968,Research & Development,Business Services,$1 to $2 billion (USD),0,0,50000,96000
