In [1]:
import re

import numpy as np
import pandas as pd

In [2]:
# Read the postings CSV; the literal string '-1' is parsed as a missing value.
df = pd.read_csv(
    'temp/glassdoor data scientist salary.csv',
    na_values='-1',
)

In [3]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Size,Founded,Type of ownership,Industry,Sector,Revenue
0,Data Scientist - Intermediate,,KellyMitchell matches the best IT and business...,4.1,KellyMitchell\n4.1,"Saint Louis, MO",1001 to 5000 Employees,1998.0,Company - Private,Staffing & Outsourcing,Business Services,$50 to $100 million (USD)
1,Data Scientist,$87K - $142K (Glassdoor est.),"At Farm Credit Mid-America, we believe people ...",4.1,Farm Credit Mid-America\n4.1,"Louisville, KY",1001 to 5000 Employees,1985.0,Company - Private,Lending,Finance,$100 to $500 million (USD)
2,Research Scientist,,Job Description:\nOur Reston office is seeking...,3.0,"Metron, Inc.\n3.0","Reston, VA",51 to 200 Employees,,Company - Private,Electrical & Electronic Manufacturing,Manufacturing,$25 to $50 million (USD)
3,Junior Data Scientist,,"Chubb is seeking a creative, innovative, and c...",,WhiteHat,"New York, NY",,,,,,
4,Associate Clinical Data Scientist (Healthcare ...,$58K - $98K (Glassdoor est.),Job Summary\nThe Associate Clinical Data Scien...,2.6,RadNet\n2.6,"Cambridge, MA",5001 to 10000 Employees,1984.0,Company - Public,Health Care Services & Hospitals,Health Care,$500 million to $1 billion (USD)


In [4]:
# Print the per-column dtypes and non-null counts of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2500 entries, 0 to 2499
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Job Title          2500 non-null   object 
 1   Salary Estimate    1923 non-null   object 
 2   Job Description    2499 non-null   object 
 3   Rating             2272 non-null   float64
 4   Company Name       2500 non-null   object 
 5   Location           2500 non-null   object 
 6   Size               2178 non-null   object 
 7   Founded            1891 non-null   float64
 8   Type of ownership  2178 non-null   object 
 9   Industry           2065 non-null   object 
 10  Sector             2065 non-null   object 
 11  Revenue            2176 non-null   object 
dtypes: float64(2), object(10)
memory usage: 234.5+ KB


## Salary parsing

In [5]:
# Rows without a salary estimate cannot be parsed further down, so drop them
# and renumber the index from 0. The result is rebound to `df` in one chained
# expression rather than mutating the frame with inplace=True.
df = df.dropna(subset=['Salary Estimate']).reset_index(drop=True)

In [6]:
# 0/1 indicator columns derived from the raw salary text; the match is
# case-insensitive because the text is lower-cased first.
salary_text = df['Salary Estimate'].str.lower()
per_hour = salary_text.str.contains('per hour', regex=False).astype('int64')
employer = salary_text.str.contains('employer', regex=False).astype('int64')

df['Per Hour'], df['Employer Provided'] = per_hour, employer

In [7]:
def salary_parse(salary, hours_per_week=40, weeks_per_year=50):
    """Convert a salary-estimate string into ``[lower, higher]`` yearly amounts.

    Every whole-number dollar amount (``$87``, ``$142K``, ...) is pulled out of
    the text, so surrounding labels such as ``(Glassdoor est.)`` or a leading
    ``Employer Provided Salary:`` do not matter.

    Parameters
    ----------
    salary : str
        Raw estimate, e.g. ``'$87K - $142K (Glassdoor est.)'`` or
        ``'$21 - $35 Per Hour(Glassdoor est.)'``.
    hours_per_week, weeks_per_year : int, optional
        Used to annualize hourly rates. The defaults (40 and 50) reproduce the
        original ``* 40 * 50`` conversion.

    Returns
    -------
    list of int
        ``[lower, higher]``. Hourly figures are multiplied by
        ``hours_per_week * weeks_per_year``; all other figures are treated as
        thousands and multiplied by 1000. A single quoted amount is returned
        as both bounds.

    Raises
    ------
    ValueError
        If no dollar amount is found in ``salary``.
    """
    # Same case-insensitive test the 'Per Hour' flag column uses.
    is_hourly = 'per hour' in salary.lower()

    amounts = [int(digits) for digits in re.findall(r'\$\s*(\d+)', salary)]
    if not amounts:
        raise ValueError(f'no dollar amount found in salary estimate: {salary!r}')
    if len(amounts) == 1:
        # A single figure: use it for both ends so callers can index [0] and [1].
        amounts = amounts * 2

    multiplier = hours_per_week * weeks_per_year if is_hourly else 1000
    return [amount * multiplier for amount in amounts[:2]]

In [8]:
# Parse every estimate once and reuse the [lower, higher] pair for both columns.
salary_bounds = df['Salary Estimate'].apply(salary_parse)
df['Salary Lower'] = salary_bounds.apply(lambda bounds: bounds[0])
df['Salary Higher'] = salary_bounds.apply(lambda bounds: bounds[1])
# Midpoint of the range; astype(int) truncates any .5 remainder.
df['Salary Average'] = ((df['Salary Lower'] + df['Salary Higher']) / 2).astype(int)
# Spot-check ten random rows against the raw text.
df[['Salary Estimate', 'Per Hour', 'Employer Provided', 'Salary Lower', 'Salary Higher',
    'Salary Average']].sample(10)

Unnamed: 0,Salary Estimate,Per Hour,Employer Provided,Salary Lower,Salary Higher,Salary Average
1306,$21 - $35 Per Hour(Glassdoor est.),1,0,42000,70000,56000
1292,$77K - $124K (Glassdoor est.),0,0,77000,124000,100500
456,$96K - $157K (Glassdoor est.),0,0,96000,157000,126500
945,$50K - $103K (Glassdoor est.),0,0,50000,103000,76500
1290,$78K - $129K (Glassdoor est.),0,0,78000,129000,103500
1055,$50K - $103K (Glassdoor est.),0,0,50000,103000,76500
1830,$85K - $144K (Glassdoor est.),0,0,85000,144000,114500
653,$10 - $26 Per Hour(Glassdoor est.),1,0,20000,52000,36000
758,$11 - $32 Per Hour(Glassdoor est.),1,0,22000,64000,43000
1242,$77K - $124K (Glassdoor est.),0,0,77000,124000,100500


## City and state fields

In [9]:
def _city_from_location(location):
    """Return the city part of a ', '-separated location string, or NaN.

    Three parts -> the middle part; two parts -> the first part;
    anything else -> NaN.
    """
    parts = location.split(', ')
    if len(parts) == 3:
        return parts[1]
    if len(parts) == 2:
        return parts[0]
    return np.nan


# One pass over the column instead of row-by-row df.loc writes.
df['City'] = df['Location'].apply(_city_from_location)

In [10]:
place = df['Location'].apply(lambda x: x.split(', '))

# Pad single-part locations with NaN so every list has at least two entries.
# Iterating over the lists themselves avoids looking rows up by position with
# .loc, which only works while the index is exactly 0..n-1.
for parts in place:
    if len(parts) == 1:
        parts.append(np.nan)

# The state is the last component. For three-part locations index 1 is the
# part the City column already uses, so taking index 1 would duplicate the
# city instead of returning the state.
df['State'] = place.apply(lambda parts: parts[-1])

In [11]:
# Drop postings located only as 'United States'. The explicit .copy() makes
# the filtered frame independent, so the .loc assignments below write to `df`
# itself and do not trigger SettingWithCopyWarning.
df = df[df['Location'] != 'United States'].copy()

# Remote postings get 'Remote' for both City and State.
is_remote = df['Location'] == 'Remote'
df.loc[is_remote, 'State'] = 'Remote'
df.loc[is_remote, 'City'] = 'Remote'

In [12]:
states = pd.read_csv('temp/us-states.csv', sep = ' - ', engine = 'python', header = None,
                     names = ['State', 'Abbreviation'])

# Build the full-name -> abbreviation lookup once instead of rebuilding a list
# of state names for every row. drop_duplicates keeps the first abbreviation
# listed for a name, matching the original `[0]` selection.
state_abbreviations = states.drop_duplicates('State').set_index('State')['Abbreviation']

# Where Location is just a full state name, store its abbreviation as State.
is_full_state_name = df['Location'].isin(state_abbreviations.index)
df.loc[is_full_state_name, 'State'] = (
    df.loc[is_full_state_name, 'Location'].map(state_abbreviations)
)

In [13]:
# Spot-check ten random rows of the derived location columns.
location_columns = ['Location', 'City', 'State']
df[location_columns].sample(n=10)

Unnamed: 0,Location,City,State
1062,"Austin, TX",Austin,TX
172,"Upton, NY",Upton,NY
1370,"Omaha, NE",Omaha,NE
898,"Milwaukee, WI",Milwaukee,WI
80,"Fort Walton Beach, FL",Fort Walton Beach,FL
1065,"Miami, FL",Miami,FL
1575,"Santa Clara, CA",Santa Clara,CA
315,"Boston, MA",Boston,MA
1046,"Seattle, WA",Seattle,WA
671,"Raleigh, NC",Raleigh,NC


## Company name text only

## Age of company

## Parsing of job description