In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the scraped postings; the literal string -1 is read as a missing value (NaN).
df = pd.read_csv(
    'temp/glassdoor data scientist salary.csv',
    na_values=['-1'],
)

In [3]:
# Preview the first five rows to sanity-check the load.
df.head()

Unnamed: 0,Company,Location,Title,Rating,Salary,Description,Founded,Size,Industry,Sector,Type,Revenue
0,Big Fish Games\n3.2,"Oakland, CA",Senior Data Scientist,3.2,$109K - $175K (Glassdoor est.),Job Posting Title\nSenior Data Scientist\nSumm...,2002.0,501 to 1000 Employees,Video Games,Media,Subsidiary or Business Segment,$50 to $100 million (USD)
1,GEICO\n3.5,"Chevy Chase, MD",Principal Data Scientist,3.5,$104K - $170K (Glassdoor est.),"Working out of our Chevy Chase, MD/Washington ...",1936.0,10000+ Employees,Insurance Carriers,Insurance,Subsidiary or Business Segment,$10+ billion (USD)
2,Nitto Denko Avecia\n2.9,"Marlborough, MA",AD Research Scientist-Level TBD Virtual Hiring...,2.9,,"Nitto Denko Avecia, Inc. Virtual Hiring Event\...",,201 to 500 Employees,Biotech & Pharmaceuticals,Biotech & Pharmaceuticals,Company - Private,$10+ billion (USD)
3,Schreiber Foods\n3.7,"Green Bay, WI",Research Scientist,3.7,$54K - $114K (Glassdoor est.),*\nWork as part of the US/LATAM Product Develo...,1945.0,5001 to 10000 Employees,Food & Beverage Manufacturing,Manufacturing,Company - Private,$2 to $5 billion (USD)
4,IsoPlexis\n3.7,"Branford, CT",Senior Scientist,3.7,,Senior Scientist – Molecular Biology\nCompany ...,,1 to 50 Employees,,,Company - Private,Less than $1 million (USD)


In [4]:
# Show each column's dtype and non-null count to see where values are missing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Company      99 non-null     object 
 1   Location     99 non-null     object 
 2   Title        99 non-null     object 
 3   Rating       89 non-null     float64
 4   Salary       88 non-null     object 
 5   Description  99 non-null     object 
 6   Founded      77 non-null     float64
 7   Size         91 non-null     object 
 8   Industry     87 non-null     object 
 9   Sector       87 non-null     object 
 10  Type         91 non-null     object 
 11  Revenue      91 non-null     object 
dtypes: float64(2), object(10)
memory usage: 9.5+ KB


## Salary parsing

In [5]:
# Keep only postings that have both a salary and a company name, then
# renumber the remaining rows 0..n-1 (the old index is discarded).
df = (
    df
    .dropna(subset=['Salary', 'Company'])
    .reset_index(drop=True)
)

In [6]:
# 0/1 indicator columns derived from a case-insensitive substring check on the
# raw salary text: is the figure quoted per hour, and does the text mention "employer"?
salary_lower = df['Salary'].str.lower()

per_hour = salary_lower.str.contains('per hour', regex=False).astype('int64')
employer = salary_lower.str.contains('employer', regex=False).astype('int64')

df['Hourly'] = per_hour
df['Employer Provided'] = employer

In [7]:
def salary_parse(salary, hours_per_year=40 * 50):
    """Convert a salary string into ``[low, high]`` annual dollar amounts.

    Handled formats::

        '$109K - $175K (Glassdoor est.)'              -> [109000, 175000]
        '$18 - $33 Per Hour(Glassdoor est.)'          -> [36000, 66000]
        'Employer Provided Salary:$80K - $120K'       -> [80000, 120000]
        'Employer Provided Salary:$25 - $30 Per Hour' -> [50000, 60000]

    Parameters
    ----------
    salary : str
        Raw salary text. Missing values must be dropped before calling.
    hours_per_year : int, optional
        Factor used to annualise hourly figures. Defaults to 40 hours a week
        for 50 weeks (2000).

    Returns
    -------
    list of int
        ``[low, high]``. When the text holds a single figure it is used for
        both bounds, so indexing ``[1]`` is always safe.

    Raises
    ------
    ValueError
        If no numeric amount can be found in ``salary``.
    """
    import re

    # Drop the trailing "(... est.)" source tag and any "Label:" prefix such as
    # "Employer Provided Salary:" so that only the amounts remain.
    amounts_text = salary.split('(')[0].split(':')[-1]

    figures = [float(num) for num in re.findall(r'\d+(?:\.\d+)?', amounts_text)]
    if not figures:
        raise ValueError(f'no salary amount found in {salary!r}')

    # Hourly figures are annualised; every other figure is quoted in thousands ("K").
    is_hourly = 'per hour' in salary.lower()
    multiplier = hours_per_year if is_hourly else 1000

    low, high = figures[0], figures[-1]
    return [int(round(low * multiplier)), int(round(high * multiplier))]

In [8]:
# Parse every salary string once and reuse the [low, high] pairs for both
# bound columns instead of running the parser over the column twice.
salary_bounds = df['Salary'].apply(salary_parse)

df['Low Salary'] = salary_bounds.apply(lambda bounds: bounds[0])
df['High Salary'] = salary_bounds.apply(lambda bounds: bounds[1])
# Midpoint of the range, truncated to whole dollars.
df['Average Salary'] = ((df['Low Salary'] + df['High Salary']) / 2).astype(int)

# Spot-check the derived columns; a fixed seed keeps the displayed rows stable
# across Restart & Run All.
df[['Salary', 'Hourly', 'Employer Provided', 'Low Salary', 'High Salary',
    'Average Salary']].sample(10, random_state=0)

Unnamed: 0,Salary,Hourly,Employer Provided,Low Salary,High Salary,Average Salary
84,$65K - $108K (Glassdoor est.),0,0,65000,108000,86500
25,$89K - $145K (Glassdoor est.),0,0,89000,145000,117000
51,$69K - $116K (Glassdoor est.),0,0,69000,116000,92500
69,$69K - $116K (Glassdoor est.),0,0,69000,116000,92500
53,$69K - $116K (Glassdoor est.),0,0,69000,116000,92500
32,$89K - $145K (Glassdoor est.),0,0,89000,145000,117000
65,$69K - $116K (Glassdoor est.),0,0,69000,116000,92500
82,$65K - $108K (Glassdoor est.),0,0,65000,108000,86500
76,$69K - $116K (Glassdoor est.),0,0,69000,116000,92500
7,$18 - $33 Per Hour(Glassdoor est.),1,0,36000,66000,51000


## City & State parsing

In [9]:
def _city_from_location(location):
    """Pick the city out of a ', '-separated location string.

    Three parts -> the middle part; two parts -> the first part;
    anything else (e.g. a single word) -> NaN.
    """
    parts = location.split(', ')
    if len(parts) == 3:
        return parts[1]
    if len(parts) == 2:
        return parts[0]
    return np.nan


# One pass over the column instead of a row-by-row .loc loop that re-split
# the same string several times per row.
df['City'] = df['Location'].apply(_city_from_location)

In [10]:
# Split each location into its ', '-separated parts. A single-part location
# (no comma) is padded with NaN so that it yields a missing state.
place = df['Location'].apply(
    lambda x: x.split(', ') if ', ' in x else [x, np.nan]
)

# The state is the LAST part. Taking index 1 would return the middle part for
# three-part locations, which is the element the City parsing treats as the city.
# Using apply (rather than positional .loc lookups) also avoids depending on
# the index being exactly 0..n-1.
df['State'] = place.apply(lambda parts: parts[-1])

In [11]:
# Drop postings located only as 'United States' (no city/state information).
# .copy() makes the result an independent frame so the .loc assignment below
# writes to it directly instead of to a slice of the previous frame
# (avoids SettingWithCopyWarning).
df = df[df['Location'] != 'United States'].copy()

# Remote postings get the label 'Remote' for both City and State.
is_remote = df['Location'] == 'Remote'
df.loc[is_remote, ['State', 'City']] = 'Remote'

In [12]:
# Lookup table of full state names and their abbreviations
# (one "<State> - <Abbreviation>" pair per line, no header row).
states = pd.read_csv('temp/us-states.csv', sep = ' - ', engine = 'python', header = None,
                     names = ['State', 'Abbreviation'])

# Where Location is just a full state name, store that state's abbreviation in
# State. The lookup is built once (first occurrence wins if a name repeats)
# and applied to the whole column, rather than rebuilding the list of state
# names for every row.
abbrev_by_state = states.drop_duplicates('State').set_index('State')['Abbreviation']
is_state_name = df['Location'].isin(abbrev_by_state.index)
df.loc[is_state_name, 'State'] = df.loc[is_state_name, 'Location'].map(abbrev_by_state)

In [13]:
# Spot-check the parsed location columns; a fixed seed keeps the displayed rows
# stable across Restart & Run All.
df[['Location', 'City', 'State']].sample(10, random_state=0)

Unnamed: 0,Location,City,State
47,"Seattle, WA",Seattle,WA
72,"Boston, MA",Boston,MA
19,"Austin, TX",Austin,TX
9,"Ipswich, MA",Ipswich,MA
15,"Allegan, MI",Allegan,MI
41,"Boston, MA",Boston,MA
31,"Las Vegas, NV",Las Vegas,NV
14,"Jacksonville, FL",Jacksonville,FL
49,"Boston, MA",Boston,MA
67,"Englewood, CO",Englewood,CO


## Company name text only

## Age of company

## Parsing of job description