In [1]:
import pandas as pd
import numpy as np
import time

In [2]:
df = pd.read_csv('temp/glassdoor refined.csv')

In [3]:
df.head()

Unnamed: 0,Company,Location,Title,Rating,Salary,Description,Founded,Size,Industry,Sector,Type,Revenue
0,Purchasing Power\n3.2,"Atlanta, GA",Data Scientist,3.2,$66K - $112K (Glassdoor est.),Data Scientist\nLocation\n\n\nAtlanta - Midtow...,2001.0,201 to 500 Employees,Financial Transaction Processing,Finance,Company - Private,$100 to $500 million (USD)
1,Varen Technologies\n4.2,"Annapolis Junction, MD",Data Scientist,4.2,$76K - $111K (Glassdoor est.),"At Varen, our performance is measured by the s...",2005.0,51 to 200 Employees,Enterprise Software & Network Solutions,Information Technology,Company - Private,$25 to $50 million (USD)
2,Big Fish Games\n3.2,"Oakland, CA",Senior Data Scientist,3.2,$109K - $175K (Glassdoor est.),Job Posting Title\nSenior Data Scientist\nSumm...,2002.0,501 to 1000 Employees,Video Games,Media,Subsidiary or Business Segment,$50 to $100 million (USD)
3,GEICO\n3.5,"Chevy Chase, MD",Principal Data Scientist,3.5,$104K - $170K (Glassdoor est.),"Working out of our Chevy Chase, MD/Washington ...",1936.0,10000+ Employees,Insurance Carriers,Insurance,Subsidiary or Business Segment,$10+ billion (USD)
4,Stanley Black & Decker\n3.5,"Fishers, IN",AI Data Scientist,3.5,$74K - $130K (Glassdoor est.),About Stanley Black & Decker\n\nJoining the St...,1843.0,10000+ Employees,Industrial Manufacturing,Manufacturing,Company - Public,$10+ billion (USD)


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 884 entries, 0 to 883
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Company      884 non-null    object 
 1   Location     883 non-null    object 
 2   Title        883 non-null    object 
 3   Rating       804 non-null    float64
 4   Salary       884 non-null    object 
 5   Description  883 non-null    object 
 6   Founded      700 non-null    float64
 7   Size         824 non-null    object 
 8   Industry     748 non-null    object 
 9   Sector       748 non-null    object 
 10  Type         824 non-null    object 
 11  Revenue      824 non-null    object 
dtypes: float64(2), object(10)
memory usage: 83.0+ KB


In [5]:
df.duplicated().sum()

0

In [6]:
df.dropna(subset = ['Salary', 'Company', 'Location'], inplace = True)
df.drop_duplicates(inplace = True)
df.reset_index(drop = True, inplace = True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 883 entries, 0 to 882
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Company      883 non-null    object 
 1   Location     883 non-null    object 
 2   Title        883 non-null    object 
 3   Rating       804 non-null    float64
 4   Salary       883 non-null    object 
 5   Description  883 non-null    object 
 6   Founded      700 non-null    float64
 7   Size         824 non-null    object 
 8   Industry     748 non-null    object 
 9   Sector       748 non-null    object 
 10  Type         824 non-null    object 
 11  Revenue      824 non-null    object 
dtypes: float64(2), object(10)
memory usage: 82.9+ KB


## Salary parsing

In [7]:
per_hour = df['Salary'].apply(lambda x: 1 if 'per hour' in x.lower() else 0)
employer = df['Salary'].apply(lambda x: 1 if 'employer' in x.lower() else 0)

df['Hourly'] = per_hour
df['Employer Provided'] = employer

In [8]:
def salary_parse(salary):
    salary_split = salary.split('(')[0]
    if salary_split.endswith('r'):
        salary_split = salary.split(' Per')[0].split('-')
        salary_split = [int(i.replace('$', '')) for i in salary_split]
        salary_split = [i * 40 * 50 for i in salary_split]
    else:
        salary_split = salary.split(' (')[0].split(' - ')
        salary_split = [int(i.replace('$', '').replace('K', '')) for i in salary_split]
        salary_split = [i * 1000 for i in salary_split]
    return salary_split

In [9]:
df['Low Salary'] = df['Salary'].apply(salary_parse).apply(lambda x: x[0])
df['High Salary'] = df['Salary'].apply(salary_parse).apply(lambda x: x[1])
df['Average Salary'] = ((df['Low Salary'] + df['High Salary']) / 2).astype(int)
df[['Salary', 'Hourly', 'Employer Provided', 'Low Salary', 'High Salary',\
    'Average Salary']].sample(10)

Unnamed: 0,Salary,Hourly,Employer Provided,Low Salary,High Salary,Average Salary
696,$81K - $145K (Glassdoor est.),0,0,81000,145000,113000
656,$95K - $162K (Glassdoor est.),0,0,95000,162000,128500
453,$55K - $94K (Glassdoor est.),0,0,55000,94000,74500
401,$69K - $117K (Glassdoor est.),0,0,69000,117000,93000
117,$115K - $184K (Glassdoor est.),0,0,115000,184000,149500
305,$79K - $130K (Glassdoor est.),0,0,79000,130000,104500
399,$69K - $117K (Glassdoor est.),0,0,69000,117000,93000
505,$162K - $269K (Glassdoor est.),0,0,162000,269000,215500
17,$81K - $137K (Glassdoor est.),0,0,81000,137000,109000
195,$58K - $105K (Glassdoor est.),0,0,58000,105000,81500


In [10]:
df['Hourly'].value_counts(), df['Employer Provided'].value_counts()

(0    883
 Name: Hourly, dtype: int64,
 0    851
 1     32
 Name: Employer Provided, dtype: int64)

## City & State parsing

In [11]:
for i in df.index:
    if len(df.loc[i,'Location'].split(', ')) == 3:
        df.loc[i, 'City'] = df.loc[i, 'Location'].split(', ')[1]
    elif len(df.loc[i,'Location'].split(', ')) == 2:
        df.loc[i, 'City'] = df.loc[i, 'Location'].split(', ')[0]
    else:
        df.loc[i, 'City'] = np.nan

In [12]:
place = df['Location'].apply(lambda x: x.split(', '))

for i in range(len(place)):
    if len(place.loc[i]) == 1:
        place.loc[i].append(np.nan)

df['State'] = place.apply(lambda x: x[1])

In [13]:
df = df[df['Location'] != 'United States']
df.loc[df['Location'] == 'Remote', 'State'] = 'Remote'
df.loc[df['Location'] == 'Remote', 'City'] = 'Remote'

In [14]:
states = pd.read_csv('temp/us-states.csv', sep = ' - ', engine = 'python', header = None,
                     names = ['State', 'Abbreviation'])

for i in df.index:
    if df.loc[i, 'Location'] in list(states['State'].unique()):
        state_full = df.loc[i, 'Location']
        state_abbrev = list(states[states['State'] == state_full]['Abbreviation'])[0]
        df.loc[i, 'State'] = state_abbrev

In [15]:
df[['Location', 'City', 'State']].sample(10)

Unnamed: 0,Location,City,State
333,"Fort Meade, MD",Fort Meade,MD
480,"New York, NY",New York,NY
642,"Edison, NJ",Edison,NJ
266,"Las Cruces, NM",Las Cruces,NM
537,"Norfolk, VA",Norfolk,VA
84,"San Mateo, CA",San Mateo,CA
371,"San Francisco, CA",San Francisco,CA
872,"Sunnyvale, CA",Sunnyvale,CA
69,Remote,Remote,Remote
57,"San Francisco, CA",San Francisco,CA


In [16]:
df['State'].value_counts()

CA             165
VA             114
NY              68
TX              55
WA              50
MA              43
MD              36
NJ              31
IL              31
NC              25
DC              22
MO              22
PA              22
Remote          20
OH              19
FL              17
CO              16
GA              16
MN              14
MI              12
CT               8
TN               8
AZ               7
KY               6
OR               4
AL               3
SC               3
NM               3
UT               3
RI               2
Los Angeles      2
IN               2
IA               2
KS               2
AR               2
NV               2
WI               2
ID               2
Fulton           1
NH               1
ME               1
NE               1
PR               1
Name: State, dtype: int64

## Company name text only

In [17]:
df['Company'] = df['Company'].apply(lambda x: x.split('\n')[0])
df[['Company', 'Rating']].sample(10)

Unnamed: 0,Company,Rating
669,Klaviyo,4.8
728,Varsity Tutors,4.1
521,"DirectViz Solutions, LLC",3.3
320,Apple,4.3
611,Jobot,4.9
229,Thomas Jefferson University Hospital,3.6
210,Perspecta,3.4
732,"WhirlWind Technologies, LLC",5.0
39,Averna,3.6
200,Point72,3.6


In [18]:
df['Company'].nunique()

585

## Age of company

In [19]:
df['Founded'].fillna(-1, inplace = True)
df['Founded'] = df['Founded'].astype(int)

for i in df.index:
    if df.loc[i, 'Founded'] == -1:
        df.loc[i, 'Age'] = -1
    else:
        df.loc[i, 'Age'] = int(time.localtime()[0]) - df.loc[i, 'Founded']

df['Age'] = df['Age'].astype(int)
df[['Company', 'Founded', 'Age']].sample(10)

Unnamed: 0,Company,Founded,Age
807,Grantham University,1951,70
558,insitro,-1,-1
717,Brooksource,2000,21
341,Pivot Bio,2011,10
260,CKA LLC,-1,-1
522,On-Board Companies,1976,45
690,Zel Technologies,1988,33
330,Equinox Consulting Partners LLC,-1,-1
152,KLA-Tencor,1976,45
824,Astellas Pharmaceuticals,2005,16


## Parsing of job description

In [20]:
df['Python'] = df['Description'].apply(lambda x: 1 if 'python' in x.lower() else 0)
df['R'] = df['Description'].apply(lambda x: 1 if 'rstudio' in x.lower() or 'r studio' in x.lower() else 0)
df['SQL'] = df['Description'].apply(lambda x: 1 if 'sql' in x.lower() else 0)
df['ML'] = df['Description'].apply(lambda x: 1 if 'machine learning' in x.lower() else 0)
df['DL'] = df['Description'].apply(lambda x: 1 if 'deep learning' in x.lower() else 0)
df['Excel'] = df['Description'].apply(lambda x: 1 if 'excel' in x.lower() else 0)
df['Spark'] = df['Description'].apply(lambda x: 1 if 'spark' in x.lower() else 0)
df['AWS'] = df['Description'].apply(lambda x: 1 if 'aws' in x.lower() else 0)
df['BI'] = df['Description'].apply(lambda x: 1 if 'powerbi' in x.lower() or 'power bi' in x.lower() or 'tableau' in x.lower() else 0)

df[['Python', 'R', 'SQL', 'ML', 'DL', 'Excel', 'Spark', 'AWS', 'BI']].sample(20)

Unnamed: 0,Python,R,SQL,ML,DL,Excel,Spark,AWS,BI
15,0,0,0,0,0,0,0,0,0
158,1,0,0,0,0,1,0,0,1
558,0,0,0,1,0,0,0,0,0
491,0,0,0,0,0,1,0,0,0
803,0,0,0,0,0,0,0,0,0
180,0,0,0,0,0,0,0,0,0
529,0,0,0,1,0,0,0,0,0
672,0,0,1,1,0,1,1,1,0
437,0,0,0,1,0,0,0,0,0
714,0,0,0,0,0,0,0,0,0


In [21]:
df['SQL'].value_counts(), df['R'].value_counts(), df['Python'].value_counts(), df['ML'].value_counts(),\
df['DL'].value_counts(), df['Excel'].value_counts(), df['Spark'].value_counts(), df['AWS'].value_counts(),\
df['BI'].value_counts()

(0    765
 1    102
 Name: SQL, dtype: int64,
 0    864
 1      3
 Name: R, dtype: int64,
 0    720
 1    147
 Name: Python, dtype: int64,
 0    582
 1    285
 Name: ML, dtype: int64,
 0    837
 1     30
 Name: DL, dtype: int64,
 0    743
 1    124
 Name: Excel, dtype: int64,
 0    819
 1     48
 Name: Spark, dtype: int64,
 0    802
 1     65
 Name: AWS, dtype: int64,
 0    828
 1     39
 Name: BI, dtype: int64)

In [22]:
df.to_csv('temp/glassdoor cleaned', index = False)

In [23]:
df = pd.read_csv('temp/glassdoor cleaned')
df.sample(10)

Unnamed: 0,Company,Location,Title,Rating,Salary,Description,Founded,Size,Industry,Sector,...,Age,Python,R,SQL,ML,DL,Excel,Spark,AWS,BI
282,Facebook,"Sunnyvale, CA","Data Scientist, Analytics - Business Payments",4.5,$65K - $110K (Glassdoor est.),Facebook's mission is to give people the power...,2004,10000+ Employees,Internet,Information Technology,...,17,0,0,0,0,0,0,0,0,0
686,IHME,"Seattle, WA",Data Analyst - Neonatal and Child Health,2.9,$35K - $66K (Glassdoor est.),"January 20, 2021\n\nThe Institute for Health M...",2007,201 to 500 Employees,Research & Development,Business Services,...,14,1,0,0,0,0,0,0,0,0
367,UnitedHealth Group,"Minnetonka, MN","Sr Principal Data Scientist - Minnetonka, MN",3.5,$123K - $195K (Glassdoor est.),If you want to achieve more in your mission of...,1977,10000+ Employees,Health Care Services & Hospitals,Health Care,...,44,0,0,0,1,0,0,0,0,0
602,BAT,"Winston-Salem, NC","Scientist, Clinical Data Manager",4.4,$108K - $175K (Glassdoor est.),Job Number: 29829\n\nReynolds American Incorpo...,1902,10000+ Employees,Consumer Products Manufacturing,Manufacturing,...,119,0,0,0,0,0,0,0,0,0
839,Walmart,"Bentonville, AR",Data Scientist - Sam's Club,3.3,$74K - $126K (Glassdoor est.),Position Summary...\nWhat you'll do...\nData S...,1962,1001 to 5000 Employees,"Department, Clothing, & Shoe Stores",Retail,...,59,1,0,0,1,0,1,1,0,0
561,Galvanize,"New York, NY","Associate Data Science Instructor, Data Scientist",,$64K - $112K (Glassdoor est.),This is a remote based position that can be wo...,-1,,,,...,-1,1,0,1,1,0,0,0,0,0
412,Apple,"Seattle, WA","AI/ML - Data Engineer, Siri Search & Knowledge...",4.3,$65K - $102K (Glassdoor est.),"Posted: Jan 19, 2021\nWeekly Hours: 40\nRole N...",1976,10000+ Employees,Computer Hardware & Software,Information Technology,...,45,1,0,0,0,0,1,0,0,0
433,J.P. Morgan,"Plano, TX",CIB Wholesale Payments Data and Analytics - Da...,4.0,$65K - $102K (Glassdoor est.),J.P. Morgan's Corporate & Investment Bank (CIB...,1799,10000+ Employees,Investment Banking & Asset Management,Finance,...,222,1,0,1,1,0,1,1,0,1
534,"Jewelers Mutual Insurance Company, SI","Dallas, TX",Sr. Data Scientist,,$110K - $171K (Glassdoor est.),"Jewelers Mutual, the nation's only company exc...",-1,,,,...,-1,0,0,0,0,0,0,0,0,0
576,Root Insurance Company,"Columbus, OH",Senior Data Scientist,4.0,$64K - $112K (Glassdoor est.),We believe a large part of building an effecti...,2015,501 to 1000 Employees,Insurance Carriers,Insurance,...,6,0,0,0,1,0,0,0,0,0
