In [1]:
import pandas as pd
import numpy as np
import time
import re

In [2]:
df = pd.read_csv('temp/glassdoor refined.csv')

In [3]:
df.head()

Unnamed: 0,Company,Location,Title,Rating,Salary,Description,Founded,Size,Industry,Sector,Type,Revenue
0,Purchasing Power\n3.2,"Atlanta, GA",Data Scientist,3.2,$66K - $112K (Glassdoor est.),Data Scientist\nLocation\n\n\nAtlanta - Midtow...,2001.0,201 to 500 Employees,Financial Transaction Processing,Finance,Company - Private,$100 to $500 million (USD)
1,Varen Technologies\n4.2,"Annapolis Junction, MD",Data Scientist,4.2,$76K - $111K (Glassdoor est.),"At Varen, our performance is measured by the s...",2005.0,51 to 200 Employees,Enterprise Software & Network Solutions,Information Technology,Company - Private,$25 to $50 million (USD)
2,Big Fish Games\n3.2,"Oakland, CA",Senior Data Scientist,3.2,$109K - $175K (Glassdoor est.),Job Posting Title\nSenior Data Scientist\nSumm...,2002.0,501 to 1000 Employees,Video Games,Media,Subsidiary or Business Segment,$50 to $100 million (USD)
3,GEICO\n3.5,"Chevy Chase, MD",Principal Data Scientist,3.5,$104K - $170K (Glassdoor est.),"Working out of our Chevy Chase, MD/Washington ...",1936.0,10000+ Employees,Insurance Carriers,Insurance,Subsidiary or Business Segment,$10+ billion (USD)
4,Stanley Black & Decker\n3.5,"Fishers, IN",AI Data Scientist,3.5,$74K - $130K (Glassdoor est.),About Stanley Black & Decker\n\nJoining the St...,1843.0,10000+ Employees,Industrial Manufacturing,Manufacturing,Company - Public,$10+ billion (USD)


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 884 entries, 0 to 883
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Company      884 non-null    object 
 1   Location     883 non-null    object 
 2   Title        883 non-null    object 
 3   Rating       804 non-null    float64
 4   Salary       884 non-null    object 
 5   Description  883 non-null    object 
 6   Founded      700 non-null    float64
 7   Size         824 non-null    object 
 8   Industry     748 non-null    object 
 9   Sector       748 non-null    object 
 10  Type         824 non-null    object 
 11  Revenue      824 non-null    object 
dtypes: float64(2), object(10)
memory usage: 83.0+ KB


In [5]:
df.duplicated().sum()

0

In [6]:
df.dropna(subset = ['Salary', 'Company', 'Location'], inplace = True)
df.drop_duplicates(inplace = True)
df.reset_index(drop = True, inplace = True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 883 entries, 0 to 882
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Company      883 non-null    object 
 1   Location     883 non-null    object 
 2   Title        883 non-null    object 
 3   Rating       804 non-null    float64
 4   Salary       883 non-null    object 
 5   Description  883 non-null    object 
 6   Founded      700 non-null    float64
 7   Size         824 non-null    object 
 8   Industry     748 non-null    object 
 9   Sector       748 non-null    object 
 10  Type         824 non-null    object 
 11  Revenue      824 non-null    object 
dtypes: float64(2), object(10)
memory usage: 82.9+ KB


## Salary parsing

In [7]:
per_hour = df['Salary'].apply(lambda x: 1 if 'per hour' in x.lower() else 0)
employer = df['Salary'].apply(lambda x: 1 if 'employer' in x.lower() else 0)

df['Hourly'] = per_hour
df['Employer Provided'] = employer

In [8]:
def salary_parse(salary):
    salary_split = salary.split('(')[0]
    if salary_split.endswith('r'):
        salary_split = salary.split(' Per')[0].split('-')
        salary_split = [int(i.replace('$', '')) for i in salary_split]
        salary_split = [i * 40 * 50 for i in salary_split]
    else:
        salary_split = salary.split(' (')[0].split(' - ')
        salary_split = [int(i.replace('$', '').replace('K', '')) for i in salary_split]
        salary_split = [i * 1000 for i in salary_split]
    return salary_split

In [9]:
df['Low Salary'] = df['Salary'].apply(salary_parse).apply(lambda x: x[0])
df['High Salary'] = df['Salary'].apply(salary_parse).apply(lambda x: x[1])
df['Average Salary'] = ((df['Low Salary'] + df['High Salary']) / 2).astype(int)
df[['Salary', 'Hourly', 'Employer Provided', 'Low Salary', 'High Salary',\
    'Average Salary']].sample(10)

Unnamed: 0,Salary,Hourly,Employer Provided,Low Salary,High Salary,Average Salary
117,$115K - $184K (Glassdoor est.),0,0,115000,184000,149500
425,$65K - $102K (Glassdoor est.),0,0,65000,102000,83500
462,$55K - $94K (Glassdoor est.),0,0,55000,94000,74500
317,$79K - $130K (Glassdoor est.),0,0,79000,130000,104500
377,$123K - $195K (Glassdoor est.),0,0,123000,195000,159000
835,$66K - $111K (Glassdoor est.),0,0,66000,111000,88500
766,$74K - $127K (Glassdoor est.),0,0,74000,127000,100500
432,$65K - $102K (Glassdoor est.),0,0,65000,102000,83500
405,$69K - $117K (Glassdoor est.),0,0,69000,117000,93000
556,$110K - $171K (Glassdoor est.),0,0,110000,171000,140500


In [10]:
df['Hourly'].value_counts(), df['Employer Provided'].value_counts()

(0    883
 Name: Hourly, dtype: int64,
 0    851
 1     32
 Name: Employer Provided, dtype: int64)

## City & State parsing

In [11]:
for i in df.index:
    if len(df.loc[i,'Location'].split(', ')) == 3:
        df.loc[i, 'City'] = df.loc[i, 'Location'].split(', ')[1]
    elif len(df.loc[i,'Location'].split(', ')) == 2:
        df.loc[i, 'City'] = df.loc[i, 'Location'].split(', ')[0]
    else:
        df.loc[i, 'City'] = np.nan

In [12]:
place = df['Location'].apply(lambda x: x.split(', '))

for i in range(len(place)):
    if len(place.loc[i]) == 1:
        place.loc[i].append(np.nan)

df['State'] = place.apply(lambda x: x[1])

In [13]:
df = df[df['Location'] != 'United States']
df.loc[df['Location'] == 'Remote', 'State'] = 'Remote'
df.loc[df['Location'] == 'Remote', 'City'] = 'Remote'

In [14]:
states = pd.read_csv('temp/us-states.csv', sep = ' - ', engine = 'python', header = None,
                     names = ['State', 'Abbreviation'])

for i in df.index:
    if df.loc[i, 'Location'] in list(states['State'].unique()):
        state_full = df.loc[i, 'Location']
        state_abbrev = list(states[states['State'] == state_full]['Abbreviation'])[0]
        df.loc[i, 'State'] = state_abbrev

In [15]:
df[['Location', 'City', 'State']].sample(10)

Unnamed: 0,Location,City,State
92,"Denton, TX",Denton,TX
171,"Arlington, VA",Arlington,VA
405,"Huntington Beach, CA",Huntington Beach,CA
651,"Richmond, VA",Richmond,VA
684,"Seattle, WA",Seattle,WA
767,"Chantilly, VA",Chantilly,VA
543,"Bedford, MA",Bedford,MA
518,"Austin, TX",Austin,TX
5,"Rogers, AR",Rogers,AR
152,"Ann Arbor, MI",Ann Arbor,MI


In [16]:
df['State'].value_counts()

CA             165
VA             114
NY              68
TX              55
WA              50
MA              43
MD              36
NJ              31
IL              31
NC              25
MO              22
DC              22
PA              22
Remote          20
OH              19
FL              17
CO              16
GA              16
MN              14
MI              12
CT               8
TN               8
AZ               7
KY               6
OR               4
NM               3
UT               3
AL               3
SC               3
WI               2
IA               2
IN               2
ID               2
NV               2
KS               2
Los Angeles      2
RI               2
AR               2
PR               1
ME               1
NE               1
NH               1
Fulton           1
Name: State, dtype: int64

## Company name text only

In [17]:
df['Company'] = df['Company'].apply(lambda x: x.split('\n')[0])
df[['Company', 'Rating']].sample(10)

Unnamed: 0,Company,Rating
10,NCCI Holdings,4.3
231,Intuit - Data,4.3
101,NEXTracker,4.3
442,Rrc Companies,3.7
52,Datamatics Global Services Inc,3.1
580,Trigyn,3.8
300,"Latitude, Inc.",4.0
18,Elevate Credit,3.8
224,Berkeley Lights,4.3
237,PenFed Credit Union,3.4


In [18]:
df['Company'].nunique()

585

## Age of company

In [19]:
df['Founded'].fillna(-1, inplace = True)
df['Founded'] = df['Founded'].astype(int)

for i in df.index:
    if df.loc[i, 'Founded'] == -1:
        df.loc[i, 'Age'] = -1
    else:
        df.loc[i, 'Age'] = int(time.localtime()[0]) - df.loc[i, 'Founded']

df['Age'] = df['Age'].astype(int)
df[['Company', 'Founded', 'Age']].sample(10)

Unnamed: 0,Company,Founded,Age
226,Amazon.com Services LLC,1994,27
719,Center for Applied Linguistics,-1,-1
873,Syneos Health Commercial Solutions,2017,4
881,State Farm,1922,99
860,IRI,1979,42
507,ASRC Federal Holding Company,2003,18
263,"Beacon Hill Staffing Group, LLC",2000,21
496,OPPORTUNITY FUND COMMUNITY DEVELOPMENT,1994,27
140,Compass Systems & Programming,-1,-1
262,Aurora,-1,-1


## Parsing of job description

In [20]:
jobs = {'Python' : 'python', 'R' : 'r[ |-]*studio', 'SQL' : 'sql', 'ML' : 'machine learning', 'DL' : 'deep learning',\
        'Excel' : 'excel', 'Spark' : 'spark', 'AWS' : 'aws', 'BI' : r'power[ |-]*bi|tableau'}

jobs_match = []
for key, value in jobs.items():
    jobs_match.append(re.compile(jobs[key]))
df
for i in df.index:
    j = 0
    while j < len(jobs.keys()):
        for key,value in jobs.items():
            if jobs_match[j].search(df.loc[i, 'Description'].lower()):
                df.loc[i,key] = 1
            else:
                df.loc[i,key] = 0
            j += 1

cols = jobs.keys()
for i in cols:
    df[i] = df[i].astype(int)

df[['Description', 'Python', 'R', 'SQL', 'ML', 'DL', 'Excel', 'Spark', 'AWS', 'BI']].sample(10)

Unnamed: 0,Description,Python,R,SQL,ML,DL,Excel,Spark,AWS,BI
631,Remote working arrangements are possible for t...,0,0,0,0,0,0,0,0,0
314,You'll be an important part of our high-energy...,0,0,0,0,0,1,0,0,0
844,Join the people helping people.\nFor people dr...,0,0,0,0,0,0,0,0,0
596,The Trade Desk is a global technology company ...,0,0,0,0,0,0,0,0,0
430,Job Requisition ID: 16474\nAdditional Location...,0,0,0,1,1,0,0,0,0
492,The desired candidate will be located in any o...,0,0,0,0,0,0,0,0,0
690,JOB SUMMARY:\n\nThis position provides the Uni...,0,0,0,1,0,0,0,0,0
4,About Stanley Black & Decker\n\nJoining the St...,0,0,0,1,0,0,0,0,0
436,JOB SUMMARY:\n\nThe Data Analyst will report t...,0,0,0,0,0,0,0,0,0
825,Bachelor's Degree\n3+ years of experience with...,1,0,1,0,0,0,0,0,0


In [21]:
df.to_csv('temp/glassdoor cleaned', index = False)

In [22]:
df = pd.read_csv('temp/glassdoor cleaned')
df.sample(10)

Unnamed: 0,Company,Location,Title,Rating,Salary,Description,Founded,Size,Industry,Sector,...,Age,Python,R,SQL,ML,DL,Excel,Spark,AWS,BI
288,Altair Engineering,"Dearborn, MI",Data Scientists Decisions Science Support,4.2,$65K - $110K (Glassdoor est.),Transforming the Future with Convergence of Si...,1985,1001 to 5000 Employees,Computer Hardware & Software,Information Technology,...,36,0,0,0,0,0,0,0,0,0
173,BeaconFire Solution,"Princeton, NJ",Big Data Developer/ Data Engineer/ Data scientist,4.4,$140K - $150K (Employer est.),JOB RESPONSIBILITIES:\n· Developing complex SQ...,-1,51 to 200 Employees,IT Services,Information Technology,...,-1,1,0,1,0,0,0,1,0,0
794,Supplyframe,"Los Angeles, CA",Data Scientist,3.8,$53K - $97K (Glassdoor est.),"Summary\nWe are a fast-paced, LGBTQ+ and minor...",2003,1 to 50 Employees,Advertising & Marketing,Business Services,...,18,0,0,0,1,0,0,0,0,0
92,Technamo LLC,"Richmond, VA",Data Scientist,,$84K - $138K (Glassdoor est.),"Company Description\n\nTECHNAMO is a niche, yo...",-1,Unknown,,,...,-1,1,0,0,1,1,0,0,0,1
597,NYC Data Science Academy,"New York, NY",Data Scientist/Data Science Instructor (Part-t...,4.6,$108K - $175K (Glassdoor est.),About Us\n\nJust as the field of Data Science ...,2014,1 to 50 Employees,Education Training Services,Education,...,7,0,0,0,1,0,0,0,0,0
537,Global Atlantic Financial Group Opportunities,Remote,"AVP, Data Scientist (Virtual / Remote - Greate...",,$110K - $171K (Glassdoor est.),**We are still hiring and interviewing for thi...,-1,,,,...,-1,1,0,1,1,0,0,0,0,0
264,Baker Hughes,"Minden, NV",Data Science Engineer/Scientist,3.6,$58K - $99K (Glassdoor est.),Role Summary:\nThe Data Science Engineer will ...,-1,10000+ Employees,Oil & Gas Services,"Oil, Gas, Energy & Utilities",...,-1,0,0,0,1,0,0,0,0,0
643,"ArcherDX, Inc.","Boulder, CO",Data Analyst (ADX-244-20),3.4,$95K - $162K (Glassdoor est.),Data Analyst (ADX-244-20)\n\nCOMPANY SUMMARY:\...,2013,201 to 500 Employees,Biotech & Pharmaceuticals,Biotech & Pharmaceuticals,...,8,0,0,0,0,0,0,0,0,0
8,Farm Credit Mid-America,"Louisville, KY",Data Scientist,4.1,$87K - $142K (Glassdoor est.),"At Farm Credit Mid-America, we believe people ...",1985,1001 to 5000 Employees,Lending,Finance,...,36,0,0,1,1,0,0,0,0,0
44,Nielsen,"New York, NY",Data Scientist,3.7,$91K - $148K (Glassdoor est.),Data Scientist\n\nApply now »\nApply now\n\n\n...,1923,10000+ Employees,Enterprise Software & Network Solutions,Information Technology,...,98,0,0,0,0,0,0,0,0,0
