In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the scraped job postings from the JSON export
df = pd.read_json('indeed-data-jobs-FINAL.json')
# Fixed seed so the preview shows the same five rows on every run
df.sample(5, random_state=42)

Unnamed: 0,job_title,company,location,is_remote,salary,post_date,date_accessed,job_description
266,DATA ANALYST- REMOTE WORK FROM HOME,Palco,"Knoxville, TN 37901",1,,5 days ago,2021-03-05,DATA ANALYST The Data Analyst is responsible f...
171,"Data Scientist, Analytics - Content",Calm,Remote,1,,13 days ago,2021-03-05,Mission We are hiring a data scientist to be ...
432,Entry-Level Business Analyst,Labatt Food Service,"San Antonio, TX 78218",0,,30+ days ago,2021-03-05,We are offering full-time positions to highly-...
812,Compiler Software Engineer - Machine Learning,Xilinx,"San Jose, CA 95124",0,,9 days ago,2021-03-05,"Description At Xilinx, we are leading the indu..."
644,Data Engineer/Data Scientist : 21-00872,Akraya Inc.,"Palo Alto, CA 94304",1,,2 days ago,2021-03-05,"Primary Skills: Data Analytics, Apache Spark, ..."


In [3]:
# Rows with an empty job_description; their other text fields are empty as well.
# Presumably these postings had been taken down by the time they were scraped (TODO confirm).
# They are dropped in the next cell.
df[df.job_description == '']

Unnamed: 0,job_title,company,location,is_remote,salary,post_date,date_accessed,job_description
303,,,,0,,,2021-03-05,
463,,,,0,,,2021-03-05,
586,,,,0,,,2021-03-05,
755,,,,0,,,2021-03-05,


In [4]:
# Keep only the postings that actually have a job description
# (drops the empty rows displayed in the previous cell)
df = df.loc[df['job_description'] != '']

In [5]:
# (rows, columns) after dropping the empty rows
df.shape

(898, 8)

In [6]:
# Data summary: column dtypes and non-null counts.
# NOTE: missing values appear to be stored as empty strings (e.g. blank salaries),
# which info() still counts as non-null.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 898 entries, 0 to 901
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   job_title        898 non-null    object
 1   company          898 non-null    object
 2   location         898 non-null    object
 3   is_remote        898 non-null    int64 
 4   salary           898 non-null    object
 5   post_date        898 non-null    object
 6   date_accessed    898 non-null    object
 7   job_description  898 non-null    object
dtypes: int64(1), object(7)
memory usage: 63.1+ KB


# Creating Labels
### I'll need to create a script that uses the job title to create the label
I can already see that "Data Entry" isn't a role I'm looking for, so I'll need to go through all the entries and remove jobs like this that don't fall into one of my 4 categories.

In [7]:
# Job titles: the text the labels will be derived from
df.job_title

0                                      Data Entry
1                                  Data Scientist
2                                  Data Scientist
3                                  Data Scientist
4      Early Career Data Scientist - Applied Math
                          ...                    
897      Machine Learning Engineer, Ads Relevance
898                 Machine Learning: AI Engineer
899                     Machine Learning Engineer
900             Applied Machine Learning Engineer
901                     Machine Learning Engineer
Name: job_title, Length: 898, dtype: object

In [8]:
# Show the first posting (index label 0) as a one-row frame.
# Selecting with a list keeps the result a DataFrame and preserves column dtypes,
# instead of building a Series, wrapping it in a DataFrame and transposing it.
df.loc[[0]]

Unnamed: 0,job_title,company,location,is_remote,salary,post_date,date_accessed,job_description
0,Data Entry,TransPerfect Translations Inc.,Remote,1,,6 days ago,2021-03-05,We are looking for Data Entry Clerks to join o...


In [9]:
### Regex

In [10]:
# Display the frame; pandas truncates the output to the first and last rows
df

Unnamed: 0,job_title,company,location,is_remote,salary,post_date,date_accessed,job_description
0,Data Entry,TransPerfect Translations Inc.,Remote,1,,6 days ago,2021-03-05,We are looking for Data Entry Clerks to join o...
1,Data Scientist,ForMotiv,Remote,1,"$75,000 - $120,000 a year",30+ days ago,2021-03-05,Has it ever occurred to you that as the Intern...
2,Data Scientist,Redzara.com,Remote,1,$35 - $80 an hour,10 days ago,2021-03-05,Only GC / EAD only. No C2CBackground screening...
3,Data Scientist,Nova Collective,Remote,1,$35 - $48 an hour,24 days ago,2021-03-05,Are you a data scientist who is really excited...
4,Early Career Data Scientist - Applied Math,Pacific Northwest National Laboratory,"Seattle, WA",0,,1 day ago,2021-03-05,Organization and Job ID Job ID: 311747 Directo...
...,...,...,...,...,...,...,...,...
897,"Machine Learning Engineer, Ads Relevance",Pinterest,"Palo Alto, CA",0,,24 days ago,2021-03-05,About Pinterest: Millions of people across th...
898,Machine Learning: AI Engineer,"ALTRON, Inc.","Fairfax, VA",0,,30+ days ago,2021-03-05,Altron Inc. is seeking a Machine Learning (ML)...
899,Machine Learning Engineer,Global Payments,"Alpharetta, GA 30022",0,,24 days ago,2021-03-05,"Every day, Global Payments makes it possible f..."
900,Applied Machine Learning Engineer,Anomaly,"New York, NY 10012",0,,30+ days ago,2021-03-05,About Anomaly Anomaly is a healthcare technol...


## Finding each Label by Indexing
### The data is roughly in order of the keyword I used while scraping

- 1st: 'data scientist'
- 2nd: 'data analyst'
- 3rd: 'data engineer'
- 4th: 'machine learning engineer'

*I scraped about the same number of jobs for each of the 4 labels, so the job titles should change near each quartile boundary. I'll use those boundaries as reference points, peek into the DataFrame around each one to find exactly where the titles switch, and then label the rows by index range.*

In [11]:
# Quartile positions of the row count, used as reference points for where the
# job titles switch from one scraped keyword to the next. Derived from the frame
# so the numbers stay correct if the data changes.
n_rows = len(df)
print(f'1st quarter: {n_rows*.25} \n 2nd quarter: {n_rows*.5} \n 3rd quarter: {n_rows*.75}')

1st quarter: 224.5 
 2nd quarter: 449.0 
 3rd quarter: 673.5


In [12]:
# the last data scientist entry was at 223
#slice1 = slice(0,224)
#(df[:224] == df[slice1]).sum()

In [13]:
# the last data analyst entry was at index label 447 (row position 446, since row 303 was dropped)
# df[224:447]

In [14]:
# the last data engineer entry was at index label 676 (row position 673, since rows 303, 463 and 586 were dropped)
# df[447:674]

In [15]:
# the rest of the jobs are machine learning engineers
#df[674:]

In [16]:
def create_labels(df, first=slice(0,224), second=slice(224,447), third=slice(447,674), last=slice(674,902)):
    '''
    Add a 'label' column to a DataFrame whose rows are pre-sorted by job category.

    The four selectors are applied by row position (as with ``df.iloc``),
    not by index label, so gaps in the index left by dropped rows do not
    matter. Note: this assumes the rows have been presorted and that you
    know the position range of each category.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to label. It is modified in place (the 'label' column is
        added or overwritten) and also returned.
    first, second, third, last : slice
        Row positions to label 'Data Scientist', 'Data Analyst',
        'Data Engineer' and 'Machine Learning Engineer' respectively.
        A slice end past the last row is clipped; if selectors overlap,
        the later one wins.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object. Rows not covered by any selector get an
        empty-string label.
    '''
    # Build the full column first and assign it once, so a later category
    # cannot wipe out the labels written for an earlier one.
    labels = np.full(len(df), '', dtype=object)
    for rows, name in (
        (first, 'Data Scientist'),
        (second, 'Data Analyst'),
        (third, 'Data Engineer'),
        (last, 'Machine Learning Engineer'),
    ):
        labels[rows] = name
    # Assigning a plain array is positional, so the frame's index is irrelevant
    df['label'] = labels
    return df

In [17]:
# create_labels has only been defined above, not called, so df has no 'label' column yet
df.head()

Unnamed: 0,job_title,company,location,is_remote,salary,post_date,date_accessed,job_description
0,Data Entry,TransPerfect Translations Inc.,Remote,1,,6 days ago,2021-03-05,We are looking for Data Entry Clerks to join o...
1,Data Scientist,ForMotiv,Remote,1,"$75,000 - $120,000 a year",30+ days ago,2021-03-05,Has it ever occurred to you that as the Intern...
2,Data Scientist,Redzara.com,Remote,1,$35 - $80 an hour,10 days ago,2021-03-05,Only GC / EAD only. No C2CBackground screening...
3,Data Scientist,Nova Collective,Remote,1,$35 - $48 an hour,24 days ago,2021-03-05,Are you a data scientist who is really excited...
4,Early Career Data Scientist - Applied Math,Pacific Northwest National Laboratory,"Seattle, WA",0,,1 day ago,2021-03-05,Organization and Job ID Job ID: 311747 Directo...


In [18]:
#df.insert(8, 'label', 'Data Scientist', allow_duplicates=True)

In [None]:
# Raw string so that \b reaches the regex engine as a word boundary; in a plain
# string literal '\b' is the backspace character, so the pattern would only
# match text containing literal backspaces.
r = r'\bData\b.*\bScientist\b'