### Import relevant libraries

In [1]:
import pandas as pd
import numpy as np

### Import data and reorder columns

In [2]:
# Columns to keep, in the order the rest of the notebook expects
lst_col = [
    'job_name',
    'category',
    'job_type',
    'salary',
    'location',
    'first_paragraph',
    'description',
    'ref_number',
    'scrape_datetime',
]

# Load the scraped postings and select those columns in one step
df = pd.read_csv('adecco.csv')[lst_col]

df.head(3)

Unnamed: 0,job_name,category,job_type,salary,location,first_paragraph,description,ref_number,scrape_datetime
0,Packaging Associate,Warehouse - Pick / Pack,Contract/Temporary,$ 15.5 - $ 17.05 / Hour,"Lincoln, Nebraska","Adecco is currently assisting a local client, ...","Adecco is currently assisting a local client, ...",US_EN_99_021067_1386145,2020-09-05 17:26:47.378248
1,LT Assembly Specialist,Industrial & Manufacturing - Assembly / Produc...,Contract/Temporary,$17.00/Hour,"Santa Clara, California","At Adecco, we are the workforce experts provid...","At Adecco, we are the workforce experts provid...",US_EN_99_024068_1386183,2020-09-05 17:26:47.899123
2,Production Ad,Industrial & Manufacturing - Assembly / Produc...,Contract/Temporary,$ 11 - $ 11.25 / Day,"South Charleston, West Virginia",Adecco is currently hiring Production Operator...,Adecco is currently hiring Production Operator...,US_EN_99_023229_1293942,2020-09-05 17:26:48.458884


### Text Manipulation

In [3]:
# Treat the scraper's 'parse_error' marker, a lone space and the empty string
# as missing values (all three are swapped for NaN in a single pass)
df = df.replace(['parse_error', ' ', ''], np.nan)

# Remove non-needed characters from salary
def salary_formatter(s):
    """Reduce a raw salary value to the characters needed to parse it.

    Only ASCII digits, '.' and '-' are kept, so a range such as
    '$ 15.5 - $ 17.05 / Hour' becomes '15.5-17.05'.

    Parameters
    ----------
    s : object
        Raw salary cell. It is passed through ``str()`` first, so any type
        (including NaN or None) is accepted.

    Returns
    -------
    str
        The filtered string; '' when no character survives
        (e.g. NaN -> 'nan' -> '').
    """
    # ASCII digits only: str.isnumeric() is also true for characters such as
    # '½' or '²', which float() cannot parse when the column is converted.
    allowed = '0123456789.-'
    return ''.join(ch for ch in str(s) if ch in allowed)
# Strip currency symbols, units and whitespace so only "low-high"
# (or a single number) is left in the column
df['salary'] = df['salary'].apply(salary_formatter)

# Make new columns for low and high end salary.
# reindex guarantees that both columns exist: str.split(expand=True) returns a
# single column when no value contains '-', which would turn [1] into a KeyError.
df_salary_range = (
    df['salary']
    .str.split('-', n=1, expand=True)
    .reindex(columns=[0, 1])
    .replace('', np.nan)
    .astype(float)  # strict on purpose: a malformed number raises instead of silently becoming NaN
)
df.insert(4, 'salary_low', df_salary_range[0])
df.insert(5, 'salary_high', df_salary_range[1])

# Recalculate salary as the midpoint of the range.
# mean() skips NaN by default, so a row with a single bound (low OR high) keeps
# that bound, a row with both gets their average, and a row with neither is NaN.
# NOTE(review): the pay period ("/ Hour", "/ Day", ...) is stripped by
# salary_formatter, so this column can mix units - confirm before comparing rows.
df['salary'] = df[['salary_low', 'salary_high']].mean(axis=1)
  
# Make new columns for city and state.
# The split pattern swallows whitespace around the comma, so "Lincoln, Nebraska"
# yields "Nebraska" rather than " Nebraska"; the preceding strip removes padding
# at either end of the whole string. (A multi-character pattern is treated as a
# regular expression by Series.str.split.)
# reindex guarantees that column 1 exists even when no location has a comma.
df_location = (
    df['location']
    .str.strip()
    .str.split(r'\s*,\s*', n=1, expand=True)
    .reindex(columns=[0, 1])
    .replace('', np.nan)
)
df.insert(7, 'city', df_location[0])
df.insert(8, 'state', df_location[1])
del df['location']

# Convert datetime.
# infer_datetime_format is intentionally not passed: it is deprecated since
# pandas 2.0 (format inference is the default there) and only emits a warning.
# Missing values become NaT.
df['scrape_datetime'] = pd.to_datetime(df['scrape_datetime'])

df.head(3)

Unnamed: 0,job_name,category,job_type,salary,salary_low,salary_high,city,state,first_paragraph,description,ref_number,scrape_datetime
0,Packaging Associate,Warehouse - Pick / Pack,Contract/Temporary,16.275,15.5,17.05,Lincoln,Nebraska,"Adecco is currently assisting a local client, ...","Adecco is currently assisting a local client, ...",US_EN_99_021067_1386145,2020-09-05 17:26:47.378248
1,LT Assembly Specialist,Industrial & Manufacturing - Assembly / Produc...,Contract/Temporary,17.0,17.0,,Santa Clara,California,"At Adecco, we are the workforce experts provid...","At Adecco, we are the workforce experts provid...",US_EN_99_024068_1386183,2020-09-05 17:26:47.899123
2,Production Ad,Industrial & Manufacturing - Assembly / Produc...,Contract/Temporary,11.125,11.0,11.25,South Charleston,West Virginia,Adecco is currently hiring Production Operator...,Adecco is currently hiring Production Operator...,US_EN_99_023229_1293942,2020-09-05 17:26:48.458884


### Write to csv

In [4]:
# Persist the cleaned frame; index=False keeps the row index out of the file
df.to_csv(path_or_buf='adecco_post_processed.csv', index=False)