In [1]:
import numpy as np
import pandas as pd
import csv
import matplotlib.pyplot as plt
from datetime import datetime, date, timedelta

plt.rcParams['figure.figsize'] = (8, 6)
plt.rcParams['font.size'] = 17

import seaborn as sns
import re
import warnings # current version of seaborn generates a bunch of warnings that we'll ignore
warnings.filterwarnings("ignore")
sns.set(style="ticks", color_codes=True)

In [2]:
## DATA
# Import/Load
data = pd.read_csv('../app/data/total.csv')

    The following table is a view of the raw data from the scrape. I'll build the target by reducing the pay column to floating point values; this will require text cleaning which will also be conducted for the rest of the table. Location will be split into new columns for City and State while and because Indeed's search results show dates relative to the date the query was made I'll need to use PostDate and ExtractDate build a DatePosted column. It is important to note that JobUrl and date-related columns will not be converted into features for the logistic regression, instead they will be reattached to the data at the end in order to provide additional information and insights through the EC2 application. The only rows that'l  be using for modelling are JobTitle, Company, Summary, Requirements, and Description.

In [3]:
#  Examine
data.head(5)

Unnamed: 0,company,description,estimated_salary,extractDate,job_type_items,location,postDate,rating,raw_desc_soup,requirements,sal_guide_items,salary,salary_and_jType,salfromsection,summary,title,url
0,Online Technical Services,"Job descriptionData Scientist, MarketingSAN DI...",,2022-04-13,['Full-time'],Remote,PostedJust posted,3.7,"<!DOCTYPE html>\n\n<html dir=""ltr"" lang=""en"">\...",Master's (Preferred)Python: 1 year (Preferred)...,,"$145,000 - $150,000 a year","$145,000 - $150,000 a year - Full-time","$145,000 - $150,000 a year",Identify relevant data sources and data sets t...,Data Scientist - Marketing,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...
1,West CAP,HUMAN was founded in 2012 in a Brooklyn sci-fi...,Estimated $114K – $144K a year,2022-04-13,,"Remote in New York, NY+2 locations",PostedJust posted,3.5,"<!DOCTYPE html>\n\n<html dir=""ltr"" lang=""en"">\...",,"['', 'Not provided by employer', ""$114K to $14...",,Full-time,,You’ve worked as a data scientist solving larg...,"Data Scientist, BotGuard",https://www.indeed.com/rc/clk?jk=58cdde046f643...
2,Maya Ai inc.,Our Maya team is expanding and we are looking ...,,2022-04-13,"['Full-time', 'Part-time']",Remote,PostedJust posted,,"<!DOCTYPE html>\n\n<html dir=""ltr"" lang=""en"">\...",Python: 1 year (Preferred)SQL: 1 year (Preferred),,"$77,766 - $183,411 a year","$77,766 - $183,411 a year - Full-time, Part-time","$77,766 - $183,411 a year",Our Analyst will be dealing with data coming i...,Data Scientist,https://www.indeed.com/company/Maya-Ai-inc./jo...
3,"EMERGETECH, INC",Description:Job CategoryData ScienceAbout Emer...,Estimated $94.7K – $120K a year,2022-04-13,,Remote,PostedJust posted,,"<!DOCTYPE html>\n\n<html dir=""ltr"" lang=""en"">\...",,"['', 'Not provided by employer', ""$94.7K to $1...",,,,Design and create the data sources that ”citiz...,Data Scientist,https://www.indeed.com/rc/clk?jk=95fb128bb025f...
4,Recurrent,What's the opportunity?Recurrent is on a missi...,Estimated $119K – $151K a year,2022-04-13,,"Remote in Seattle, WA",PostedJust posted,,"<!DOCTYPE html>\n\n<html dir=""ltr"" lang=""en"">\...",,"['', 'Not provided by employer', ""$119K to $15...",,,,Experienced - you have 2+ years of experience ...,Data Scientist,https://www.indeed.com/rc/clk?jk=e9ce610b72deb...


In [4]:
data[data.location == 'Remote'].head(5)

Unnamed: 0,company,description,estimated_salary,extractDate,job_type_items,location,postDate,rating,raw_desc_soup,requirements,sal_guide_items,salary,salary_and_jType,salfromsection,summary,title,url
0,Online Technical Services,"Job descriptionData Scientist, MarketingSAN DI...",,2022-04-13,['Full-time'],Remote,PostedJust posted,3.7,"<!DOCTYPE html>\n\n<html dir=""ltr"" lang=""en"">\...",Master's (Preferred)Python: 1 year (Preferred)...,,"$145,000 - $150,000 a year","$145,000 - $150,000 a year - Full-time","$145,000 - $150,000 a year",Identify relevant data sources and data sets t...,Data Scientist - Marketing,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...
2,Maya Ai inc.,Our Maya team is expanding and we are looking ...,,2022-04-13,"['Full-time', 'Part-time']",Remote,PostedJust posted,,"<!DOCTYPE html>\n\n<html dir=""ltr"" lang=""en"">\...",Python: 1 year (Preferred)SQL: 1 year (Preferred),,"$77,766 - $183,411 a year","$77,766 - $183,411 a year - Full-time, Part-time","$77,766 - $183,411 a year",Our Analyst will be dealing with data coming i...,Data Scientist,https://www.indeed.com/company/Maya-Ai-inc./jo...
3,"EMERGETECH, INC",Description:Job CategoryData ScienceAbout Emer...,Estimated $94.7K – $120K a year,2022-04-13,,Remote,PostedJust posted,,"<!DOCTYPE html>\n\n<html dir=""ltr"" lang=""en"">\...",,"['', 'Not provided by employer', ""$94.7K to $1...",,,,Design and create the data sources that ”citiz...,Data Scientist,https://www.indeed.com/rc/clk?jk=95fb128bb025f...
6,TechTrueUP,Description: DATA SCIENTIST*** Fully remote***...,,2022-04-13,"['Full-time', 'Contract']",Remote,PostedJust posted,3.8,"<!DOCTYPE html>\n\n<html dir=""ltr"" lang=""en"">\...",Bachelor's (Required)Python: 1 year (Required)...,,$60 - $70 an hour,"$60 - $70 an hour - Full-time, Contract",$60 - $70 an hour,DSC is seeking a Data Scientist who would be f...,MCS Data Scientist,https://www.indeed.com/company/TechTrueUP/jobs...
9,Redfin,This position is a remote eligible position...,,2022-04-13,['Full-time'],Remote,PostedJust posted,3.4,"<!DOCTYPE html>\n\n<html dir=""ltr"" lang=""en"">\...",,,"$87,600 - $131,400 a year","$87,600 - $131,400 a year - Full-time","$87,600 - $131,400 a year","Experience in analytics of operations, specifi...",Senior Data Analyst - Tour Support (Remote Eli...,https://www.indeed.com/rc/clk?jk=25a64fbe8a600...


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 660 entries, 0 to 659
Data columns (total 17 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   company           660 non-null    object 
 1   description       660 non-null    object 
 2   estimated_salary  286 non-null    object 
 3   extractDate       660 non-null    object 
 4   job_type_items    188 non-null    object 
 5   location          660 non-null    object 
 6   postDate          660 non-null    object 
 7   rating            388 non-null    float64
 8   raw_desc_soup     660 non-null    object 
 9   requirements      109 non-null    object 
 10  sal_guide_items   318 non-null    object 
 11  salary            198 non-null    object 
 12  salary_and_jType  483 non-null    object 
 13  salfromsection    197 non-null    object 
 14  summary           660 non-null    object 
 15  title             660 non-null    object 
 16  url               660 non-null    object 
dt

In [6]:
# Missing Values?
missing_values = data.isnull().sum()
print(missing_values)

company               0
description           0
estimated_salary    374
extractDate           0
job_type_items      472
location              0
postDate              0
rating              272
raw_desc_soup         0
requirements        551
sal_guide_items     342
salary              462
salary_and_jType    177
salfromsection      463
summary               0
title                 0
url                   0
dtype: int64


In [7]:
data[data.isnull().any(axis=1)]
# every column has nulls!

Unnamed: 0,company,description,estimated_salary,extractDate,job_type_items,location,postDate,rating,raw_desc_soup,requirements,sal_guide_items,salary,salary_and_jType,salfromsection,summary,title,url
0,Online Technical Services,"Job descriptionData Scientist, MarketingSAN DI...",,2022-04-13,['Full-time'],Remote,PostedJust posted,3.7,"<!DOCTYPE html>\n\n<html dir=""ltr"" lang=""en"">\...",Master's (Preferred)Python: 1 year (Preferred)...,,"$145,000 - $150,000 a year","$145,000 - $150,000 a year - Full-time","$145,000 - $150,000 a year",Identify relevant data sources and data sets t...,Data Scientist - Marketing,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...
1,West CAP,HUMAN was founded in 2012 in a Brooklyn sci-fi...,Estimated $114K – $144K a year,2022-04-13,,"Remote in New York, NY+2 locations",PostedJust posted,3.5,"<!DOCTYPE html>\n\n<html dir=""ltr"" lang=""en"">\...",,"['', 'Not provided by employer', ""$114K to $14...",,Full-time,,You’ve worked as a data scientist solving larg...,"Data Scientist, BotGuard",https://www.indeed.com/rc/clk?jk=58cdde046f643...
2,Maya Ai inc.,Our Maya team is expanding and we are looking ...,,2022-04-13,"['Full-time', 'Part-time']",Remote,PostedJust posted,,"<!DOCTYPE html>\n\n<html dir=""ltr"" lang=""en"">\...",Python: 1 year (Preferred)SQL: 1 year (Preferred),,"$77,766 - $183,411 a year","$77,766 - $183,411 a year - Full-time, Part-time","$77,766 - $183,411 a year",Our Analyst will be dealing with data coming i...,Data Scientist,https://www.indeed.com/company/Maya-Ai-inc./jo...
3,"EMERGETECH, INC",Description:Job CategoryData ScienceAbout Emer...,Estimated $94.7K – $120K a year,2022-04-13,,Remote,PostedJust posted,,"<!DOCTYPE html>\n\n<html dir=""ltr"" lang=""en"">\...",,"['', 'Not provided by employer', ""$94.7K to $1...",,,,Design and create the data sources that ”citiz...,Data Scientist,https://www.indeed.com/rc/clk?jk=95fb128bb025f...
4,Recurrent,What's the opportunity?Recurrent is on a missi...,Estimated $119K – $151K a year,2022-04-13,,"Remote in Seattle, WA",PostedJust posted,,"<!DOCTYPE html>\n\n<html dir=""ltr"" lang=""en"">\...",,"['', 'Not provided by employer', ""$119K to $15...",,,,Experienced - you have 2+ years of experience ...,Data Scientist,https://www.indeed.com/rc/clk?jk=e9ce610b72deb...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
655,California FAIR Plan Association,Position SummaryThe statistical reporting anal...,Estimated $73K - $92.4K a year,2022-04-15,,"Remote in Los Angeles, CA 90010",PostedJust posted,,"<!DOCTYPE html>\n\n<html dir=""ltr"" lang=""en"">\...",,"['', 'Not provided by employer', ""$73K to $92....",,Full-time,,Ability to understand data information systems...,Statistical Reporting Analyst (Remote),https://www.indeed.com/company/California-FAIR...
656,Optum,Combine two of the fastest-growing fields on t...,,2022-04-15,,"Remote in Eden Prairie, MN 55346+1 location",PostedJust posted,3.4,"<!DOCTYPE html>\n\n<html dir=""ltr"" lang=""en"">\...",,,,Full-time,,"Troubleshoots data integrity issues, analyzes ...",Senior Data Analyst - Telecommute,https://www.indeed.com/rc/clk?jk=b164952160a87...
657,Collins Aerospace,This position is for an established Machine Le...,,2022-04-15,,"Remote in Cedar Rapids, IA+1 location",PostedJust posted,3.6,"<!DOCTYPE html>\n\n<html dir=""ltr"" lang=""en"">\...",,,,Full-time,,"Transforms analytics, models, and other protot...",Machine Learning Engineer (Remote),https://www.indeed.com/rc/clk?jk=ca3746ffdf53f...
658,Manifold AI,"As a machine learning engineer, a large portio...",Estimated $133K - $169K a year,2022-04-15,,"Remote in Boston, MA",PostedJust posted,,"<!DOCTYPE html>\n\n<html dir=""ltr"" lang=""en"">\...",,,,,,"As a machine learning engineer, a large portio...",Machine Learning Engineer,https://www.indeed.com/rc/clk?jk=eb1780fbeea78...


In [8]:
#  Value Counts
data.nunique()

company              57
description          62
estimated_salary     52
extractDate           2
job_type_items        6
location             39
postDate              2
rating               15
raw_desc_soup       660
requirements          8
sal_guide_items      48
salary               15
salary_and_jType     17
salfromsection       15
summary              87
title                50
url                 121
dtype: int64

In [9]:
data.company.unique()

array(['Online Technical Services', 'West CAP', 'Maya Ai inc.',
       'EMERGETECH, INC', 'Recurrent', 'Fractal.ai', 'TechTrueUP', 'EAB',
       'Ford Motor Company', 'Redfin', 'SparkCognition', 'CoderPad, Inc.',
       'Humana', 'Amadeus', 'The American College of Radiology',
       'Insight Global', 'IBM', 'CyberCoders', 'Calculated Hire',
       'CompuGain', 'Liberty Mutual Insurance', 'Comcentric', 'Switchfly',
       'Acrisure LLC', 'Myticas Consulting', 'Serco North America',
       'Textio', 'Cash App', 'LOCKHEED MARTIN CORPORATION', 'Fjuri',
       'KEY CAPTURE ENERGY', 'The Mom Project', 'SureStart',
       'Embark Trucks', 'Engage3', 'Pfizer', 'Luma Health',
       'Centene Corporation', 'Common Computer Inc.', 'Shortcut',
       'Centurion', 'EDWARD JONES', 'Vital Connect, Inc.', 'Updater',
       'Cambium Assessment', 'Optum', 'Opal', 'Home Depot / THD',
       'SCRUFF', 'Manifold AI', 'California FAIR Plan Association',
       'MultiPlan Inc.', 'Aledade', 'UnitedHealth Gro

In [10]:
interesting = ['company','estimated_salary','job_type_items','location','rating','requirements','salary','sal_guide_items','salary_and_jType']
for i in interesting:
    print(data[i].unique())

['Online Technical Services' 'West CAP' 'Maya Ai inc.' 'EMERGETECH, INC'
 'Recurrent' 'Fractal.ai' 'TechTrueUP' 'EAB' 'Ford Motor Company' 'Redfin'
 'SparkCognition' 'CoderPad, Inc.' 'Humana' 'Amadeus'
 'The American College of Radiology' 'Insight Global' 'IBM' 'CyberCoders'
 'Calculated Hire' 'CompuGain' 'Liberty Mutual Insurance' 'Comcentric'
 'Switchfly' 'Acrisure LLC' 'Myticas Consulting' 'Serco North America'
 'Textio' 'Cash App' 'LOCKHEED MARTIN CORPORATION' 'Fjuri'
 'KEY CAPTURE ENERGY' 'The Mom Project' 'SureStart' 'Embark Trucks'
 'Engage3' 'Pfizer' 'Luma Health' 'Centene Corporation'
 'Common Computer Inc.' 'Shortcut' 'Centurion' 'EDWARD JONES'
 'Vital Connect, Inc.' 'Updater' 'Cambium Assessment' 'Optum' 'Opal'
 'Home Depot / THD' 'SCRUFF' 'Manifold AI'
 'California FAIR Plan Association' 'MultiPlan Inc.' 'Aledade'
 'UnitedHealth Group' 'Lumen' 'PwC' 'Collins Aerospace']
[nan 'Estimated $114K – $144K a year' 'Estimated $94.7K – $120K a year'
 'Estimated $119K – $151K a year'

In [11]:
samples = []
for i in np.unique(data[data['company'].notnull()].company):
    samples.append(i)
print(len(samples))
samples[:]

57


['Acrisure LLC',
 'Aledade',
 'Amadeus',
 'Calculated Hire',
 'California FAIR Plan Association',
 'Cambium Assessment',
 'Cash App',
 'Centene Corporation',
 'Centurion',
 'CoderPad, Inc.',
 'Collins Aerospace',
 'Comcentric',
 'Common Computer Inc.',
 'CompuGain',
 'CyberCoders',
 'EAB',
 'EDWARD JONES',
 'EMERGETECH, INC',
 'Embark Trucks',
 'Engage3',
 'Fjuri',
 'Ford Motor Company',
 'Fractal.ai',
 'Home Depot / THD',
 'Humana',
 'IBM',
 'Insight Global',
 'KEY CAPTURE ENERGY',
 'LOCKHEED MARTIN CORPORATION',
 'Liberty Mutual Insurance',
 'Luma Health',
 'Lumen',
 'Manifold AI',
 'Maya Ai inc.',
 'MultiPlan Inc.',
 'Myticas Consulting',
 'Online Technical Services',
 'Opal',
 'Optum',
 'Pfizer',
 'PwC',
 'Recurrent',
 'Redfin',
 'SCRUFF',
 'Serco North America',
 'Shortcut',
 'SparkCognition',
 'SureStart',
 'Switchfly',
 'TechTrueUP',
 'Textio',
 'The American College of Radiology',
 'The Mom Project',
 'UnitedHealth Group',
 'Updater',
 'Vital Connect, Inc.',
 'West CAP']

In [12]:
null = sum(data.salary.isnull())
nnull = sum(data.salary.notnull())
print(data.isnull().sum())
print('')
print(f'The data contains {data.shape[0]} rows of individual job postings with values for {data.shape[1]} columns based on\n different sections of each post\'s web page.')
print('')
print(f'- Out of {len(data)} job postings {nnull} or {round(nnull/len(data)*100,2)}% include some sort of salary information,\n- The remaining {null} rowws, or {round(null/len(data)*100,2)}% are missing salary data.\n- Also, there are no null values in the rest of the table meaning we won\'t need\n  to lose any data by dropping rows. While the null values in the Pay column will\n  simply be dropped after we translate the not nulls into the target.')

company               0
description           0
estimated_salary    374
extractDate           0
job_type_items      472
location              0
postDate              0
rating              272
raw_desc_soup         0
requirements        551
sal_guide_items     342
salary              462
salary_and_jType    177
salfromsection      463
summary               0
title                 0
url                   0
dtype: int64

The data contains 660 rows of individual job postings with values for 17 columns based on
 different sections of each post's web page.

- Out of 660 job postings 198 or 30.0% include some sort of salary information,
- The remaining 462 rowws, or 70.0% are missing salary data.
- Also, there are no null values in the rest of the table meaning we won't need
  to lose any data by dropping rows. While the null values in the Pay column will
  simply be dropped after we translate the not nulls into the target.


In [13]:
print('Here we can observe the sort of values held in the Pay column. Aside from removing special characters and spaces I\'ll extract\n the remaing non-numeric chars into a new column that\'ll allow me to build the annual Salary column once the remaining\n numeric string are converted to floating point.')

samples = []
for i in np.unique(data[data['sal_guide_items'].notnull()].sal_guide_items):
    samples.append(i)
samples[:3]

Here we can observe the sort of values held in the Pay column. Aside from removing special characters and spaces I'll extract
 the remaing non-numeric chars into a new column that'll allow me to build the annual Salary column once the remaining
 numeric string are converted to floating point.


['[\'\', \'Not provided by employer\', "$101K - $128K a year is Indeed\'s estimated salary for this role in San Francisco, CA."]',
 '[\'\', \'Not provided by employer\', "$101K to $128K per year is Indeed\'s estimated salary for data scientist in San Francisco, CA."]',
 '[\'\', \'Not provided by employer\', "$110K - $139K a year is Indeed\'s estimated salary for this role in San Francisco, CA."]']

In [54]:
## Preprocess Data
data = pd.read_csv('../app/data/total.csv')
#  drop_dupes
data = data.drop_duplicates()
data.columns = map(str.lower, data.columns)
data = data.rename({'title': 'job_title'}, axis=1)

In [55]:
def cleanPunc(sentence): #function to clean the word of any punctuation or special characters
    cleaned = re.sub(r'[?|!|\'|"|#]',r'',sentence)
    cleaned = re.sub(r'[.|,|)|(|\|/]',r' ',cleaned)
    cleaned = cleaned.strip()
    cleaned = cleaned.replace("\n"," ")
    return cleaned

In [56]:
def sal_chars(data):
    cleaned = re.sub(r'[\n|,|+|$|[|$|\'|"]',r'',data)
    return cleaned

def sal_splitter(data):
    x = data.split(',')
    return x[2]



In [57]:
salary = data[data.salary.notnull()]
data[['salary',]] = salary['salary'].apply(sal_chars)

estimated_salary = data[data.estimated_salary.notnull()]
data['estimated_salary'] = estimated_salary['estimated_salary'].apply(sal_chars)

salfromsection = data[data.salfromsection.notnull()]
data[['salfromsection',]] = salfromsection['salfromsection'].apply(sal_chars)

salary_and_jtype = data[data.salfromsection.notnull()]
data[['salary_and_jtype',]] = salary_and_jtype['salary_and_jtype'].apply(sal_chars)




sal_guide_arr = data[data.sal_guide_items.notnull()]
data[['sal_guide_items']] = sal_guide_arr['sal_guide_items'].apply(sal_splitter)

sal_guide_arr = data[data.sal_guide_items.notnull()]
data[['sal_guide_items']] = sal_guide_arr['sal_guide_items'].apply(sal_chars)


In [58]:
def Pay_period(data):
    z = ''
    substrings = ['hour','day','week','year']
    for i in substrings:
        if i in data:
            return i
        else:
            continue

estimated_salary = data[data['estimated_salary'].notnull()]
data['Schedule1'] = estimated_salary['estimated_salary'].apply(Pay_period)

sal_guide_items = data[data['sal_guide_items'].notnull()]
data['Schedule2'] = sal_guide_items['sal_guide_items'].apply(Pay_period)

salary = data[data['salary'].notnull()]
data['Schedule3'] = salary['salary'].apply(Pay_period)

salary_and_jtype = data[data['salary_and_jtype'].notnull()]
data['Schedule4'] = salary_and_jtype['salary_and_jtype'].apply(Pay_period)

salfromsection = data[data['salfromsection'].notnull()]
data['Schedule5'] = salfromsection['salfromsection'].apply(Pay_period)

In [59]:
val_cols = ['Schedule1','Schedule2','Schedule3','Schedule4','Schedule5']
data['schedule'] = data[val_cols].bfill(axis=1).iloc[:, 0]
data.drop(val_cols,inplace=True,axis=1)

In [60]:
data.schedule.unique()

array(['year', 'hour', nan, None], dtype=object)

In [61]:
col = ['estimated_salary','sal_guide_items','salary','salary_and_jtype','salfromsection']
for sal in col:
    data[sal].replace('[^\d\-]','',regex=True, inplace = True)

In [62]:
def split_sal(i):
    try:
        lst = i.split('-',1)
        x = lst[0]
        y = lst[1]
        
        y = re.sub(r'[-]',r'',y)

        return (float(x)+float(y))//2
    except:
        return i

for sal in col:
    data[sal] = data[sal].apply(lambda x:split_sal(x))

In [63]:
data.rename(columns={'salary': 'sal'},inplace=True)

In [64]:
col = ['estimated_salary','sal_guide_items','sal','salary_and_jtype','salfromsection']

data['salary'] = data[col].bfill(axis=1).iloc[:, 0]
data.drop(col,inplace=True,axis=1)

In [65]:
data['salary'] = pd.to_numeric(data['salary'])

In [66]:

cols = ['company','description', 'job_type_items','location','postdate','requirements',	'summary','job_title']
for txt_col in cols:
    data[txt_col] = data[txt_col].str.lower()

In [67]:
def jobtype(data):
    if type(data) == str:
        if re.search('full', data):
            out = 'full'
        if re.search('part', data):
            out = 'part'
        if re.search('contract', data):
            out = 'contract'
        if re.search('temp', data):
            out = 'temp'
        return out
    else:
        pass
    

data['jobtype'] = data.job_type_items.apply(jobtype)


In [68]:
data['text'] = data.description+' '+data.summary
#TODO add requirements

In [69]:
cols = ['job_type_items','raw_desc_soup','url','description','summary','requirements']
data.drop(cols,inplace=True,axis=1)

In [70]:
data.text = data.text.replace(')',' ')

In [71]:

def annual(data):
        data['annual_sal'] = np.nan
        data['annual_sal'] = np.where(data['schedule'].str.contains("hour"), data.salary*365/7*40, data['salary'])
        #data['annual_sal'] = np.where(data['schedule'].str.contains("day"), data['salary']*365/7*5, data['salary'])
        #data['annual_sal'] = np.where(data['schedule'].str.contains("week"), data['salary']*365/7, data['salary'])
        #data['annual_sal'] = np.where(data['schedule'].str.contains("month"), data['salary']*365/12, data['salary'])
        #data['annual_sal'] = np.where(data['schedule'].str.contains("year"), data['salary'], data['salary'])
        return data
data = annual(data)
#data.schedule.value_counts() keep and eye on this, may have to deal with more than just hourly

In [72]:
data

Unnamed: 0,company,extractdate,location,postdate,rating,job_title,schedule,salary,jobtype,text,annual_sal
0,online technical services,2022-04-13,remote,postedjust posted,3.7,data scientist - marketing,year,147500.0,full,"job descriptiondata scientist, marketingsan di...",147500.0
1,west cap,2022-04-13,"remote in new york, ny+2 locations",postedjust posted,3.5,"data scientist, botguard",year,114144.0,,human was founded in 2012 in a brooklyn sci-fi...,114144.0
2,maya ai inc.,2022-04-13,remote,postedjust posted,,data scientist,year,130588.0,part,our maya team is expanding and we are looking ...,130588.0
3,"emergetech, inc",2022-04-13,remote,postedjust posted,,data scientist,year,947120.0,,description:job categorydata scienceabout emer...,947120.0
4,recurrent,2022-04-13,"remote in seattle, wa",postedjust posted,,data scientist,year,119151.0,,what's the opportunity?recurrent is on a missi...,119151.0
...,...,...,...,...,...,...,...,...,...,...,...
655,california fair plan association,2022-04-15,"remote in los angeles, ca 90010",postedjust posted,,statistical reporting analyst (remote),year,498.0,,position summarythe statistical reporting anal...,498.0
656,optum,2022-04-15,"remote in eden prairie, mn 55346+1 location",postedjust posted,3.4,senior data analyst - telecommute,,,,combine two of the fastest-growing fields on t...,
657,collins aerospace,2022-04-15,"remote in cedar rapids, ia+1 location",postedjust posted,3.6,machine learning engineer (remote),,,,this position is for an established machine le...,
658,manifold ai,2022-04-15,"remote in boston, ma",postedjust posted,,machine learning engineer,year,151.0,,"as a machine learning engineer, a large portio...",151.0


In [92]:
def postD(data):
    if data != np.nan:
        rid = ['active',' day ago','%+ days ago','+']
        for i in rid:
            if i in data:
                return np.nan
        
        rid2 = ['just posted','today','postedjust posted','postedtoday']
        for i in rid2:
            if i in data:
                return 0   


    #data['dateposted'] = data['dateposted'].astype('int')
    return data



data['dateposted'] = data.postdate.apply(postD)

In [93]:
data

Unnamed: 0,company,extractdate,location,postdate,rating,job_title,schedule,salary,jobtype,text,annual_sal,dateposted
0,online technical services,2022-04-13,remote,postedjust posted,3.7,data scientist - marketing,year,147500.0,full,"job descriptiondata scientist, marketingsan di...",147500.0,0
1,west cap,2022-04-13,"remote in new york, ny+2 locations",postedjust posted,3.5,"data scientist, botguard",year,114144.0,,human was founded in 2012 in a brooklyn sci-fi...,114144.0,0
2,maya ai inc.,2022-04-13,remote,postedjust posted,,data scientist,year,130588.0,part,our maya team is expanding and we are looking ...,130588.0,0
3,"emergetech, inc",2022-04-13,remote,postedjust posted,,data scientist,year,947120.0,,description:job categorydata scienceabout emer...,947120.0,0
4,recurrent,2022-04-13,"remote in seattle, wa",postedjust posted,,data scientist,year,119151.0,,what's the opportunity?recurrent is on a missi...,119151.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
655,california fair plan association,2022-04-15,"remote in los angeles, ca 90010",postedjust posted,,statistical reporting analyst (remote),year,498.0,,position summarythe statistical reporting anal...,498.0,0
656,optum,2022-04-15,"remote in eden prairie, mn 55346+1 location",postedjust posted,3.4,senior data analyst - telecommute,,,,combine two of the fastest-growing fields on t...,,0
657,collins aerospace,2022-04-15,"remote in cedar rapids, ia+1 location",postedjust posted,3.6,machine learning engineer (remote),,,,this position is for an established machine le...,,0
658,manifold ai,2022-04-15,"remote in boston, ma",postedjust posted,,machine learning engineer,year,151.0,,"as a machine learning engineer, a large portio...",151.0,0


In [94]:
data.dateposted.unique()

array([0])

In [95]:
data['extractdate']= pd.to_datetime(data['extractdate'])

In [96]:
def pDate(row):

    days_ago = row['dateposted']
    delta = timedelta(days_ago)
    try:
        return row['extractdate'] - delta
    except:
        return row





In [97]:
data['dateposted'] = data.apply( lambda row : pDate(row), axis = 1)

In [98]:
data.rating = data.rating.fillna(0)

In [99]:
cols = ['extractdate','postdate','schedule','jobtype','salary']
data.drop(cols,inplace=True,axis=1)
data = data.rename({'annual_sal': 'salary'}, axis=1)

In [100]:
data

Unnamed: 0,company,location,rating,job_title,text,salary,dateposted
0,online technical services,remote,3.7,data scientist - marketing,"job descriptiondata scientist, marketingsan di...",147500.0,2022-04-13
1,west cap,"remote in new york, ny+2 locations",3.5,"data scientist, botguard",human was founded in 2012 in a brooklyn sci-fi...,114144.0,2022-04-13
2,maya ai inc.,remote,0.0,data scientist,our maya team is expanding and we are looking ...,130588.0,2022-04-13
3,"emergetech, inc",remote,0.0,data scientist,description:job categorydata scienceabout emer...,947120.0,2022-04-13
4,recurrent,"remote in seattle, wa",0.0,data scientist,what's the opportunity?recurrent is on a missi...,119151.0,2022-04-13
...,...,...,...,...,...,...,...
655,california fair plan association,"remote in los angeles, ca 90010",0.0,statistical reporting analyst (remote),position summarythe statistical reporting anal...,498.0,2022-04-15
656,optum,"remote in eden prairie, mn 55346+1 location",3.4,senior data analyst - telecommute,combine two of the fastest-growing fields on t...,,2022-04-15
657,collins aerospace,"remote in cedar rapids, ia+1 location",3.6,machine learning engineer (remote),this position is for an established machine le...,,2022-04-15
658,manifold ai,"remote in boston, ma",0.0,machine learning engineer,"as a machine learning engineer, a large portio...",151.0,2022-04-15


In [101]:
def sal_fixer(data):
    if type(data) != np.nan:
        data = round(data,0)

        if len(str(data)) <= 5:
            data*=1000
    else:
        pass
    return data

data['salary'] = data['salary'].apply(sal_fixer)


In [102]:
data

Unnamed: 0,company,location,rating,job_title,text,salary,dateposted
0,online technical services,remote,3.7,data scientist - marketing,"job descriptiondata scientist, marketingsan di...",147500.0,2022-04-13
1,west cap,"remote in new york, ny+2 locations",3.5,"data scientist, botguard",human was founded in 2012 in a brooklyn sci-fi...,114144.0,2022-04-13
2,maya ai inc.,remote,0.0,data scientist,our maya team is expanding and we are looking ...,130588.0,2022-04-13
3,"emergetech, inc",remote,0.0,data scientist,description:job categorydata scienceabout emer...,947120.0,2022-04-13
4,recurrent,"remote in seattle, wa",0.0,data scientist,what's the opportunity?recurrent is on a missi...,119151.0,2022-04-13
...,...,...,...,...,...,...,...
655,california fair plan association,"remote in los angeles, ca 90010",0.0,statistical reporting analyst (remote),position summarythe statistical reporting anal...,498000.0,2022-04-15
656,optum,"remote in eden prairie, mn 55346+1 location",3.4,senior data analyst - telecommute,combine two of the fastest-growing fields on t...,,2022-04-15
657,collins aerospace,"remote in cedar rapids, ia+1 location",3.6,machine learning engineer (remote),this position is for an established machine le...,,2022-04-15
658,manifold ai,"remote in boston, ma",0.0,machine learning engineer,"as a machine learning engineer, a large portio...",151000.0,2022-04-15


In [103]:
def states_(data):
    states = {" AL":"Alabama", " AK":"Alaska", " AS":"American Samoa", " AZ":"Arizona", " AR":"Arkansas",
        " CA":"California", " CO":"Colorado", " CT":"Connecticut", " DE":"Delaware", " DC":"District of Columbia",
        " FL":"Florida", " GA":"Georgia", " GU":"Guam ", " HI":"Hawaii", " ID":"Idaho", " IL":"Illinois", " IN":"Indiana",
        " IA":"Iowa", " KS":"Kansas", " KY":"Kentucky", " LA":"Louisiana", " ME":"Maine", " MD":"Maryland", " MA":"Massachusetts",
        " MI":"Michigan", " MN":"Minnesota", " MS":"Mississippi", " MO":"Missouri", " MT":"Montana", " NE":"Nebraska",
        " NV":"Nevada", " NH":"New Hampshire", " NJ":"New Jersey", " NM":"New Mexico", " NY":"New York", " NC":"North Carolina",
        " ND":"North Dakota", " MP":"Northern Mariana Islands", " OH":"Ohio", " OK":"Oklahoma", " OR":"Oregon", " PA":"Pennsylvania",
        " PR":"Puerto Rico", " RI":"Rhode Island", " SC":"South Carolina", " SD":"South Dakota", " TN":"Tennessee",
        " TX":"Texas", " UT":"Utah", " UM":"U.S. Minor Outlying Islands", " VT":"Vermont", " VI":"Virgin Islands", " VA":"Virginia",
        " WA":"Washington", " WV":"West Virginia", " WI":"Wisconsin", " WY":"Wyoming"}
    extras = ["Alaska", "Alabama", "Arkansas", "American Samoa", "Arizona", "California", "Colorado", "Connecticut", "District ", "of Columbia", "Delaware", "Florida", "Georgia", "Guam", "Hawaii", "Iowa", "Idaho", "Illinois", "Indiana", "Kansas", "Kentucky", "Louisiana", "Massachusetts", "Maryland", "Maine", "Michigan", "Minnesota", "Missouri", "Mississippi", "Montana", "North Carolina", "North Dakota", "Nebraska", "New Hampshire", "New Jersey", "New Mexico", "Nevada", "New York", "Ohio", "Oklahoma", "Oregon", "Pennsylvania", "Puerto Rico", "Rhode Island", "South Carolina", "South Dakota", "Tennessee", "Texas", "Utah", "Virginia", "Virgin Islands", "Vermont", "Washington", "Wisconsin", "West Virginia", "Wyoming"]
    for k,v in states.items():
        data['State'] = np.where(data['location'].str.contains(k.lower()), k, data['State'])
        data['State'] = np.where(data['location'].str.contains(v.lower()), k, data['State'])
        data['State'] = np.where(data['location'].str.contains('%remote'), 'remote', data['State'])
        data['State'] = np.where(data['location'].str.contains('united utates'), 'Remote', data['State'])
        return data


def cities_(i):

    if ',' in i:
        return i.partition(',')[0]
    if ',' not in i:
        return i
    elif 'remote' in i:
        return 'remote'

In [104]:
data['State'] = ''
data = states_(data)
data["City"] = data["location"].apply(cities_)

In [105]:
data

Unnamed: 0,company,location,rating,job_title,text,salary,dateposted,State,City
0,online technical services,remote,3.7,data scientist - marketing,"job descriptiondata scientist, marketingsan di...",147500.0,2022-04-13,,remote
1,west cap,"remote in new york, ny+2 locations",3.5,"data scientist, botguard",human was founded in 2012 in a brooklyn sci-fi...,114144.0,2022-04-13,,remote in new york
2,maya ai inc.,remote,0.0,data scientist,our maya team is expanding and we are looking ...,130588.0,2022-04-13,,remote
3,"emergetech, inc",remote,0.0,data scientist,description:job categorydata scienceabout emer...,947120.0,2022-04-13,,remote
4,recurrent,"remote in seattle, wa",0.0,data scientist,what's the opportunity?recurrent is on a missi...,119151.0,2022-04-13,,remote in seattle
...,...,...,...,...,...,...,...,...,...
655,california fair plan association,"remote in los angeles, ca 90010",0.0,statistical reporting analyst (remote),position summarythe statistical reporting anal...,498000.0,2022-04-15,,remote in los angeles
656,optum,"remote in eden prairie, mn 55346+1 location",3.4,senior data analyst - telecommute,combine two of the fastest-growing fields on t...,,2022-04-15,,remote in eden prairie
657,collins aerospace,"remote in cedar rapids, ia+1 location",3.6,machine learning engineer (remote),this position is for an established machine le...,,2022-04-15,,remote in cedar rapids
658,manifold ai,"remote in boston, ma",0.0,machine learning engineer,"as a machine learning engineer, a large portio...",151000.0,2022-04-15,,remote in boston


In [106]:
data.City.unique()

array(['remote', 'remote in new york', 'remote in seattle',
       'remote in washington', 'remote in dearborn', 'remote in austin',
       'remote in louisville', '+1 locationremote',
       'remote in san francisco', 'remote in pittsburgh',
       '+21 locationsremote', 'remote in denver', 'remote in arlington',
       'remote in stratford', 'remote in houston', 'remote in charlotte',
       'remote in chicago', 'remote in vienna', 'remote in rolla',
       'remote in eden prairie', 'remote in brooklyn', 'remote in boston',
       'remote in los angeles', 'remote in bethesda',
       'remote in minnetonka', 'remote in cedar rapids'], dtype=object)

In [107]:
data.City.value_counts()

remote                     281
remote in austin            64
remote in san francisco     41
remote in new york          36
remote in seattle           29
remote in washington        23
remote in dearborn          22
+21 locationsremote         20
remote in louisville        19
remote in los angeles       17
remote in eden prairie      16
remote in arlington         14
remote in houston           14
remote in denver            14
remote in bethesda           8
remote in boston             8
remote in brooklyn           7
remote in chicago            6
remote in minnetonka         6
remote in stratford          4
+1 locationremote            4
remote in rolla              2
remote in charlotte          2
remote in pittsburgh         1
remote in vienna             1
remote in cedar rapids       1
Name: City, dtype: int64

In [None]:
data.describe(include = 'all')
## EDA: Initial Summary Statistics

## Visualization: Summary Statistics

#  Normailze?
## EDA: Feature Importance/Selection
# Convariance: Strength of Linear Relationships
## Visualization: Feature Relations
## Data Wrangling
## Modelling
# #Fit/Train Model
# Evaluate Model
# Visualization: Communicating Findings

In [None]:



def munge():
    """[Cleaning and Featuring Engineering]

    Args:
        position ([string]): [identifier for scraped dataset]

    Returns:
        [csv]: [data prepared for ML]
    """
    # Using the position variable to select/process scraped data based the on the query that
    # generated it.
    data = pd.read_csv('../app/data/total.csv', index_col=1)
    data = data[data.Pay != '30+ days ago']
    data = data[data.Pay != '$120 per student']

    #TODO just name is salary in scrape.py

    

    def states_(data):
        """[Builds 'State' feature by splitting 'Location']

        Args:
            i ([row of data]): ['Location' value]

        Returns:
            [string]: [state name abbrieviation]
        """
        states = {" AL":"Alabama", " AK":"Alaska", " AS":"American Samoa", " AZ":"Arizona", " AR":"Arkansas",
        " CA":"California", " CO":"Colorado", " CT":"Connecticut", " DE":"Delaware", " DC":"District of Columbia",
        " FL":"Florida", " GA":"Georgia", " GU":"Guam ", " HI":"Hawaii", " ID":"Idaho", " IL":"Illinois", " IN":"Indiana",
        " IA":"Iowa", " KS":"Kansas", " KY":"Kentucky", " LA":"Louisiana", " ME":"Maine", " MD":"Maryland", " MA":"Massachusetts",
        " MI":"Michigan", " MN":"Minnesota", " MS":"Mississippi", " MO":"Missouri", " MT":"Montana", " NE":"Nebraska",
        " NV":"Nevada", " NH":"New Hampshire", " NJ":"New Jersey", " NM":"New Mexico", " NY":"New York", " NC":"North Carolina",
        " ND":"North Dakota", " MP":"Northern Mariana Islands", " OH":"Ohio", " OK":"Oklahoma", " OR":"Oregon", " PA":"Pennsylvania",
        " PR":"Puerto Rico", " RI":"Rhode Island", " SC":"South Carolina", " SD":"South Dakota", " TN":"Tennessee",
        " TX":"Texas", " UT":"Utah", " UM":"U.S. Minor Outlying Islands", " VT":"Vermont", " VI":"Virgin Islands", " VA":"Virginia",
        " WA":"Washington", " WV":"West Virginia", " WI":"Wisconsin", " WY":"Wyoming"}
        extras = ["Alaska", "Alabama", "Arkansas", "American Samoa", "Arizona", "California", "Colorado", "Connecticut", "District ", "of Columbia", "Delaware", "Florida", "Georgia", "Guam", "Hawaii", "Iowa", "Idaho", "Illinois", "Indiana", "Kansas", "Kentucky", "Louisiana", "Massachusetts", "Maryland", "Maine", "Michigan", "Minnesota", "Missouri", "Mississippi", "Montana", "North Carolina", "North Dakota", "Nebraska", "New Hampshire", "New Jersey", "New Mexico", "Nevada", "New York", "Ohio", "Oklahoma", "Oregon", "Pennsylvania", "Puerto Rico", "Rhode Island", "South Carolina", "South Dakota", "Tennessee", "Texas", "Utah", "Virginia", "Virgin Islands", "Vermont", "Washington", "Wisconsin", "West Virginia", "Wyoming"]
        for k,v in states.items():
            data['State'] = np.where(data['Location'].str.contains(k), k, data['State'])
            data['State'] = np.where(data['Location'].str.contains(v), k, data['State'])
            data['State'] = np.where(data['Location'].str.contains('Remote'), 'Remote', data['State'])
            data['State'] = np.where(data['Location'].str.contains('United States'), 'Remote', data['State'])
        return data


    def cities_(i):
        """[Builds 'City' feature by splitting 'Location']

        Args:
            i ([row of data]): ['Location' value]

        Returns:
            [string]: [city name from 'Location']
        """
        if ',' in i:
            return i.partition(',')[0]
        if ',' not in i:
            return i
        elif 'Remote' in i:
            return 'Remote'



    def count_dupes(data):
        """[Small helper for quick data integrity check]

        Args:
            data ([DataFrame]): [scrapped data]

        Returns:
            [tuple]: [number of duplicates (if any), number of unique values]
        """
        dupe = 0
        uniq = 0
        for i in data:
            if i == True:
                dupe += 1
            else:
                uniq +=1
        return dupe, uniq


    def deduper(data):
        """[Dropes duplicates]

        Args:
            data ([DataFrame]): [in munging]

        Returns:
            [DataFrame]: [Now without dupes]
        """
        data = data.drop_duplicates()
        data = data.reset_index(drop=False, inplace=False)
        return data


    def sal_chars(data):
        """[Reduces Pay values to alphanumeric chars only]

        Args:
            data ([DataFrame]): [in munging]

        Returns:
            [DataFrame]: [Now with no special chars]
        """
        data["Pay"] = data["Pay"].str.replace("\n", "")
        data["Pay"] = data["Pay"].str.replace(",", "")
        data["Pay"] = data["Pay"].str.replace("+", " ")
        data["Pay"] = data["Pay"].str.replace("$", "", regex=False)
        return data


    def Pay_period(data):
        """[Builds a column for rate of Pay so a yearly salary can be computed]

        Args:
            data ([DataFrame]): [in munging]

        Returns:
            [DataFrame]: [Now with Pay periods]
        """
        data['Schedule'] = np.nan
        data['Schedule'] = np.where(data['Pay'].str.contains("hour"),"hour",data['Schedule'])
        data['Schedule'] = np.where(data['Pay'].str.contains("week"),"week",data['Schedule'])
        data['Schedule'] = np.where(data['Pay'].str.contains("day"),"day",data['Schedule'])
        data['Schedule'] = np.where(data['Pay'].str.contains("year"),"year",data['Schedule'])
        data['Schedule'] = np.where(data['Pay'].str.contains("NaN"),np.nan,data['Schedule'])
        return data


    def sal_strings(data):
        """[Reduces Pay values to numeric chars only]

        Args:
            data ([DataFrame]): [in munging]

        Returns:
            [DataFrame]: [Now with only numeric chars in Pay col]
        """
        data["Pay"] = data["Pay"].str.replace(" an hour", "")
        data["Pay"] = data["Pay"].str.replace(" a day", "")
        data["Pay"] = data["Pay"].str.replace(" a week", "")
        data["Pay"] = data["Pay"].str.replace(" a month", "")
        data["Pay"] = data["Pay"].str.replace(" a year", "")
        data["Pay"] = data["Pay"].str.replace(" +", "")
        return data


    def split_sal(i):
        """[Converts salaries given as a range to the average of their min/max]

        Args:
            i ([row]): [applied to 'Pay' column]

        Returns:
            [float]: [If given a range, its mean]
        """
        try:
            lst = i.split(' - ',1)
            x = lst[0]
            y = lst[1]
            return (float(x)+float(y))//2
        except:
            return i
    def split_sal2(i):
        try:
            lst = i.split('-',1)
            x = lst[0]
            y = lst[1]
            return (float(x)+float(y))//2
        except:
            return i

    def from_(i):
        #TODO I think this is too simple, find a way to include data that
        # may fall within a range.
        """[If salary is given with a base amount, returns that]

        Args:
            i ([row]): [applied to 'Pay' column]

        Returns:
            [string]: [Lower limit, if given]
        """
        try:
            lst = i.split('From ',1)
            y = lst[1]
            return (y)
        except:
            return i

        
    def from_2(i):
        #TODO I think this is too simple, find a way to include data that
        # may fall within a range.
        """[If salary is given with a base amount, returns that]

        Args:
            i ([row]): [applied to 'Pay' column]

        Returns:
            [string]: [Lower limit, if given]
        """
        try:
            lst = i.split('From',1)
            y = lst[1]
            return (y)
        except:
            return i
        

    def up_to(i):
        # TODO combine this with the above
        """[If salary is given with a max amount, returns that]

        Args:
            i ([row]): [applied to 'Pay' column]

        Returns:
            [string]: [Upper limit, if given]
        """
        try:
            lst = i.split('Up to ',1)
            y = lst[1]
            return (y)
        except:
            return i
        
    def up_to2(i):
        # TODO combine this with the above
        """[If salary is given with a max amount, returns that]

        Args:
            i ([row]): [applied to 'Pay' column]

        Returns:
            [string]: [Upper limit, if given]
        """
        try:
            lst = i.split('Upto',1)
            y = lst[1]
            return (y)
        except:
            return i



    def pDate(row):
        #TODO 64?
        """[Builds a column for date posted. since Indeed.com only gives values for
        postdate relative to day of query.]

        Args:
            i ([row]): [applied to 'PostDate' column]

        Returns:
            [date]: [The actual date the posting was created]
        """
        days_ago = row['PostDate']
        delta = timedelta(days_ago)
        try:
            return row['ExtractDate'] - delta
        except:
            return row


    def annual(data):
        """[Builds an annual salary feature with values for all data]

        Args:
            data ([DataFrame]): [in munging]

        Returns:
            [DataFrame]: [Now with annual salary values]
        """
        data['Salary'] = np.nan
        data['Salary'] = np.where(data['Schedule'].str.contains("hour"), data['Pay']*365/7*40, data['Salary'])
        data['Salary'] = np.where(data['Schedule'].str.contains("day"), data['Pay']*365/7*5, data['Salary'])
        data['Salary'] = np.where(data['Schedule'].str.contains("week"), data['Pay']*365/7, data['Salary'])
        data['Salary'] = np.where(data['Schedule'].str.contains("month"), data['Pay']*365/12, data['Salary'])
        data['Salary'] = np.where(data['Schedule'].str.contains("year"), data['Pay'], data['Salary'])
        return data


    def acronyms(data):
        """[Spells out some commonly encountered acronyms. Supports accuracy of text analysis.]

        Args:
            data ([DataFrame]): [in munging]

        Returns:
            [DataFrame]: [Now with fewer acronymns]
        """
        data["JobTitle"] = data["JobTitle"].str.replace("R&D", "research development")
        data["Summary"] = data["Summary"].str.replace("R&D", "research development")
        data["Description"] = data["Description"].str.replace("R&D", "research development")
        return data


    def chars(data):
        """[summary]

        Args:
            data ([DataFrame]): [in munging]

        Returns:
            [DataFrame]: [Now without special chars]
        """
        cleaning_list = ["+", "$", "/", ",", "?", ".", ";", ":", "-", "@", "!", "&", "%", "^", "*", ")", "(", "\n"]
        for item in cleaning_list:
            data['PostDate'] = data['PostDate'].str.replace(item, " ", regex=False)
            data['Summary'] = data['Summary'].str.replace(item, " ",regex=False)
            data['Description'] = data['Description'].str.replace(item, " ",regex=False)
        return data


    def postD_int(data):
        """[Reduces or converts relative post dates to numeric chars]

        Args:
            data ([DataFrame]): [in munging]

        Returns:
            [DataFrame]: [Now with only numeric values for post date]
        """
        data["PostDate"] = data["PostDate"].str.replace("Active ", "")
        data["PostDate"] = data["PostDate"].str.replace(" day ago", "")
        data["PostDate"] = data["PostDate"].str.replace("%+ days ago", "")
        data["PostDate"] = data["PostDate"].str.replace("+", "")
        data["PostDate"] = data["PostDate"].str.replace(" days ago", "")
        data["PostDate"] = data["PostDate"].str.replace("Just posted", "0")
        data["PostDate"] = data["PostDate"].str.replace("Today", "0")
        data["PostDate"] = data["PostDate"].str.replace("today", "0")
        data['PostDate'] = data['PostDate'].astype('int')
        return data
    


    def roles(data):
        """[Supports web app display by providing website view table with information
        releavent to the job role.]

        Args:
            data ([DataFrame]): [munged]

        Returns:
            [DataFrame]: [Now with specific jobs and roles for each listing]
        """
        #Primary Role
        data['Role'] = ''
        analyst = ['anal']
        eng = ['big data', 'engin', 'data manag', 'data officer']
        ds = ['data scien', 'ml', 'deep', 'model', 'modeler','machine', 'deep', 'ai', 'scientist']


        data['Role'] = np.where(data['Role'].str.contains(''), 'Other', data['Role'])
        for _ in analyst:
            data['Role'] = np.where(data['JobTitle'].str.contains(_), 'data analyst', data['Role'])
        for _ in eng:
            data['Role'] = np.where(data['JobTitle'].str.contains(_), 'data engineer', data['Role'])
        for _ in ds:
            data['Role'] = np.where(data['JobTitle'].str.contains(_), 'data scientist', data['Role'])


        #Focus
        data['Focus'] = ''
        ml = ['ml', 'deep', 'model', 'modeler','machine', 'deep', 'ai']
        sr = ['sr.', 'lead', 'senior', 'manager']
        applied = ['applied']

        for _ in analyst:
            data['Focus'] = np.where(data['JobTitle'].str.contains(_), 'analysis', data['Focus'])
        for _ in ml:
            data['Focus'] = np.where(data['JobTitle'].str.contains(_), 'machine learning', data['Focus'])
        for _ in sr:
            data['Focus'] = np.where(data['JobTitle'].str.contains(_), 'senior', data['Focus'])
        return data

    # Apply the above functions to the selected DataFrame
    # These are in a neceassary order of operation as many functions require some cleaning or
    # featurization to have occured prior to their call/application.
    #data["State"] = data["Location"].apply(states_)
    


    data['State'] = ''
    data = states_(data)
    data["City"] = data["Location"].apply(cities_)
    data['ExtractDate']= pd.to_datetime(data['ExtractDate'])
    data = chars(data)
    data = postD_int(data)
    data['DatePosted'] = data.apply( lambda row : pDate(row), axis = 1)
    data = deduper(data)
    data = sal_chars(data)
    data = Pay_period(data)
    data = sal_strings(data)
    data["Pay"] = data["Pay"].apply(split_sal)
    data["Pay"] = data["Pay"].apply(split_sal2)
    data["Pay"] = data["Pay"].apply(from_)
    data["Pay"] = data["Pay"].apply(from_2)
    data["Pay"] = data["Pay"].apply(up_to)
    data["Pay"] = data["Pay"].apply(up_to2)
    data['Pay'] = pd.to_numeric(data['Pay'])
    data = annual(data)
    data = acronyms(data)
    #data.Location = data.City +' ,' + data.State


    

    # Drop a few cols we no longer need
    data.drop(columns=['Pay','ExtractDate', 'PostDate'], inplace=True)

    for item in ['JobTitle', 'Company', 'Summary', 'Requirements','Description', 'City']:
        data[item] = data[item].str.lower()
    data = roles(data)
    data.to_csv('../app/data/munged_data.csv', index=False)
    return data




In [None]:
munged = munge()

In [None]:
munged.head(3)

In [None]:
null = len(munged[munged.Salary.isnull()])
nnull = len(munged[munged.Salary.notnull()])
print(f'Droping {len(data)-len(munged)} duplicates and out of date posts leaves {len(munged)} rows of data for the regressor to be trained and tested on.')
print(f'- Of those {len(munged)} job postings {nnull} or {round(nnull/len(munged)*100,2)}% include salary information,\n- The remaining {null} rows, or {round(null/len(munged)*100,2)}% are missing salary data.')

## Detecting Outliers: Using both the Z-Score and IQR methods
    Now that preprocessed the data and building the target variable of Salary let's see conduct a bit more EDA to see if there are any outliers that could potentially skew how the regressor will learn from the training data.

In [None]:
ax = sns.pairplot(data=munged[munged.Salary.notnull()],
                  x_vars=['DatePosted'],
                  y_vars=['Salary'], height=8, hue="Salary", palette="crest")

ax.fig.set_size_inches(18,4)

A plot is helpful in isualizing data, but let's program a few functions to detect and remove outliers based on thresholds.
With the Z-Score, we can determine any data outside 3 standard deviations from the mean of our salary data to be an outlier.
Wheras with the interquartile (IQR) range, the middle 50% of given salaries, I'll set the conventional cutoff of 1.5+/- the IQR as the cutoff.
Values found outside of either range will be collected into a list and that'll allow for some fancy indexing so those particular postings can be examined and removed programatically.

#### Z-Score
Salaries 3 standard deviations away from the mean will be listed as outliers.

In [None]:
outliers = []
def z_detect(munged):
    sample = munged[munged['Salary'].notnull()].Salary
    threshold=3
    mean = np.mean(sample)
    std = np.std(sample)
    
    for i in sample:
        z_score = (i-mean)/std
        if np.abs(z_score) > threshold:
            outliers.append(i)
    if len(outliers) == 0:
        pass
    else:
        return outliers
z_detect(munged)

#### IQR
Salaries outside 1.5 times the interquartile range boundaries, either above or below will be listed as outliers.

In [None]:
def iqr_detect(munged):
    sample = munged[munged['Salary'].notnull()].Salary
    Q1, Q3 = np.percentile(sample,[25,75])
    iqr = Q3-Q1
    lower_bound = Q1-(1.5*iqr)
    upper_bound = Q3+(1.5*iqr)
    for i in sample:
        if (i < lower_bound)  | (i > upper_bound):
            outliers.append(i)
    if len(outliers) == 0:
        pass
    else:
        return outliers
iqr_detect(munged)

In [None]:
def unique(list1):
    list_set = set(list1)
    # convert the set to the list
    unique_list = (list(list_set))
    for x in unique_list:
        print(x)
        
mn = min(outliers)
mx = max(outliers)
print(f'The {len(set(outliers))} Unique Outliers Are:')
unique(outliers)
print(f'- With a minumum of ${round(mn)} and a maximum of ${round(mx)}')

In [None]:
# Take a look at those tables.

outliers = munged[munged['Salary'].isin(outliers)]
munged.drop(outliers.index, axis=0,inplace=True)
outliers

In [None]:
null = len(munged[munged.Salary.isnull()])
nnull = len(munged[munged.Salary.notnull()])

In [None]:
ax = sns.pairplot(data=munged[munged.Salary.notnull()],
                  x_vars=['DatePosted'],
                  y_vars=['Salary'], height=8, hue="Salary", palette="crest")

ax.fig.set_size_inches(18,4)

In [None]:
sns.countplot(x='Salary', data=munged.notnull(), palette='Set3')
print(f'- Dropping {len(outliers)} outliers now leaves {nnull}, or {round(nnull/len(munged)*100,2)}%, of rows with with salary information,\n- The remaining {null} rows, or {round(null/len(munged)*100,2)}%, are missing salary data.')


In [None]:
munged.info()

In [None]:
munged.to_csv(f'../app/data/munged_data.csv', index=False)