In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the scraped data from disk into a DataFrame.
df = pd.read_csv(filepath_or_buffer='scraped_data.csv')

In [3]:
# De-duplicate rows: two rows are duplicates when both their 'company' and
# 'title' values match; the first occurrence of each pair is the one kept.
df = df.drop_duplicates(
    subset=['company', 'title'],
    keep='first',
)

In [4]:
# Add integer columns 'js_count' and 'python_count' holding the number of
# times each pattern matches in a row's 'description' text.
descriptions = df['description']
for count_col, pattern in (
    ('js_count', r'[jJ]ava[sS]cript'),
    ('python_count', r'[Pp]ython'),
):
    # Missing counts (NaN) are treated as zero matches so the column can be int.
    df[count_col] = descriptions.str.count(pattern).fillna(0).astype(int)

In [5]:
def format_hourly(rate, hours_per_year=2000, hourly_cutoff=1000):
    '''Returns annual salaries converted from hourly rates

    A value strictly between 0 and ``hourly_cutoff`` is treated as an hourly
    rate and multiplied by ``hours_per_year``. Any other value (zero,
    negative, or at/above the cutoff) is returned unchanged.

    Parameters:
        rate: numeric salary figure, either an hourly rate or an annual amount.
        hours_per_year: multiplier applied to hourly rates (default 2000).
        hourly_cutoff: exclusive upper bound below which ``rate`` is considered
            hourly (default 1000).

    Returns:
        ``rate * hours_per_year`` for hourly rates, otherwise ``rate`` itself.
    '''
    if 0 < rate < hourly_cutoff:
        return rate * hours_per_year
    return rate

In [6]:
# Build the 'formatted_sal' column from the upper bound of each scraped salary range.
# The pattern has two alternatives, each capturing the figure after " - $":
#   group 0: a comma-separated amount (e.g. the "100,000" in "80,000 - $100,000")
#   group 1: a two-digit amount       (e.g. the "50" in "40 - $50")
sal_cap_groups = df['salary'].str.extract(
    r'\d\d\d?,\d\d\d\s-\s\$(\d\d\d?,\d\d\d)|\d\d\s-\s\$(\d\d)'
)
comma_amounts = sal_cap_groups[0]
two_digit_amounts = sal_cap_groups[1]
# Only one alternative can match per row, so merge the groups and strip commas.
combined_sal_groups = comma_amounts.combine_first(two_digit_amounts).str.replace(',', '')
# Rows without a match are NaN: use 0 as a placeholder so the values can be cast
# to int, convert small (hourly) figures via format_hourly, then restore NaN.
upper_bounds = combined_sal_groups.fillna(0).astype(int)
df['formatted_sal'] = upper_bounds.map(format_hourly).replace(0, np.nan)

In [7]:
# Group the postings by search location and search term, then compute the
# average 'formatted_sal' for each group, highest average first.
grouped = df.groupby(['search_loc', 'search_terms'])
mean_sals = grouped['formatted_sal'].mean()
# A group in which no posting has a salary averages to NaN, and casting NaN to
# int raises an error; drop those groups before converting to whole numbers.
market_sals = mean_sals.dropna().sort_values(ascending=False).astype(int)

In [8]:
# Save the per-group average salaries as CSV ('utf-8-sig' writes a leading BOM).
market_sals.to_csv(
    'seed_market_sals.csv',
    encoding='utf-8-sig',
)

In [9]:
# Save the processed rows as CSV without the index column
# ('utf-8-sig' writes a leading BOM).
df.to_csv(
    'seed_positions.csv',
    index=False,
    encoding='utf-8-sig',
)