In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the scraped job postings; every later cell works from this frame.
scraped_csv_path = 'scraped_data.csv'
df = pd.read_csv(scraped_csv_path)

In [3]:
# A posting counts as a duplicate when both its company and its title match an
# earlier row; only the first occurrence of each (company, title) pair is kept.
dedupe_keys = ['company', 'title']
df = df.drop_duplicates(subset=dedupe_keys, keep='first')

In [17]:
# Add one integer column per technology holding the number of times that
# technology's name appears in the posting's description.
descriptions = df['description']
tech_patterns = {
    'js_count': r'[jJ]ava[sS]cript\b',
    'python_count': r'[Pp]ython\b',
    'ruby_count': r'[Rr]uby\b',
    # the trailing \b keeps "Java" from matching inside "JavaScript"
    'java_count': r'[Jj]ava\b',
}
for count_col, tech_pattern in tech_patterns.items():
    # str.count yields NaN for a missing description; treat that as zero mentions
    df[count_col] = descriptions.str.count(tech_pattern).fillna(0).astype(int)

In [27]:
# df[['search_terms', 'title', 'company', 'js_count', 'python_count', 'ruby_count', 'java_count']].sort_values(by=['ruby_count'], ascending=False).head(50)

Unnamed: 0,search_terms,title,company,js_count,python_count,ruby_count,java_count
11329,ruby developer,REMOTE-Back End Software Engineer- Ruby- Ruby ...,CyberCoders,0,0,11,0
3663,ruby developer,Ruby on Rails Developer,EGlobalTech,1,0,7,0
2376,ruby developer,Backend Ruby Developer,CyberCoders,1,0,7,0
2600,ruby developer,Senior Ruby on Rails Engineer,Lab Zero,2,0,6,1
1778,ruby developer,"Staff or Lead Software Engineer, Ruby (Billing...",BigCommerce,2,1,6,0
244,ruby developer,Ruby on Rails Developer (Remote),OneClickPolitics,1,0,6,0
11414,ruby developer,Senior Ruby on Rails Developer - 100% Remote,CyberCoders,0,1,6,0
11943,java developer,"0136 Systems Developer, Herndon, VA - ST/SCI FSP",StellarPeak Corp,0,5,6,7
12014,java developer,"0134 DevOps SME, Herndon, VA - TS/SCI FSP",StellarPeak Corp,0,5,6,7
3736,ruby developer,Ruby on Rails Engineer,EvoTech,1,0,6,0


In [18]:
# Recategorize 'search_terms' for each posting based on the technology that is
# mentioned most often in its description.
tech_counts = df[['js_count', 'python_count', 'ruby_count', 'java_count']]
highest_tcount = tech_counts.idxmax(axis=1)
mapper = {
    'js_count': 'javascript developer',
    'python_count': 'python developer',
    'ruby_count': 'ruby developer',
    'java_count': 'java developer'
}
# idxmax returns the first column label when several columns tie for the max,
# so a posting with zero mentions of every technology would always come out as
# 'javascript developer'. Only relabel postings that mention at least one
# technology; the others keep the search term they already had.
has_tech_mention = tech_counts.max(axis=1) > 0
df['search_terms'] = highest_tcount.map(mapper).where(has_tech_mention, df['search_terms'])

In [7]:
def format_hourly(rate, hours_per_year=2000, hourly_cutoff=1000):
    '''Returns an annual salary, converting the value if it looks like an hourly rate.

    A value strictly between 0 and `hourly_cutoff` is treated as an hourly rate
    and multiplied by `hours_per_year`. Any other value (zero, negative, or at
    or above the cutoff) is returned unchanged.

    Parameters:
        rate: scalar pay figure to normalize.
        hours_per_year: multiplier applied to hourly rates (default 2000).
        hourly_cutoff: exclusive upper bound below which a positive value is
            considered hourly (default 1000).

    Returns:
        `rate * hours_per_year` for hourly-looking values, otherwise `rate`.
    '''
    if 0 < rate < hourly_cutoff:
        return rate * hours_per_year
    return rate

In [19]:
# Creates the 'formatted_sal' column from the upper bound of each scraped salary range.
# Capture group 0 is the upper bound of a comma-formatted range (e.g. "80,000 - $100,000");
# capture group 1 is the upper bound of a short range (e.g. "40 - $55"), which
# format_hourly later scales as an hourly rate.
# The short-range group accepts two OR three digits so that an upper bound such as
# "$100" is not cut down to "10"; the (?!\d) lookahead rejects longer digit runs.
sal_cap_groups = df['salary'].str.extract(
    r'\d\d\d?,\d\d\d\s-\s\$(\d\d\d?,\d\d\d)|\d\d\s-\s\$(\d{2,3})(?!\d)'
)
# At most one of the two groups matches per row; merge them and strip thousands separators.
combined_sal_groups = sal_cap_groups[0].combine_first(sal_cap_groups[1]).str.replace(',', '')
# Rows with no recognizable range pass through as 0 and end up as <NA> in the
# nullable integer column.
df['formatted_sal'] = (
    combined_sal_groups
    .fillna(0)
    .astype(int)
    .map(format_hourly)
    .replace(0, np.nan)
    .astype('Int64')
)

In [29]:
# Groups the data by city and technology and returns the average salary for each
# pair, highest first, truncated to whole numbers.
grouped = df.groupby(['search_loc', 'search_terms'])
# A (city, technology) group in which no posting has a salary averages to a
# missing value, and casting a missing value to int raises. Drop those groups
# before the cast so the cell still runs when some markets have no salary data.
market_sals = (
    grouped['formatted_sal']
    .mean()
    .sort_values(ascending=False)
    .dropna()
    .astype(int)
)

In [30]:
# Overall posting totals: one tally per technology label and one per market.
pcount_by_tech, pcount_by_loc = (
    df[count_col].value_counts() for count_col in ('search_terms', 'search_loc')
)

In [11]:
# finding salaried ruby jobs in washington
# pcount_by_loc = df[['search_loc','formatted_sal','search_terms']].value_counts()
# dc_counts = pcount_by_loc['Washington, DC'].reset_index()
# dc_counts[pcount_by_loc['Washington, DC'].reset_index()['search_terms'] == 'ruby developer']

In [28]:
# Builds a per-market breakdown with one row per (market, technology) pair:
#   pos_counts_mkt      - number of postings for that technology in that market
#   pos_pcts_mkt        - that count as a percentage of the market's postings
#   pos_overall_mkt_pct - that count as a percentage of all postings
terms_by_mkt = df.groupby(['search_loc'])['search_terms']
mkt_pcts = (terms_by_mkt.value_counts(normalize=True) * 100).round(2)
mkt_counts = terms_by_mkt.value_counts()
pos_metrics_mkt = pd.DataFrame(
    {
        'pos_counts_mkt': mkt_counts,
        'pos_pcts_mkt': mkt_pcts,
        'pos_overall_mkt_pct': (mkt_counts / mkt_counts.sum() * 100).round(2),
    },
    # row order follows the percentage series
    index=mkt_pcts.index,
).reset_index()

In [13]:
# Write the three seed files. All use the same encoding; only the positions
# file is written without its index.
seed_csv_encoding = 'utf-8-sig'
pos_metrics_mkt.to_csv('seed_pos_metrics_mkt.csv', encoding=seed_csv_encoding)
market_sals.to_csv('seed_market_sals.csv', encoding=seed_csv_encoding)
df.to_csv('seed_positions.csv', index=False, encoding=seed_csv_encoding)