## Post-block Assignment 1

In [11]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import plotly.express as px
import warnings
warnings.filterwarnings("ignore")

In [12]:
def read_data(df):
    """Load a CSV file and normalise its column names.

    Parameters
    ----------
    df : str, path-like or file-like
        Source handed straight to ``pd.read_csv``. Despite its name this is
        the CSV location, not a DataFrame; the name is kept so existing
        keyword callers (``read_data(df=...)``) continue to work.

    Returns
    -------
    pandas.DataFrame
        The loaded data with lower-case, underscore-separated column names
        and without the ``unnamed:_0`` / ``index`` helper columns.
    """
    frame = pd.read_csv(df)

    # Lower-case names with underscores instead of spaces so that every
    # column follows the rules for declaring variables in code.
    frame.columns = frame.columns.str.lower().str.replace(' ', '_')

    # errors='ignore': a file that lacks one of these helper columns should
    # still load instead of failing with a KeyError.
    return frame.drop(columns=['unnamed:_0', 'index'], errors='ignore')

In [13]:
# Load the raw postings file, report its dimensions, then preview the first rows.
csv_path = "data/DataScientist (1).csv"
df = read_data(df=csv_path)
print(df.shape)
df.head()

(3909, 15)


Unnamed: 0,job_title,salary_estimate,job_description,rating,company_name,location,headquarters,size,founded,type_of_ownership,industry,sector,revenue,competitors,easy_apply
0,Senior Data Scientist,$111K-$181K (Glassdoor est.),"ABOUT HOPPER\n\nAt Hopper, we’re on a mission ...",3.5,Hopper\n3.5,"New York, NY","Montreal, Canada",501 to 1000 employees,2007,Company - Private,Travel Agencies,Travel & Tourism,Unknown / Non-Applicable,-1,-1
1,"Data Scientist, Product Analytics",$111K-$181K (Glassdoor est.),"At Noom, we use scientifically proven methods ...",4.5,Noom US\n4.5,"New York, NY","New York, NY",1001 to 5000 employees,2008,Company - Private,"Health, Beauty, & Fitness",Consumer Services,Unknown / Non-Applicable,-1,-1
2,Data Science Manager,$111K-$181K (Glassdoor est.),Decode_M\n\nhttps://www.decode-m.com/\n\nData ...,-1.0,Decode_M,"New York, NY","New York, NY",1 to 50 employees,-1,Unknown,-1,-1,Unknown / Non-Applicable,-1,True
3,Data Analyst,$111K-$181K (Glassdoor est.),Sapphire Digital seeks a dynamic and driven mi...,3.4,Sapphire Digital\n3.4,"Lyndhurst, NJ","Lyndhurst, NJ",201 to 500 employees,2019,Company - Private,Internet,Information Technology,Unknown / Non-Applicable,"Zocdoc, Healthgrades",-1
4,"Director, Data Science",$111K-$181K (Glassdoor est.),"Director, Data Science - (200537)\nDescription...",3.4,United Entertainment Group\n3.4,"New York, NY","New York, NY",51 to 200 employees,2007,Company - Private,Advertising & Marketing,Business Services,Unknown / Non-Applicable,"BBDO, Grey Group, Droga5",-1


In [14]:
def clean_salary_estimates(df, column_name='salary_estimate'):
    """Clean a salary-range text column and derive numeric lower/upper bounds.

    A value such as ``"$111K-$181K (Glassdoor est.)"`` is reduced to
    ``"$111K-$181K"`` and split into ``min_salary_estimate`` and
    ``max_salary_estimate`` (the first and second number of the range,
    each multiplied by 1000).

    The input frame is not modified; a cleaned copy is returned. This keeps
    the function safe to call on a filtered slice and safe to re-run on its
    own output.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the raw salary text.
    column_name : str, default 'salary_estimate'
        Name of the string column to clean.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``column_name`` cleaned and the two bound
        columns added. The bounds are plain integers when every row parses;
        if some rows hold no ``<number>-<number>`` range, those rows get
        ``<NA>`` and the columns use the nullable ``Int64`` dtype.
    """
    df = df.copy()

    # Drop the parenthesised source note together with the whitespace in
    # front of it, so no trailing blank is left behind.
    cleaned = df[column_name].str.replace(r'\s*\(.*\)', '', regex=True).str.strip()
    df[column_name] = cleaned

    # Capture the two numbers around the dash; symbols such as '$' and 'K'
    # are skipped by \D*. Rows without a range yield NaN in both columns.
    bounds = cleaned.str.extract(r'(\d+)\D*-\D*(\d+)')

    # NOTE(review): every number is scaled by 1000 whether or not it carried
    # a 'K' suffix (same as before). Ranges quoted in other units would need
    # separate handling; confirm against the data.
    for target, group in (('min_salary_estimate', 0), ('max_salary_estimate', 1)):
        values = pd.to_numeric(bounds[group], errors='coerce') * 1000
        if values.isna().any():
            df[target] = values.astype('Int64')
        else:
            df[target] = values.astype(int)

    return df

In [15]:
# Derive the numeric salary bounds from the salary text, then preview the result.
df = df.pipe(clean_salary_estimates)
df.head()

Unnamed: 0,job_title,salary_estimate,job_description,rating,company_name,location,headquarters,size,founded,type_of_ownership,industry,sector,revenue,competitors,easy_apply,min_salary_estimate,max_salary_estimate
0,Senior Data Scientist,$111K-$181K,"ABOUT HOPPER\n\nAt Hopper, we’re on a mission ...",3.5,Hopper\n3.5,"New York, NY","Montreal, Canada",501 to 1000 employees,2007,Company - Private,Travel Agencies,Travel & Tourism,Unknown / Non-Applicable,-1,-1,111000,181000
1,"Data Scientist, Product Analytics",$111K-$181K,"At Noom, we use scientifically proven methods ...",4.5,Noom US\n4.5,"New York, NY","New York, NY",1001 to 5000 employees,2008,Company - Private,"Health, Beauty, & Fitness",Consumer Services,Unknown / Non-Applicable,-1,-1,111000,181000
2,Data Science Manager,$111K-$181K,Decode_M\n\nhttps://www.decode-m.com/\n\nData ...,-1.0,Decode_M,"New York, NY","New York, NY",1 to 50 employees,-1,Unknown,-1,-1,Unknown / Non-Applicable,-1,True,111000,181000
3,Data Analyst,$111K-$181K,Sapphire Digital seeks a dynamic and driven mi...,3.4,Sapphire Digital\n3.4,"Lyndhurst, NJ","Lyndhurst, NJ",201 to 500 employees,2019,Company - Private,Internet,Information Technology,Unknown / Non-Applicable,"Zocdoc, Healthgrades",-1,111000,181000
4,"Director, Data Science",$111K-$181K,"Director, Data Science - (200537)\nDescription...",3.4,United Entertainment Group\n3.4,"New York, NY","New York, NY",51 to 200 employees,2007,Company - Private,Advertising & Marketing,Business Services,Unknown / Non-Applicable,"BBDO, Grey Group, Droga5",-1,111000,181000


In [16]:
# Drop rows whose sector contains a digit (the preview above shows '-1' used
# as a placeholder sector). Rows with a missing sector are kept (na=False).
# .copy() makes the result an independent frame so that later column
# assignments do not act on a flagged slice of the original.
df = df[~df['sector'].str.contains(r'\d', na=False)].copy()

In [17]:
# The salary columns were already derived before the sector filter, and
# filtering rows does not change them, so the cleaning step is not repeated.
# A fixed random_state keeps this preview identical across re-runs.
df.sample(5, random_state=42)

Unnamed: 0,job_title,salary_estimate,job_description,rating,company_name,location,headquarters,size,founded,type_of_ownership,industry,sector,revenue,competitors,easy_apply,min_salary_estimate,max_salary_estimate
1162,Product and Data Analyst,$55K-$117K,"Entera, where residential real estate investin...",-1.0,Entera,"Houston, TX","New York, NY",Unknown,-1,Company - Private,Real Estate,Real Estate,Unknown / Non-Applicable,-1,-1,55000,117000
3548,RESEARCH COMPUTER SCIENTIST - RESEARCH ENGINEE...,$62K-$112K,Join the exciting new area of developing the a...,4.1,Southwest Research Institute\n4.1,"Austin, TX","San Antonio, TX",1001 to 5000 employees,1947,Nonprofit Organization,Research & Development,Business Services,$500 million to $1 billion (USD),"Los Alamos National Laboratory, Battelle, SRI ...",-1,62000,112000
527,Data Engineer,$132K-$208K,Pluto TV is the leading free streaming televis...,3.6,ViacomCBS\n3.6,"Los Angeles, CA","New York, NY",10000+ employees,2019,Company - Public,Motion Picture Production & Distribution,Media,$10+ billion (USD),"NBCUniversal, WarnerMedia, Netflix",-1,132000,208000
524,Sr. Data Scientist,$132K-$208K,**CANDIDATES WILL NOT BE CONSIDERED WITHOUT AN...,3.7,"Icon Media Direct, Inc.\n3.7","Sherman Oaks, CA","Van Nuys, CA",51 to 200 employees,2000,Company - Private,Advertising & Marketing,Business Services,$5 to $10 million (USD),-1,-1,132000,208000
1618,Data Engineer I,$93K-$151K,We are looking for a motivated Data Engineer w...,3.1,Integrated Resources\n3.1,"Philadelphia, PA","Edison, NJ",201 to 500 employees,1996,Company - Private,Staffing & Outsourcing,Business Services,$25 to $50 million (USD),-1,-1,93000,151000


###
#
# Get the company size, by industry



In [28]:
# roles_by_industry = df.groupby(['industry', 'type_of_ownership'])[['job_title']]\
#                       .count()\
#                       .reset_index()\
#                       .sort_values(by = "job_title", ascending=False)


# roles_by_industry.head(10)

### How many job postings by industry and location?

In [None]:
# Count the postings in every (industry, location) pair; the count is stored
# in the 'job_title' column. Largest groups come first.
posting_counts = df.groupby(['industry', 'location'])[['job_title']].count()

roles_by_industry = (
    posting_counts
    .reset_index()
    .sort_values(by="job_title", ascending=False)
)

roles_by_industry.head(10)

Unnamed: 0,industry,location,job_title
132,Biotech & Pharmaceuticals,"San Diego, CA",91
182,Computer Hardware & Software,"Austin, TX",53
585,Internet,"New York, NY",49
504,IT Services,"Phoenix, AZ",42
476,IT Services,"Chicago, IL",42
212,Computer Hardware & Software,"Santa Clara, CA",42
468,IT Services,"Austin, TX",40
490,IT Services,"Irving, TX",39
481,IT Services,"Dallas, TX",35
125,Biotech & Pharmaceuticals,"Philadelphia, PA",34


In [None]:
# Bar chart of the 20 largest (industry, location) groups, coloured by location.
fig = px.bar(
    data_frame=roles_by_industry.head(20),
    x="industry",
    y="job_title",
    color="location",
    # 'job_title' holds the number of postings per group at this point, so
    # relabel the axis to say what is actually plotted.
    labels={"job_title": "number of postings"},
    title="Number of Job Postings by Location and Industry",
)
fig.show()

In [None]:
# Mean of the lower salary bound per sector, highest first, rounded to 2 decimals.
sector_min_salary_means = df.groupby(['sector'])[['min_salary_estimate']].mean()

avg_min_salary_by_sector = (
    sector_min_salary_means
    .reset_index()
    .sort_values(by=['min_salary_estimate'], ascending=False)
    .round(2)
)

avg_min_salary_by_sector.head(10)

In [None]:
#
#
#
#