## Post-block Assignment 1

In [37]:
# %pip (unlike !pip) installs into the environment of the running kernel.
%pip install -Uqq plotly

In [38]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import plotly.express as px
import warnings
# Silence every warning category for the rest of the session.
warnings.simplefilter("ignore")

# Plotly template name handed to each figure in this notebook.
PLOTLY_THEME = "plotly_dark"

## 1. Read the data

In [39]:
def read_data(df):
    """Load the job-postings CSV and normalise its column names.

    Parameters
    ----------
    df : str, path-like or file-like
        Source handed straight to ``pd.read_csv``. The parameter keeps the
        name ``df`` so existing keyword calls (``read_data(df=...)``) still
        work, even though it is a location and not a DataFrame.

    Returns
    -------
    pandas.DataFrame
        The file contents with lower-cased, underscore-separated column
        names and without the leftover ``unnamed:_0`` / ``index`` columns.
    """
    frame = pd.read_csv(df)

    # Lower-case the headers and swap spaces for underscores so every column
    # name is a valid Python identifier.
    frame.columns = frame.columns.str.lower().str.replace(' ', '_')

    # These two columns are index artefacts of an earlier export. Ignoring
    # missing labels lets the loader also accept files that never had them,
    # instead of failing with a KeyError.
    return frame.drop(columns=['unnamed:_0', 'index'], errors='ignore')

# Load the postings file, then report its dimensions and preview the first rows.
df = read_data("data/DataScientist (1).csv")
print(df.shape)
df.head(n=3)

(3909, 15)


Unnamed: 0,job_title,salary_estimate,job_description,rating,company_name,location,headquarters,size,founded,type_of_ownership,industry,sector,revenue,competitors,easy_apply
0,Senior Data Scientist,$111K-$181K (Glassdoor est.),"ABOUT HOPPER\n\nAt Hopper, we’re on a mission ...",3.5,Hopper\n3.5,"New York, NY","Montreal, Canada",501 to 1000 employees,2007,Company - Private,Travel Agencies,Travel & Tourism,Unknown / Non-Applicable,-1,-1
1,"Data Scientist, Product Analytics",$111K-$181K (Glassdoor est.),"At Noom, we use scientifically proven methods ...",4.5,Noom US\n4.5,"New York, NY","New York, NY",1001 to 5000 employees,2008,Company - Private,"Health, Beauty, & Fitness",Consumer Services,Unknown / Non-Applicable,-1,-1
2,Data Science Manager,$111K-$181K (Glassdoor est.),Decode_M\n\nhttps://www.decode-m.com/\n\nData ...,-1.0,Decode_M,"New York, NY","New York, NY",1 to 50 employees,-1,Unknown,-1,-1,Unknown / Non-Applicable,-1,True


In [40]:
# Show the posting whose index label is 3827.
df.loc[df.index == 3827]

Unnamed: 0,job_title,salary_estimate,job_description,rating,company_name,location,headquarters,size,founded,type_of_ownership,industry,sector,revenue,competitors,easy_apply
3827,Data Engineer,$55K-$113K (Glassdoor est.),"Greetings,\n\nJob Title- Data Engineer ( Netez...",-1.0,R3 Tek,"Columbus, OH",-1,-1,-1,-1,-1,-1,-1,-1,-1


--- 

## 2. Data Preprocessing
- This entails cleaning the data, extracting features, imputing missing values, removing noise, etc.

In [41]:
def clean_salary_estimates(df, column_name='salary_estimate'):
    """Parse a salary-range text column into numeric lower and upper bounds.

    The column is expected to hold strings such as
    ``"$111K-$181K (Glassdoor est.)"``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``column_name``. It is left untouched; the work is
        done on a copy, so slices of another frame can be passed safely.
    column_name : str, default 'salary_estimate'
        Name of the text column holding the salary range.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in which

        * ``column_name`` has any parenthesised text removed, and
        * ``min_salary_estimate`` / ``max_salary_estimate`` hold the two
          bounds as integers multiplied by 1000 (the figures carry a ``K``
          suffix).

    Notes
    -----
    Every value must reduce to exactly two digit groups separated by ``-``.
    Anything else makes the integer cast or the two-column assignment raise.
    """
    # Work on a copy so the caller's frame (possibly a filtered slice) is
    # never modified behind its back.
    df = df.copy()

    # Drop the "(Glassdoor est.)"-style annotation.
    df[column_name] = df[column_name].str.replace(r'\(.*\)', '', regex=True)

    # Keep only digits and the dash ('$', 'K' and spaces all disappear here),
    # then split the range into its two ends.
    bounds = df[column_name].str.replace(r'[^\d-]', '', regex=True) \
                            .str.split('-', expand=True)

    # The figures are quoted in thousands, so scale them to absolute amounts.
    df[['min_salary_estimate', 'max_salary_estimate']] = bounds.astype(int) * 1000

    return df

In [42]:
# Drop postings whose sector contains a digit (for example the "-1" placeholder).
# .copy() turns the filtered slice into an independent frame, so the columns
# added by clean_salary_estimates are not written into a view of the old one.
df = df[~df['sector'].str.contains(r'\d', na=False)].copy()
df = clean_salary_estimates(df)

# A fixed seed keeps the previewed rows identical across re-runs.
df.sample(4, random_state=42)

Unnamed: 0,job_title,salary_estimate,job_description,rating,company_name,location,headquarters,size,founded,type_of_ownership,industry,sector,revenue,competitors,easy_apply,min_salary_estimate,max_salary_estimate
3509,Ad Integrity Data Analyst-ATX,$50K-$110K,Ad Integrity Data Analyst-ATX\nAustinOperation...,3.8,ByteDance\n3.8,"Austin, TX","Beijing, China",10000+ employees,2012,Company - Private,Internet,Information Technology,Unknown / Non-Applicable,-1,-1,50000,110000
3609,Data Architect II,$54K-$88K,Education\nDegree Type Major/Certification Req...,4.3,Abacus Service Corporation\n4.3,"Jacksonville, FL","Southfield, MI",501 to 1000 employees,2004,Company - Private,Staffing & Outsourcing,Business Services,Unknown / Non-Applicable,-1,-1,54000,88000
462,"Research Scientist, USC Schaeffer Center",$134K-$218K,In order for your application to be considered...,4.2,University of Southern California\n4.2,"Los Angeles, CA","Los Angeles, CA",10000+ employees,1880,College / University,Colleges & Universities,Education,$2 to $5 billion (USD),-1,-1,134000,218000
60,Data Scientist - Machine Learning Platform,$119K-$147K,The Platform team creates the technology that ...,3.8,Spotify\n3.8,"New York, NY","Stockholm, Sweden",1001 to 5000 employees,2006,Company - Public,Internet,Information Technology,Unknown / Non-Applicable,-1,-1,119000,147000


---

<div style="text-align:center; font-size:30px; font-weight:bold;"> Derive insights of strategic value</div>

#### Get the avg min salary estimate by sector and location

In [43]:

# Mean minimum salary for every (sector, location) pair, highest first.
avg_min_salary_by_sector = (
    df.groupby(['sector', 'location'])[['min_salary_estimate']]
      .mean()
      .reset_index()
      .sort_values(by=['min_salary_estimate'], ascending=False)
      .round(2)
)

# Plot the ten best-paying (sector, location) pairs, ordered by sector name.
fig = px.bar(
    data_frame=avg_min_salary_by_sector.head(10).sort_values(by='sector', ascending=False),
    x="sector",
    y="min_salary_estimate",
    color="location",
    # Each bar is an average. The default stacked mode would add the averages
    # of different locations on top of each other within a sector, so draw
    # them side by side instead.
    barmode="group",
    title="Average min salary by sector and location",
    template=PLOTLY_THEME,
)
fig.update_layout(title_x=0.5)
fig.show()

## 1. High-Paying Sectors and Locations
- Finance stands out as the sector with the highest minimum salary estimates, particularly in Santa Clara, CA.
- Information Technology (IT), a major sector for data science roles, shows a strong salary offering, particularly in Redwood City, CA.
- Real Estate, Health Care, and Business Services also offer competitive salaries, indicating potential demand for data analytics and AI-driven decision-making.


## 2. Data Science Job Monetization Opportunity
- The Finance and IT sectors are particularly lucrative for data science professionals, making them prime targets for recruitment solutions.
- Santa Clara and Redwood City, CA, seem to be key locations offering high-paying jobs, meaning job postings from these locations could be prioritized for insights and monetization.

---

### Company Size Analysis

**Company Size and Salaries**
- Large corporations (\$10B+ revenue) like Gilead Sciences and Tata Elxsi offer high salaries.
- Startups or smaller companies (\$10M–\$500M revenue) tend to offer lower salaries but may provide more flexible hiring options.
- **Insight:** A recruitment platform could categorize jobs by company size, helping job seekers choose between salary and career growth opportunities.

In [44]:
def classify_size(size):
    """Map a company-size label to a coarse category.

    Parameters
    ----------
    size : object
        Label such as ``"51 to 200 employees"`` or ``"10000+ employees"``.

    Returns
    -------
    str
        * ``"Large"``   - the label contains ``"10000+"``;
        * ``"Small"``   - an ``"N to M employees"`` range with ``M <= 200``;
        * ``"Medium"``  - any other ``"N to M employees"`` range;
        * ``"Unknown"`` - everything else, including non-string values
          (NaN, None) and text that is not a numeric range.
    """
    # Missing values reach .apply() as float NaN or None; they carry no size.
    if not isinstance(size, str):
        return "Unknown"

    if "10000+" in size:
        return "Large"

    try:
        # "51 to 200 employees" -> ["51", "200"]; only the upper bound matters.
        _, max_size = map(int, size.replace(" employees", "").split(" to "))
    except ValueError:
        # Not exactly two integer bounds (e.g. "-1", "Unknown").
        return "Unknown"

    # 200 employees is the cut-off between "Small" and "Medium".
    return "Small" if max_size <= 200 else "Medium"

# Derive a coarse size category for each posting from its "size" label.
df["company_size_category"] = df["size"].map(classify_size)

In [45]:
# Spot-check the classification. A fixed seed keeps the preview reproducible.
df[["size", "company_size_category"]].sample(10, random_state=42)

Unnamed: 0,size,company_size_category
506,1 to 50 employees,Small
3839,201 to 500 employees,Medium
2406,10000+ employees,Large
3574,51 to 200 employees,Small
56,1001 to 5000 employees,Medium
395,1001 to 5000 employees,Medium
2431,10000+ employees,Large
2234,10000+ employees,Large
1483,201 to 500 employees,Medium
3147,5001 to 10000 employees,Medium


### How many job postings by industry and company size?

In [48]:
# Count the postings in every (industry, company size) group, largest first.
# The count ends up in the "job_title" column.
roles_by_industry = (
    df.groupby(['industry', 'company_size_category'])['job_title']
      .count()
      .reset_index()
      .sort_values("job_title", ascending=False)
)

roles_by_industry.head(10)

Unnamed: 0,industry,company_size_category,job_title
101,IT Services,Medium,210
102,IT Services,Small,171
173,Staffing & Outsourcing,Medium,169
20,Biotech & Pharmaceuticals,Large,125
174,Staffing & Outsourcing,Small,121
21,Biotech & Pharmaceuticals,Medium,111
113,Internet,Large,105
38,Computer Hardware & Software,Small,102
94,Health Care Services & Hospitals,Medium,91
100,IT Services,Large,90


In [51]:
# Bar chart of the ten largest (industry, company size) groups.
fig = px.bar(
    data_frame=roles_by_industry.head(10),
    x="industry",
    y="job_title",
    color="company_size_category",
    # "job_title" holds the per-group posting count, so give the axis and
    # legend readable names.
    labels={
        "job_title": "number of job postings",
        "company_size_category": "company size",
    },
    # The data is grouped by industry and company size, not by location.
    title="Number of job postings by industry and company size",
    template=PLOTLY_THEME,
)
fig.update_layout(title_x=0.5)
fig.show()