## Post-block Assignment 1

In [None]:
# !pip install -Uqq plotly

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import plotly.express as px
import warnings
warnings.filterwarnings("ignore")
PLOTLY_THEME = "plotly_dark"

## 1. Read the data

In [2]:
def read_data(df):

    df = pd.read_csv(df)
    df.columns = df.columns.str.lower().str.replace(' ', '_') # Ensure all columns follow rules of declaring variables in coding
    df = df.drop(axis=1, columns=['unnamed:_0', 'index'])
    return df

df = read_data(df = "data/DataScientist (1).csv")
print(df.shape)
df.sample(5)

(3909, 15)


Unnamed: 0,job_title,salary_estimate,job_description,rating,company_name,location,headquarters,size,founded,type_of_ownership,industry,sector,revenue,competitors,easy_apply
1700,Data Engineer,$44K-$80K (Glassdoor est.),The Philadelphia District Attorneys Office (DA...,3.6,Philadelphia District Attorney's Office\n3.6,"Philadelphia, PA","Philadelphia, PA",501 to 1000 employees,-1,Government,Municipal Governments,Government,Unknown / Non-Applicable,-1,-1
1240,Data Engineer,$72K-$97K (Glassdoor est.),"DATA ENGINEER – HOUSTON, TX\n\nWho We Are\n\nA...",-1.0,m1neral,"Houston, TX","Houston, TX",Unknown,-1,Company - Private,-1,-1,Unknown / Non-Applicable,-1,-1
456,"Business Analytics Manager, Enterprise Marketing",$134K-$218K (Glassdoor est.),The Analytics team at Miro has a clear mission...,4.5,Miro\n4.5,"Los Angeles, CA","San Francisco, CA",201 to 500 employees,2015,Company - Private,Computer Hardware & Software,Information Technology,Unknown / Non-Applicable,"Trello, Slack, Lucid Software",True
749,Data Scientist - Life Sciences,$79K-$130K (Glassdoor est.),Data Scientist \- Life Sciences\n*Requisition ...,4.3,Argonne National Laboratory\n4.3,"Lemont, IL","Lemont, IL",1001 to 5000 employees,1946,Nonprofit Organization,Federal Agencies,Government,Unknown / Non-Applicable,"Oak Ridge National Laboratory, Lawrence Liverm...",-1
2121,Sr. Tech Svc Scientist (I),$43K-$76K (Glassdoor est.),"Company Description\n\nSince time began, the e...",-1.0,The Nature’s Bounty Co,"San Antonio, TX",-1,-1,-1,-1,-1,-1,-1,-1,-1


--- 

## 2. Data Preprocessing
- This entails data cleaning, extracting features, imputing missing values, remove noise etc

In [3]:
def clean_salary_estimates(df, column_name='salary_estimate'):
    
    df[column_name] = df[column_name].str.replace(r'\(.*\)', '', regex=True)  # Clean the salary_estimate column by removing extra text in parentheses,remove extra text

    # Remove '$' and 'K', then split by '-'
    df[['min_salary_estimate', 'max_salary_estimate']] = df[column_name].str.replace(r'[^\d-]', '', regex=True) \
                                                     .str.split('-', expand=True)
    # Replace 'K' with '000' to convert to actual values
    df['min_salary_estimate'] = df['min_salary_estimate'].str.replace('K', '').astype(int) * 1000
    df['max_salary_estimate'] = df['max_salary_estimate'].str.replace('K', '').astype(int) * 1000

    return df

In [4]:
def split_revenue(revenue):
    if "to" in revenue:
        return revenue.replace(" (USD)", "").split(" to ")
    elif "Less than" in revenue:
        return ["0", revenue.replace("Less than ", "").replace(" (USD)", "")]
    elif "Unknown" in revenue or revenue == "-1":
        return [None, None]
    elif "$10+ billion" in revenue:
        return ["$10 billion", None]  # Assuming $10+ billion is minimum at 10B
    else:
        return [None, None]

# Apply function to create min_revenue and max_revenue columns
df[["min_revenue", "max_revenue"]] = pd.DataFrame(df["revenue"].apply(split_revenue).tolist(), index=df.index)

In [7]:
def classify_size(size):
    """Classify company size based on employee count."""
    if isinstance(size, str):  # Ensure it's a string
        size = size.replace(" employees", "").strip()
        
        if size == "-1":
            return "Unknown"
        elif "10000+" in size:
            return "Large"
        elif " to " in size:
            min_size, max_size = map(int, size.split(" to "))
            if max_size <= 200:
                return "Small"
            elif max_size <= 1000:
                return "Medium"
            else:
                return "Large"
            
        elif size.isdigit():  # Single number case
            num = int(size)
            if num <= 200:
                return "Small"
            elif num <= 1000:
                return "Medium"
            else:
                return "Large"
    
    return "Unknown"

# Apply company size classification
df["company_size_category"] = df["size"].apply(classify_size)

In [8]:
df = df[~df['sector'].str.contains(r'\d', na=False)] # Remove numbers on feature sector
df = clean_salary_estimates(df)
df.sample(4)

Unnamed: 0,job_title,salary_estimate,job_description,rating,company_name,location,headquarters,size,founded,type_of_ownership,industry,sector,revenue,competitors,easy_apply,min_revenue,max_revenue,company_size_category,min_salary_estimate,max_salary_estimate
2244,"Scientist, Histology",$83K-$165K,Job Description\nThe Position\n\nThe Scientist...,3.5,Arrowhead Pharmaceuticals\n3.5,"San Diego, CA","Pasadena, CA",51 to 200 employees,2004,Company - Public,Biotech & Pharmaceuticals,Biotech & Pharmaceuticals,Unknown / Non-Applicable,-1,-1,,,Small,83000,165000
384,Data Analyst,$113K-$180K,"Job Description\nWe hiring a Data Analyst, for...",-1.0,People Tree,"Hermosa Beach, CA","Hermosa Beach, CA",1 to 50 employees,2013,Self-employed,Staffing & Outsourcing,Business Services,Unknown / Non-Applicable,-1,-1,,,Small,113000,180000
2305,Fleet Optimization Data Analyst (Solar + Storage),$39K-$69K,Fleet Optimization Data Analyst (Solar + Stora...,3.2,Stem\n3.2,"San Diego, CA","Millbrae, CA",51 to 200 employees,2009,Company - Private,Energy,"Oil, Gas, Energy & Utilities",$25 to $50 million (USD),-1,-1,$25,$50 million,Small,39000,69000
1130,"Senior Scientist, BioAssay Services",$36K-$71K,Today Lonza is a global leader in life science...,3.3,Lonza\n3.3,"Houston, TX","Basel, Switzerland",10000+ employees,1897,Company - Public,Biotech & Pharmaceuticals,Biotech & Pharmaceuticals,$5 to $10 billion (USD),"Perrigo, Covance, DSM",-1,$5,$10 billion,Large,36000,71000


---

<div style="text-align:center; font-size:30px; font-weight:bold;"> Derive insights of strategic value</div>

## Salary By Sector

In [9]:
fig = px.box(data_frame=df, x = "sector", y = "min_salary_estimate")
fig.show()

### Number of job posting by location

In [10]:
roles_by_sectors = df.groupby(['sector'])[['job_title']]\
                     .count()\
                     .reset_index()\
                     .sort_values(by='job_title', ascending=False)\
                     .head(10)
                     
roles_by_sectors = roles_by_sectors.rename(columns={'job_title': 'num_of_job_posting'})

fig = px.bar(roles_by_sectors, x = "sector", y = "num_of_job_posting", template=PLOTLY_THEME, title="Number of Job Posting by Sector")
fig.update_layout(title_x = 0.5)
fig.show()

1. High Demand for Data Science in IT and Business Services

- The **`Information Technology (IT)`** sector has the highest number of job postings. This suggests a strong demand for data science professionals in software development, AI, and cloud computing.
- **`Business Services`** also show high demands for Data Science and related field.



# References:

---

In [None]:
roles_by_company_size = df.groupby(['company_size_category'])[['job_title']]\
                          .count()\
                          .reset_index()\
                          
roles_by_company_size = roles_by_company_size.rename(columns={"job_title": "num_of_job_posting"})                          
                          
roles_by_company_size.head()

In [None]:
fig = px.pie(roles_by_company_size, values='num_of_job_posting', names="company_size_category", template=PLOTLY_THEME)
fig.show()

##### Provide insights
---

#### Get the avg min salary estimate by sector and location

In [None]:

avg_min_salary_by_sector = df.groupby(['sector','location'])[['min_salary_estimate']]\
                             .mean()\
                             .reset_index()\
                             .sort_values(by=['min_salary_estimate'], ascending=False)\
                             .round(2)
	

#print(avg_min_salary_by_sector.head(10)) # Top 10 of most paying sectors by sector and location

fig = px.bar(data_frame=avg_min_salary_by_sector.head(10).sort_values(by='sector', ascending=False),
              x = "sector",
              y = "min_salary_estimate",
              color="location",
              title="Average min salary by sector and location", 
              template=PLOTLY_THEME)
fig.update_layout(title_x=0.5)
fig.show()

## 1. High-Paying Sectors and Locations
- Finance stands out as the sector with the highest minimum salary estimates, particularly in Santa Clara, CA.
- Information Technology (IT), a major sector for data science roles, shows a strong salary offering, particularly in Redwood City, CA.
- Real Estate, Health Care, and Business Services also offer competitive salaries, indicating potential demand for data analytics and AI-driven decision-making.


## 2. Data Science Job Monetization Opportunity
- The Finance and IT sectors are particularly lucrative for data science professionals, making them prime targets for recruitment solutions.
- Santa Clara and Redwood City, CA, seem to be key locations offering high-paying jobs, meaning job postings from these locations could be prioritized for insights and monetization.

---

#### Demand for Data Science Roles by Company Size

In [None]:
demand_by_company_size = df.groupby(['industry', 'company_size_category'])[['job_title']]\
                      .count()\
                      .reset_index()\
                      .sort_values(by = "job_title", ascending=False)

demand_by_company_size.head(10)

In [None]:
#Salary Trends Based on Company Size

avg_min_salary_by_comany_size = df.groupby(['company_size_category'])[['max_salary_estimate']]\
                                  .median()\
                                  .reset_index()\
                                  .round(2)


avg_min_salary_by_comany_size

In [None]:
fig = px.bar(data_frame=avg_min_salary_by_comany_size,
             x='company_size_category', 
             y='max_salary_estimate')
fig.show()

### How many Job Posting by Industry and Location?

In [None]:
roles_by_industry = df.groupby(['industry', 'company_size_category'])[['job_title']]\
                      .count()\
                      .reset_index()\
                      .sort_values(by = "job_title", ascending=False)

In [None]:
fig = px.bar(data_frame=roles_by_industry.head(10), x = "industry",
                                                    y = "job_title",
                                                    color = "company_size_category",
                                                    title="Number of Job Posting by location and industry", 
                                                    template=PLOTLY_THEME)
fig.update_layout(title_x=0.5)
fig.show()