## Post-block Assignment 1

In [4]:
# !pip install -Uqq plotly
# !pip install -Uqq nbformat

In [5]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import plotly.express as px
import warnings
warnings.filterwarnings("ignore")
PLOTLY_THEME = "plotly_dark"

## 1. Read the data

In [6]:
def read_data(df):

    df = pd.read_csv(df)
    df.columns = df.columns.str.lower().str.replace(' ', '_') # Ensure all columns follow rules of declaring variables in coding
    df = df.drop(axis=1, columns=['unnamed:_0', 'index'])
    return df

df = read_data(df = "data/DataScientist (1).csv")
print(df.shape)
df.sample(5)

(3909, 15)


Unnamed: 0,job_title,salary_estimate,job_description,rating,company_name,location,headquarters,size,founded,type_of_ownership,industry,sector,revenue,competitors,easy_apply
1927,Data Engineer (Python and SQL),$55K-$101K (Glassdoor est.),Job Description\nPosition: Python Data Enginee...,5.0,"Queen Consulting Group, Inc.\n5.0","Philadelphia, PA","Boston, MA",51 to 200 employees,-1,Unknown,-1,-1,$5 to $10 million (USD),-1,-1
3617,Senior Environmental Scientist / Ecologist,$54K-$88K (Glassdoor est.),Senior Environmental Scientist / Ecologist - E...,2.7,"Groundwater & Environmental Services, Inc.\n2.7","Jacksonville, FL","Wall Township, NJ",501 to 1000 employees,1985,Company - Private,Architectural & Engineering Services,Business Services,$50 to $100 million (USD),-1,-1
107,Data Scientist / Statistician,$102K-$121K (Glassdoor est.),Job title: Data Scientist / Statistician\nJob ...,4.2,NJF Global Holdings\n4.2,"New York, NY","London, United Kingdom",51 to 200 employees,2003,Company - Private,Staffing & Outsourcing,Business Services,$10 to $25 million (USD),-1,True
3400,Jr. Data Engineer,$82K-$129K(Employer est.),"Jr. Data Engineer\n\nAustin, TX\n\nIPT's Techn...",3.6,IPT Associates\n3.6,"Austin, TX","Billerica, MA",51 to 200 employees,1992,Company - Private,Consulting,Business Services,Unknown / Non-Applicable,-1,-1
1497,Senior Thermal Modeling Engineer,$47K-$73K (Glassdoor est.),You’re passionate about a zero-emissions futur...,4.6,Nikola Motor Company\n4.6,"Phoenix, AZ","Phoenix, AZ",201 to 500 employees,2014,Company - Public,Transportation Equipment Manufacturing,Manufacturing,Less than $1 million (USD),-1,-1


--- 

## 2. Data Preprocessing
- This entails data cleaning, extracting features, imputing missing values, remove noise etc

In [7]:
def clean_salary_estimates(df, column_name='salary_estimate'):
    
    df[column_name] = df[column_name].str.replace(r'\(.*\)', '', regex=True)  # Clean the salary_estimate column by removing extra text in parentheses,remove extra text

    # Remove '$' and 'K', then split by '-'
    df[['min_salary_estimate', 'max_salary_estimate']] = df[column_name].str.replace(r'[^\d-]', '', regex=True) \
                                                     .str.split('-', expand=True)
    # Replace 'K' with '000' to convert to actual values
    df['min_salary_estimate'] = df['min_salary_estimate'].str.replace('K', '').astype(int) * 1000
    df['max_salary_estimate'] = df['max_salary_estimate'].str.replace('K', '').astype(int) * 1000

    return df

In [8]:
def split_revenue(revenue):
    if "to" in revenue:
        return revenue.replace(" (USD)", "").split(" to ")
    elif "Less than" in revenue:
        return ["0", revenue.replace("Less than ", "").replace(" (USD)", "")]
    elif "Unknown" in revenue or revenue == "-1":
        return [None, None]
    elif "$10+ billion" in revenue:
        return ["$10 billion", None]  # Assuming $10+ billion is minimum at 10B
    else:
        return [None, None]

# Apply function to create min_revenue and max_revenue columns
df[["min_revenue", "max_revenue"]] = pd.DataFrame(df["revenue"].apply(split_revenue).tolist(), index=df.index)

In [9]:
def classify_size(size):
    """Classify company size based on employee count."""
    if isinstance(size, str):  # Ensure it's a string
        size = size.replace(" employees", "").strip()
        
        if size == "-1":
            return "Unknown"
        elif "10000+" in size:
            return "Large"
        elif " to " in size:
            min_size, max_size = map(int, size.split(" to "))
            if max_size <= 200:
                return "Small"
            elif max_size <= 1000:
                return "Medium"
            else:
                return "Large"
            
        elif size.isdigit():  # Single number case
            num = int(size)
            if num <= 200:
                return "Small"
            elif num <= 1000:
                return "Medium"
            else:
                return "Large"
    
    return "Unknown"

# Apply company size classification
df["company_size_category"] = df["size"].apply(classify_size)

In [10]:
df = df[~df['sector'].str.contains(r'\d', na=False)] # Remove numbers on feature sector
df = clean_salary_estimates(df)
df.sample(4)

Unnamed: 0,job_title,salary_estimate,job_description,rating,company_name,location,headquarters,size,founded,type_of_ownership,industry,sector,revenue,competitors,easy_apply,min_revenue,max_revenue,company_size_category,min_salary_estimate,max_salary_estimate
3032,Senior Data Scientist (San Jose OR Austin),$135K-$214K,Power the Possibilities\nThe CDK Global techno...,3.1,CDK Global\n3.1,"San Jose, CA","Hoffman Estates, IL",5001 to 10000 employees,2014,Company - Public,Computer Hardware & Software,Information Technology,$2 to $5 billion (USD),"Reynolds and Reynolds, Cox Automotive, Dealer.com",-1,$2,$5 billion,Large,135000,214000
2640,Data Engineer,$85K-$159K,Data Engineer As a Junior Big Data Engineer Da...,3.4,IQuest Solutions Corp\n3.4,"Plano, TX","Waukee, IA",1 to 50 employees,-1,Company - Private,IT Services,Information Technology,Less than $1 million (USD),-1,-1,0,$1 million,Small,85000,159000
2043,Research Scientist II,$83K-$154K,Description\n\nJob Description:\nLeidos' Bio-b...,3.5,Leidos\n3.5,"San Antonio, TX","Reston, VA",10000+ employees,1969,Company - Public,Aerospace & Defense,Aerospace & Defense,$10+ billion (USD),-1,-1,$10 billion,,Large,83000,154000
648,Business Intelligence Analyst,$113K-$184K,Meet REVOLVE:\n\nREVOLVE is the next-generatio...,3.4,"EMINENT, INC.\n3.4","Cerritos, CA","Williamsville, NY",1 to 50 employees,-1,Company - Private,IT Services,Information Technology,$1 to $5 million (USD),-1,-1,$1,$5 million,Small,113000,184000


---

<div style="text-align:center; font-size:30px; font-weight:bold;"> Derive insights of strategic value</div>

## Salary By Sector

In [11]:
# fig = px.box(data_frame=df, x = "sector", y = "min_salary_estimate")
# fig.show()

### Number of job posting by location

In [12]:
roles_by_sectors = df.groupby(['sector'])[['job_title']]\
                     .count()\
                     .reset_index()\
                     .sort_values(by='job_title', ascending=False)\
                     .head(10)
                     
roles_by_sectors = roles_by_sectors.rename(columns={'job_title': 'num_of_job_posting'})

fig = px.bar(roles_by_sectors, x = "sector", y = "num_of_job_posting", template=PLOTLY_THEME, title="Number of Job Posting by Sector")
fig.update_layout(title_x = 0.5)
fig.show()

In [13]:
df.head()
df[df.sector  == "Business Services"][["sector", "industry"]]
 


Unnamed: 0,sector,industry
4,Business Services,Advertising & Marketing
15,Business Services,Advertising & Marketing
17,Business Services,Consulting
19,Business Services,Research & Development
24,Business Services,Advertising & Marketing
...,...,...
3881,Business Services,Staffing & Outsourcing
3888,Business Services,Staffing & Outsourcing
3899,Business Services,Staffing & Outsourcing
3900,Business Services,Staffing & Outsourcing


1. High Demand for Data Science in IT and Business Services

- The **`Information Technology (IT)`** sector has the highest number of job postings. This suggests a strong demand for data science professionals in software development, AI, and cloud computing.

- **`Business Services`** is a Major Employer: The second-largest sector is Business Services, including consulting,advertising and marketing, staffing firms, which highlights outsourcing trends.

- Growing Demand in Biotech & Pharmaceuticals: With AI-driven healthcare analytics and drug discovery, data science hiring in Biotech & Pharma is becoming more prominent.

- Finance & Healthcare Are Expanding: Traditional industries such as Finance and Health Care have significant hiring activity, signaling an increasing reliance on data-driven decision-making in risk assessment, fraud detection, and patient care optimization.

- Emerging Sectors (Insurance, Education, Government, Media, Manufacturing): While these sectors have fewer postings compared to IT and Business Services, they represent growth areas where data-driven solutions are being integrated.

2. Strategic Recommendations for Investors & Recruitment Firms

#### For Investors:
- Investment in Data Science Talent Platforms: Given the high demand across industries, investing in AI-driven recruitment platforms tailored for data science roles can be lucrative.

- Targeted Training & Upskilling Platforms: The rise of data science in non-tech sectors (e.g., Healthcare, Finance, Manufacturing) presents an opportunity to invest in industry-specific data science training platforms.

#### For Recruitment Companies:
- `Niche Recruiting for High-Demand Sectors`: Since IT, Business Services, and Finance are major hirers, recruiters should build specialized pipelines for these industries.

#### Data Monetization Strategies

Strategy	Monetization Approach

Recruitment Data Dashboards	Offer subscription-based dashboards for companies to track real-time hiring trends in data science

Industry-Specific Job Boards	Build custom job boards focused on Finance, Biotech, and Healthcare, sectors where specialized hiring is growing

AI-Driven Candidate Matching	Develop a machine learning-based recruitment tool to match job seekers with the most suitable industry roles

Corporate Workforce Planning Insights	Sell industry-specific hiring forecasts to businesses so they can optimize workforce planning

Data Science Salary Benchmarking	Provide premium salary analytics for companies to benchmark compensation trends by industry

Hiring Strategy Consulting	Offer consulting packages to large companies and startups that need data science recruitment expertise









# References:
- https://www.cobloom.com/careers-blog/data-science-salaries-how-much-can-you-really-earn#:~:text=Some%20sectors%E2%80%94like%20healthcare%2C%20autonomous%20vehicles%2C,both%20pay%20and%20professional%20fulfillment



---

In [14]:
roles_by_company_size = df.groupby(['company_size_category'])[['job_title']]\
                          .count()\
                          .reset_index()\
                          
roles_by_company_size = roles_by_company_size.rename(columns={"job_title": "num_of_job_posting"})                          
                          
roles_by_company_size.head()

Unnamed: 0,company_size_category,num_of_job_posting
0,Large,1738
1,Medium,696
2,Small,914
3,Unknown,15


In [26]:
fig = px.pie(roles_by_company_size, values='num_of_job_posting', names="company_size_category", template=PLOTLY_THEME,title="Company Size Distribution")
fig.update_layout(title_x = 0.5)
fig.show()

#### For Investors:
- Invest in AI-powered Recruitment Solutions for Large Enterprises
- Given that large corporations dominate hiring (51.7%), investing in data-driven recruitment platforms (e.g., AI resume screening, automated job matching, and workforce analytics) can be highly lucrative.

- `**Support Recruitment Marketplaces for Startups & SMBs:**` Startups and small businesses (27.2%) need fast and cost-effective hiring solutions.
- Investing in niche job boards or AI-driven applicant tracking systems (ATS) focused on startups can generate subscription revenue.



---

#### Get the avg min salary estimate by sector and location

In [17]:

avg_min_salary_by_sector = df.groupby(['sector','location'])[['min_salary_estimate']]\
                             .mean()\
                             .reset_index()\
                             .sort_values(by=['min_salary_estimate'], ascending=False)\
                             .round(2)
	

#print(avg_min_salary_by_sector.head(10)) # Top 10 of most paying sectors by sector and location

fig = px.bar(data_frame=avg_min_salary_by_sector.head(10).sort_values(by='sector', ascending=False),
              x = "sector",
              y = "min_salary_estimate",
              color="location",
              title="Average min salary by sector and location", 
              template=PLOTLY_THEME)
fig.update_layout(title_x=0.5)
fig.show()

## 1. High-Paying Sectors and Locations
- Finance stands out as the sector with the highest minimum salary estimates, particularly in Santa Clara, CA.
- Information Technology (IT), a major sector for data science roles, shows a strong salary offering, particularly in Redwood City, CA.
- Real Estate, Health Care, and Business Services also offer competitive salaries, indicating potential demand for data analytics and AI-driven decision-making.


## 2. Data Science Job Monetization Opportunity
- The Finance and IT sectors are particularly lucrative for data science professionals, making them prime targets for recruitment solutions.
- Santa Clara and Redwood City, CA, seem to be key locations offering high-paying jobs, meaning job postings from these locations could be prioritized for insights and monetization.

---

#### Demand for Data Science Roles by Company Size

In [18]:
demand_by_company_size = df.groupby(['industry', 'company_size_category'])[['job_title']]\
                      .count()\
                      .reset_index()\
                      .sort_values(by = "job_title", ascending=False)

demand_by_company_size.head(10)

Unnamed: 0,industry,company_size_category,job_title
20,Biotech & Pharmaceuticals,Large,173
105,IT Services,Small,171
103,IT Services,Large,160
95,Health Care Services & Hospitals,Large,151
115,Internet,Large,148
104,IT Services,Medium,140
37,Computer Hardware & Software,Large,131
179,Staffing & Outsourcing,Small,121
177,Staffing & Outsourcing,Large,104
39,Computer Hardware & Software,Small,102


In [19]:
#Salary Trends Based on Company Size

avg_min_salary_by_comany_size = df.groupby(['company_size_category'])[['max_salary_estimate']]\
                                  .median()\
                                  .reset_index()\
                                  .round(2)


avg_min_salary_by_comany_size

Unnamed: 0,company_size_category,max_salary_estimate
0,Large,136000.0
1,Medium,122000.0
2,Small,139000.0
3,Unknown,149000.0


In [20]:
fig = px.bar(data_frame=avg_min_salary_by_comany_size,
             x='company_size_category', 
             y='max_salary_estimate')
fig.show()

### How many Job Posting by Industry and Location?

In [21]:
roles_by_industry = df.groupby(['industry', 'company_size_category'])[['job_title']]\
                      .count()\
                      .reset_index()\
                      .sort_values(by = "job_title", ascending=False)

In [22]:
fig = px.bar(data_frame=roles_by_industry.head(10), x = "industry",
                                                    y = "job_title",
                                                    color = "company_size_category",
                                                    title="Number of Job Posting by location and industry", 
                                                    template=PLOTLY_THEME)
fig.update_layout(title_x=0.5)
fig.show()