## Post-block Assignment 1

In [1]:
!pip install -Uqq plotly
!pip install -Uqq nbformat

In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import plotly.express as px
import warnings
warnings.filterwarnings("ignore")
PLOTLY_THEME = "plotly_dark"

## 1. Read the data

In [3]:
def read_data(df):

    df = pd.read_csv(df)
    df.columns = df.columns.str.lower().str.replace(' ', '_') # Ensure all columns follow rules of declaring variables in coding
    df = df.drop(axis=1, columns=['unnamed:_0', 'index'])
    return df

df = read_data(df = "data/DataScientist (1).csv")
print(df.shape)
df.sample(5)

(3909, 15)


Unnamed: 0,job_title,salary_estimate,job_description,rating,company_name,location,headquarters,size,founded,type_of_ownership,industry,sector,revenue,competitors,easy_apply
179,"Data Analyst, Bitcoin Trading Firm",$156K-$254K (Glassdoor est.),Our client is an innovative bitcoin marketplac...,-1.0,Fintech Recruiters,"New York, NY","Dublin, Ireland",Unknown,-1,Company - Private,-1,-1,Unknown / Non-Applicable,-1,-1
2013,ANALYST - SCIENTIST - KINETICS,$74K-$140K (Glassdoor est.),Join the Applied Power Division and work on di...,4.1,Southwest Research Institute\n4.1,"San Antonio, TX","San Antonio, TX",1001 to 5000 employees,1947,Nonprofit Organization,Research & Development,Business Services,$500 million to $1 billion (USD),"Los Alamos National Laboratory, Battelle, SRI ...",-1
3683,"Data Engineer Lead, Machine Learning",$39K-$71K (Glassdoor est.),BICP is partnered with an iconic retail client...,5.0,BICP\n5.0,"Fort Worth, TX","Carlsbad, CA",1 to 50 employees,2009,Company - Private,IT Services,Information Technology,$10 to $25 million (USD),-1,-1
30,Data Scientist,$120K-$140K (Glassdoor est.),Caserta is a best-in-class Data Analytics cons...,4.3,Caserta\n4.3,"New York, NY","New York, NY",51 to 200 employees,2001,Company - Private,IT Services,Information Technology,Unknown / Non-Applicable,-1,-1
2304,Sr. Data Analyst,$39K-$69K (Glassdoor est.),"At Housecall Pro, we're focused on making the ...",4.6,Housecall Pro\n4.6,"San Diego, CA","San Diego, CA",51 to 200 employees,2013,Company - Private,Computer Hardware & Software,Information Technology,Unknown / Non-Applicable,-1,-1


--- 

## 2. Data Preprocessing
- This entails data cleaning, extracting features, imputing missing values, remove noise etc

In [4]:
def clean_salary_estimates(df, column_name='salary_estimate'):
    
    df[column_name] = df[column_name].str.replace(r'\(.*\)', '', regex=True)  # Clean the salary_estimate column by removing extra text in parentheses,remove extra text

    # Remove '$' and 'K', then split by '-'
    df[['min_salary_estimate', 'max_salary_estimate']] = df[column_name].str.replace(r'[^\d-]', '', regex=True) \
                                                     .str.split('-', expand=True)
    # Replace 'K' with '000' to convert to actual values
    df['min_salary_estimate'] = df['min_salary_estimate'].str.replace('K', '').astype(int) * 1000
    df['max_salary_estimate'] = df['max_salary_estimate'].str.replace('K', '').astype(int) * 1000

    return df

In [5]:
def split_revenue(revenue):
    if "to" in revenue:
        return revenue.replace(" (USD)", "").split(" to ")
    elif "Less than" in revenue:
        return ["0", revenue.replace("Less than ", "").replace(" (USD)", "")]
    elif "Unknown" in revenue or revenue == "-1":
        return [None, None]
    elif "$10+ billion" in revenue:
        return ["$10 billion", None]  # Assuming $10+ billion is minimum at 10B
    else:
        return [None, None]

# Apply function to create min_revenue and max_revenue columns
df[["min_revenue", "max_revenue"]] = pd.DataFrame(df["revenue"].apply(split_revenue).tolist(), index=df.index)

In [6]:
def classify_size(size):
    """Classify company size based on employee count."""
    if isinstance(size, str):  # Ensure it's a string
        size = size.replace(" employees", "").strip()
        
        if size == "-1":
            return "Unknown"
        elif "10000+" in size:
            return "Large"
        elif " to " in size:
            min_size, max_size = map(int, size.split(" to "))
            if max_size <= 200:
                return "Small"
            elif max_size <= 1000:
                return "Medium"
            else:
                return "Large"
            
        elif size.isdigit():  # Single number case
            num = int(size)
            if num <= 200:
                return "Small"
            elif num <= 1000:
                return "Medium"
            else:
                return "Large"
    
    return "Unknown"

# Apply company size classification
df["company_size_category"] = df["size"].apply(classify_size)

In [7]:
df[['location_city', 'location_state']] = df['location'].str.split(',', expand=True)
df = df[~df['sector'].str.contains(r'\d', na=False)] # Remove numbers on feature sector
df = clean_salary_estimates(df)
df.sample(4)

Unnamed: 0,job_title,salary_estimate,job_description,rating,company_name,location,headquarters,size,founded,type_of_ownership,...,revenue,competitors,easy_apply,min_revenue,max_revenue,company_size_category,location_city,location_state,min_salary_estimate,max_salary_estimate
1055,Quantitative Research Scientist,$90K-$175K,To support the continued growth of our large-s...,3.7,Quantlab\n3.7,"Houston, TX","Houston, TX",51 to 200 employees,1998,Company - Private,...,Unknown / Non-Applicable,-1,-1,,,Small,Houston,TX,90000,175000
2533,Data Scientist - NLP/Machine Learning,$134K-$210K,POSITION SUMMARY:\nPRIMARY DUTIES AND RESPONSI...,3.1,AmerisourceBergen\n3.1,"Carrollton, TX","Conshohocken, PA",10000+ employees,2001,Company - Public,...,$10+ billion (USD),"McKesson, Cardinal Health, Kinray",-1,$10 billion,,Large,Carrollton,TX,134000,210000
2060,Clinical Laboratory Scientist Team Lead,$83K-$154K,Description\n\nSHIFT: Weekends Only\n\nSCHEDUL...,3.4,Methodist Hospital Stone Oak\n3.4,"San Antonio, TX","San Antonio, TX",5001 to 10000 employees,1995,Company - Private,...,Unknown / Non-Applicable,-1,-1,,,Large,San Antonio,TX,83000,154000
2052,Sr. Scientist/Engineer,$83K-$154K,Are you ready to take the next step in your ca...,3.7,"Tetra Tech, Inc.\n3.7","San Antonio, TX","Pasadena, CA",10000+ employees,1966,Company - Public,...,$2 to $5 billion (USD),-1,-1,$2,$5 billion,Large,San Antonio,TX,83000,154000


---

<div style="text-align:center; font-size:30px; font-weight:bold;"> 3. Derive insights of strategic value</div>

In [8]:
roles_by_sectors = df.groupby(['sector'])[['job_title']]\
                     .count()\
                     .reset_index()\
                     .sort_values(by='job_title', ascending=False)\
                     .head(10)
                     
roles_by_sectors = roles_by_sectors.rename(columns={'job_title': 'num_of_job_posting'})

fig = px.bar(roles_by_sectors, x = "sector", y = "num_of_job_posting", template=PLOTLY_THEME, title="Number of Job Posting by Sector")
fig.update_layout(title_x = 0.5)
fig.show()

### High Demand for Data Science in IT and Business Services

- The **`Information Technology (IT)`** sector has the highest number of job postings. This suggests a strong demand for data science professionals in software development, AI, and cloud computing.

- **`Business Services`** is a Major Employer: The second-largest sector is Business Services, including consulting,advertising and marketing, staffing firms, which highlights outsourcing trends.

- Growing Demand in Biotech & Pharmaceuticals: With AI-driven healthcare analytics and drug discovery, data science hiring in Biotech & Pharma is becoming more prominent.

- Finance & Healthcare Are Expanding: Traditional industries such as Finance and Health Care have significant hiring activity, signaling an increasing reliance on data-driven decision-making in risk assessment, fraud detection, and patient care optimization.

- Emerging Sectors (Insurance, Education, Government, Media, Manufacturing): While these sectors have fewer postings compared to IT and Business Services, they represent growth areas where data-driven solutions are being integrated.

2. Strategic Recommendations for Investors & Recruitment Firms

#### For Investors:
- Investment in Data Science Talent Platforms: Given the high demand across industries, investing in AI-driven recruitment platforms tailored for data science roles can be lucrative.

- Targeted Training & Upskilling Platforms: The rise of data science in non-tech sectors (e.g., Healthcare, Finance, Manufacturing) presents an opportunity to invest in industry-specific data science training platforms.

#### For Recruitment Companies:
- `Niche Recruiting for High-Demand Sectors`: Since IT, Business Services, and Finance are major hirers, recruiters should build specialized pipelines for these industries.

#### Data Monetization Strategies

Strategy	Monetization Approach
| **Strategy**                            | **Monetization Approach**                                                                                   |
|-----------------------------------------|------------------------------------------------------------------------------------------------------------|
| **Recruitment Data Dashboards**         | Offer subscription-based dashboards for companies to track real-time hiring trends in data science          |
| **Industry-Specific Job Boards**        | Build custom job boards focused on Finance, Biotech, and Healthcare, sectors where specialized hiring is growing |
| **AI-Driven Candidate Matching**        | Develop a machine learning-based recruitment tool to match job seekers with the most suitable industry roles |
| **Corporate Workforce Planning Insights**| Sell industry-specific hiring forecasts to businesses so they can optimize workforce planning               |
| **Data Science Salary Benchmarking**    | Provide premium salary analytics for companies to benchmark compensation trends by industry                |
| **Hiring Strategy Consulting**          | Offer consulting packages to large companies and startups that need data science recruitment expertise      |










# References:
- https://www.cobloom.com/careers-blog/data-science-salaries-how-much-can-you-really-earn#:~:text=Some%20sectors%E2%80%94like%20healthcare%2C%20autonomous%20vehicles%2C,both%20pay%20and%20professional%20fulfillment



---

In [9]:
roles_by_company_size = df.groupby(['company_size_category'])[['job_title']]\
                          .count()\
                          .reset_index()\
                          
roles_by_company_size = roles_by_company_size.rename(columns={"job_title": "num_of_job_posting"})                          
                          
roles_by_company_size.head()

Unnamed: 0,company_size_category,num_of_job_posting
0,Large,1738
1,Medium,696
2,Small,914
3,Unknown,15


In [10]:
fig = px.pie(roles_by_company_size, values='num_of_job_posting', names="company_size_category", template=PLOTLY_THEME,title="Company Size Distribution")
fig.update_layout(title_x = 0.5)
fig.show()

#### For Investors:
- Invest in AI-powered Recruitment Solutions for Large Enterprises

- Given that large corporations dominate hiring (51.7%), investing in data-driven recruitment platforms (e.g., AI resume screening, automated job matching, and workforce analytics) can be highly lucrative.

- **`Support Recruitment Marketplaces for Startups & SMBs:`** Startups and small businesses (27.2%) need fast and cost-effective hiring solutions.

- Investing in niche job boards or AI-driven applicant tracking systems (ATS) focused on startups can generate subscription revenue.


#### For Recruitment Company:

### Large Companies: 

- Large firms require structured recruitment pipelines and talent retention strategies, this can be leverage by building a Machine Learning model to get the probablity of an employee to exit the company. 

- Recruitment agencies should offer enterprise-level consulting services, including workforce planning, salary benchmarking, and AI-driven hiring solutions.

### Small Companies: 

- Small companies and startups need quick hiring solutions but often struggle to attract top-tier data talent.

- Recruitment firms can monetize this by creating targeted talent pools, offering AI-driven job matching, or running specialized job fairs.

### Mid-Sized Firms: 

- Medium-sized companies need cost-effective hiring solutions but are less structured than large firms.
- Offering AI-driven hiring analytics and recruitment process outsourcing (RPO) services can help them compete with larger employers.
 
---

### Location Based Insights

In [11]:
roles_by_location_state = df.groupby(['location_state'])['job_title']\
                            .count()\
                            .reset_index()\
                            .sort_values(by='job_title', ascending=False)

roles_by_location_state = roles_by_location_state.rename(columns={"job_title": "num_of_jobs_by_state"})

fig = px.bar(data_frame=roles_by_location_state, x="location_state", y="num_of_jobs_by_state", template=PLOTLY_THEME, title="Job Posting By State")
fig.update_layout(xaxis_title="Location State",
                  yaxis_title="Numer of Job Posting",
                  title_x = 0.5)
fig.show()

#### Geographic Demand Concentration:

- Texas (TX) and California (CA) have the highest number of job postings, suggesting that these states are key markets for data science talent.

- Other states like Illinois (IL), Pennsylvania (PA), New York (NY), and Arizona (AZ) also show moderate demand, but significantly lower than TX and CA.

- There are a few international postings (e.g., United Kingdom), indicating some global hiring trends.

#### Market Gaps and Expansion Potential:

- States like Florida (FL), New Jersey (NJ), and Delaware (DE) have minimal job postings, suggesting either a lack of demand or untapped potential.

- This insight can be used to predict where data science roles might expand in the future, based on industry growth trends.

Monetize through certifications, bootcamps, and employer-sponsored training courses.

----

In [12]:
# demand_by_company_size = df.groupby(['industry', 'company_size_category'])[['job_title']]\
#                       .count()\
#                       .reset_index()\
#                       .sort_values(by = "job_title", ascending=False)

# demand_by_company_size.head(10)

In [13]:
# #Salary Trends Based on Company Size

# avg_min_salary_by_comany_size = df.groupby(['company_size_category'])[['max_salary_estimate']]\
#                                   .median()\
#                                   .reset_index()\
#                                   .round(2)


# avg_min_salary_by_comany_size

### How many Job Posting by Industry and Location?

In [14]:
# roles_by_industry = df.groupby(['industry', 'company_size_category'])[['job_title']]\
#                       .count()\
#                       .reset_index()\
#                       .sort_values(by = "job_title", ascending=False)


# fig = px.bar(data_frame=roles_by_industry.head(10), x = "industry",
#                                                     y = "job_title",
#                                                     color = "company_size_category",
#                                                     title="Number of Job Posting by location and industry", 
#                                                     template=PLOTLY_THEME)
# fig.update_layout(title_x=0.5)
# fig.show()