## Post-block Assignment 1

In [1]:
# !pip install -Uqq plotly

In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import plotly.express as px
import warnings
warnings.filterwarnings("ignore")
PLOTLY_THEME = "plotly_dark"

## 1. Read the data

In [3]:
def read_data(df):

    df = pd.read_csv(df)
    df.columns = df.columns.str.lower().str.replace(' ', '_') # Ensure all columns follow rules of declaring variables in coding
    df = df.drop(axis=1, columns=['unnamed:_0', 'index'])
    return df

df = read_data(df = "data/DataScientist (1).csv")
print(df.shape)
df.sample(5)

(3909, 15)


Unnamed: 0,job_title,salary_estimate,job_description,rating,company_name,location,headquarters,size,founded,type_of_ownership,industry,sector,revenue,competitors,easy_apply
1579,Lead Data Engineer,$84K-$101K (Glassdoor est.),Job Title\r\nLead Data Engineer - Payment Serv...,3.8,Diversant LLC\r\n3.8,"Glendale, AZ","Red Bank, NJ",1001 to 5000 employees,2005,Company - Private,Staffing & Outsourcing,Business Services,$100 to $500 million (USD),"Kforce, Mitchell Martin, Insight Global",-1
2781,Sr. Data Modeler (Open API),$46K-$91K (Glassdoor est.),"As a senior-level member of technical staff, y...",3.7,Cincinnati Bell Technology Solutions\r\n3.7,"Irving, TX","Cincinnati, OH",501 to 1000 employees,-1,Company - Private,IT Services,Information Technology,$100 to $500 million (USD),-1,-1
1068,Data Scientist - Relocate to Washington D.C.,$90K-$175K (Glassdoor est.),Please note that this role requires relocation...,3.8,CIA\r\n3.8,"Houston, TX","Langley, VA",Unknown,1947,Government,Federal Agencies,Government,Unknown / Non-Applicable,-1,-1
3792,Machine Learning Engineer,$52K-$94K (Glassdoor est.),Tiger Analytics is an advanced analytics consu...,4.6,Tiger Analytics\r\n4.6,"Columbus, OH","Santa Clara, CA",201 to 500 employees,2011,Company - Private,Consulting,Business Services,$10 to $25 million (USD),"Mu Sigma, LatentView Analytics, Fractal",-1
928,Data Analyst,$62K-$109K (Glassdoor est.),Job Description\r\nWe are seeking a resource w...,-1.0,TalentDash,"Chicago, IL","Singapore, Singapore",1 to 50 employees,2014,Company - Private,Enterprise Software & Network Solutions,Information Technology,Unknown / Non-Applicable,-1,-1


--- 

## 2. Data Preprocessing
- This entails data cleaning, extracting features, imputing missing values, remove noise etc

In [4]:
def clean_salary_estimates(df, column_name='salary_estimate'):
    
    df[column_name] = df[column_name].str.replace(r'\(.*\)', '', regex=True)  # Clean the salary_estimate column by removing extra text in parentheses,remove extra text

    # Remove '$' and 'K', then split by '-'
    df[['min_salary_estimate', 'max_salary_estimate']] = df[column_name].str.replace(r'[^\d-]', '', regex=True) \
                                                     .str.split('-', expand=True)
    # Replace 'K' with '000' to convert to actual values
    df['min_salary_estimate'] = df['min_salary_estimate'].str.replace('K', '').astype(int) * 1000
    df['max_salary_estimate'] = df['max_salary_estimate'].str.replace('K', '').astype(int) * 1000

    return df

In [5]:
def split_revenue(revenue):
    if "to" in revenue:
        return revenue.replace(" (USD)", "").split(" to ")
    elif "Less than" in revenue:
        return ["0", revenue.replace("Less than ", "").replace(" (USD)", "")]
    elif "Unknown" in revenue or revenue == "-1":
        return [None, None]
    elif "$10+ billion" in revenue:
        return ["$10 billion", None]  # Assuming $10+ billion is minimum at 10B
    else:
        return [None, None]

# Apply function to create min_revenue and max_revenue columns
df[["min_revenue", "max_revenue"]] = pd.DataFrame(df["revenue"].apply(split_revenue).tolist(), index=df.index)

In [6]:
def classify_size(size):
    """Classify company size based on employee count."""
    if isinstance(size, str):  # Ensure it's a string
        size = size.replace(" employees", "").strip()
        
        if size == "-1":
            return "Unknown"
        elif "10000+" in size:
            return "Large"
        elif " to " in size:
            min_size, max_size = map(int, size.split(" to "))
            if max_size <= 200:
                return "Small"
            elif max_size <= 1000:
                return "Medium"
            else:
                return "Large"
            
        elif size.isdigit():  # Single number case
            num = int(size)
            if num <= 200:
                return "Small"
            elif num <= 1000:
                return "Medium"
            else:
                return "Large"
    
    return "Unknown"

# Apply company size classification
df["company_size_category"] = df["size"].apply(classify_size)

In [7]:
df = df[~df['sector'].str.contains(r'\d', na=False)] # Remove numbers on feature sector
df = clean_salary_estimates(df)
df.sample(4)

Unnamed: 0,job_title,salary_estimate,job_description,rating,company_name,location,headquarters,size,founded,type_of_ownership,industry,sector,revenue,competitors,easy_apply,min_revenue,max_revenue,company_size_category,min_salary_estimate,max_salary_estimate
3499,Scientist III (Molecular Biology),$50K-$110K,"Scientist III (Molecular Biology)\r\nAustin, T...",3.1,"Integrated Resources, Inc\r\n3.1","Austin, TX","Edison, NJ",201 to 500 employees,1996,Company - Private,Staffing & Outsourcing,Business Services,$25 to $50 million (USD),-1,-1,$25,$50 million,Medium,50000,110000
324,Senior Data Scientist,$96K-$119K,Job Description\r\nTitle: Senior Data Scientis...,3.7,Sunrise Systems Inc\r\n3.7,"Newark, NJ","Edison, NJ",501 to 1000 employees,1990,Company - Private,IT Services,Information Technology,$25 to $50 million (USD),-1,-1,$25,$50 million,Medium,96000,119000
3683,"Data Engineer Lead, Machine Learning",$39K-$71K,BICP is partnered with an iconic retail client...,5.0,BICP\r\n5.0,"Fort Worth, TX","Carlsbad, CA",1 to 50 employees,2009,Company - Private,IT Services,Information Technology,$10 to $25 million (USD),-1,-1,$10,$25 million,Small,39000,71000
3849,Data Science Practitioner,$55K-$113K,M LEVEL 9 OTHER The Data Science Specialist (R...,2.5,"Avacend, Inc.\r\n2.5","Columbus, OH","Alpharetta, GA",51 to 200 employees,-1,Company - Private,Staffing & Outsourcing,Business Services,Unknown / Non-Applicable,-1,-1,,,Small,55000,113000


---

<div style="text-align:center; font-size:30px; font-weight:bold;"> Derive insights of strategic value</div>

## Salary By Sector

In [8]:
fig = px.box(data_frame=df, x = "sector", y = "min_salary_estimate")
fig.show()

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

### Number of job posting by location

In [None]:
roles_by_sectors = df.groupby(['sector'])[['job_title']]\
                     .count()\
                     .reset_index()\
                     .sort_values(by='job_title', ascending=False)\
                     .head(10)
                     
roles_by_sectors = roles_by_sectors.rename(columns={'job_title': 'num_of_job_posting'})

fig = px.bar(roles_by_sectors, x = "sector", y = "num_of_job_posting", template=PLOTLY_THEME, title="Number of Job Posting by Sector")
fig.update_layout(title_x = 0.5)
fig.show()

In [10]:
df.head()
df[df.sector  == "Business Services"][["sector", "industry"]]
 


Unnamed: 0,sector,industry
4,Business Services,Advertising & Marketing
15,Business Services,Advertising & Marketing
17,Business Services,Consulting
19,Business Services,Research & Development
24,Business Services,Advertising & Marketing
...,...,...
3881,Business Services,Staffing & Outsourcing
3888,Business Services,Staffing & Outsourcing
3899,Business Services,Staffing & Outsourcing
3900,Business Services,Staffing & Outsourcing


1. High Demand for Data Science in IT and Business Services

- The **`Information Technology (IT)`** sector has the highest number of job postings. This suggests a strong demand for data science professionals in software development, AI, and cloud computing.

- **`Business Services`** is a Major Employer: The second-largest sector is Business Services, including consulting,advertising and marketing, staffing firms, which highlights outsourcing trends.

- Growing Demand in Biotech & Pharmaceuticals: With AI-driven healthcare analytics and drug discovery, data science hiring in Biotech & Pharma is becoming more prominent.

- Finance & Healthcare Are Expanding: Traditional industries such as Finance and Health Care have significant hiring activity, signaling an increasing reliance on data-driven decision-making in risk assessment, fraud detection, and patient care optimization.

- Emerging Sectors (Insurance, Education, Government, Media, Manufacturing): While these sectors have fewer postings compared to IT and Business Services, they represent growth areas where data-driven solutions are being integrated.

2. Strategic Recommendations for Investors & Recruitment Firms

#### For Investors:
- Investment in Data Science Talent Platforms: Given the high demand across industries, investing in AI-driven recruitment platforms tailored for data science roles can be lucrative.

- Targeted Training & Upskilling Platforms: The rise of data science in non-tech sectors (e.g., Healthcare, Finance, Manufacturing) presents an opportunity to invest in industry-specific data science training platforms.

#### For Recruitment Companies:
- `Niche Recruiting for High-Demand Sectors`: Since IT, Business Services, and Finance are major hirers, recruiters should build specialized pipelines for these industries.

#### Data Monetization Strategies

Strategy	Monetization Approach
Recruitment Data Dashboards	Offer subscription-based dashboards for companies to track real-time hiring trends in data science
Industry-Specific Job Boards	Build custom job boards focused on Finance, Biotech, and Healthcare, sectors where specialized hiring is growing
AI-Driven Candidate Matching	Develop a machine learning-based recruitment tool to match job seekers with the most suitable industry roles
Corporate Workforce Planning Insights	Sell industry-specific hiring forecasts to businesses so they can optimize workforce planning
Data Science Salary Benchmarking	Provide premium salary analytics for companies to benchmark compensation trends by industry
Hiring Strategy Consulting	Offer consulting packages to large companies and startups that need data science recruitment expertise









# References: https://www.cobloom.com/careers-blog/data-science-salaries-how-much-can-you-really-earn#:~:text=Some%20sectors%E2%80%94like%20healthcare%2C%20autonomous%20vehicles%2C,both%20pay%20and%20professional%20fulfillment


---

In [None]:
roles_by_company_size = df.groupby(['company_size_category'])[['job_title']]\
                          .count()\
                          .reset_index()\
                          
roles_by_company_size = roles_by_company_size.rename(columns={"job_title": "num_of_job_posting"})                          
                          
roles_by_company_size.head()

In [None]:
fig = px.pie(roles_by_company_size, values='num_of_job_posting', names="company_size_category", template=PLOTLY_THEME)
fig.show()

##### Provide insights
---

#### Get the avg min salary estimate by sector and location

In [None]:

avg_min_salary_by_sector = df.groupby(['sector','location'])[['min_salary_estimate']]\
                             .mean()\
                             .reset_index()\
                             .sort_values(by=['min_salary_estimate'], ascending=False)\
                             .round(2)
	

#print(avg_min_salary_by_sector.head(10)) # Top 10 of most paying sectors by sector and location

fig = px.bar(data_frame=avg_min_salary_by_sector.head(10).sort_values(by='sector', ascending=False),
              x = "sector",
              y = "min_salary_estimate",
              color="location",
              title="Average min salary by sector and location", 
              template=PLOTLY_THEME)
fig.update_layout(title_x=0.5)
fig.show()

## 1. High-Paying Sectors and Locations
- Finance stands out as the sector with the highest minimum salary estimates, particularly in Santa Clara, CA.
- Information Technology (IT), a major sector for data science roles, shows a strong salary offering, particularly in Redwood City, CA.
- Real Estate, Health Care, and Business Services also offer competitive salaries, indicating potential demand for data analytics and AI-driven decision-making.


## 2. Data Science Job Monetization Opportunity
- The Finance and IT sectors are particularly lucrative for data science professionals, making them prime targets for recruitment solutions.
- Santa Clara and Redwood City, CA, seem to be key locations offering high-paying jobs, meaning job postings from these locations could be prioritized for insights and monetization.

---

#### Demand for Data Science Roles by Company Size

In [None]:
demand_by_company_size = df.groupby(['industry', 'company_size_category'])[['job_title']]\
                      .count()\
                      .reset_index()\
                      .sort_values(by = "job_title", ascending=False)

demand_by_company_size.head(10)

In [None]:
#Salary Trends Based on Company Size

avg_min_salary_by_comany_size = df.groupby(['company_size_category'])[['max_salary_estimate']]\
                                  .median()\
                                  .reset_index()\
                                  .round(2)


avg_min_salary_by_comany_size

In [None]:
fig = px.bar(data_frame=avg_min_salary_by_comany_size,
             x='company_size_category', 
             y='max_salary_estimate')
fig.show()

### How many Job Posting by Industry and Location?

In [None]:
roles_by_industry = df.groupby(['industry', 'company_size_category'])[['job_title']]\
                      .count()\
                      .reset_index()\
                      .sort_values(by = "job_title", ascending=False)

In [None]:
fig = px.bar(data_frame=roles_by_industry.head(10), x = "industry",
                                                    y = "job_title",
                                                    color = "company_size_category",
                                                    title="Number of Job Posting by location and industry", 
                                                    template=PLOTLY_THEME)
fig.update_layout(title_x=0.5)
fig.show()