## Post-block Assignment 1

In [None]:
# !pip install -Uqq plotly

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import plotly.express as px
import warnings
warnings.filterwarnings("ignore")
PLOTLY_THEME = "plotly_dark"

## 1. Read the data

In [2]:
def read_data(df):

    df = pd.read_csv(df)
    df.columns = df.columns.str.lower().str.replace(' ', '_') # Ensure all columns follow rules of declaring variables in coding
    df = df.drop(axis=1, columns=['unnamed:_0', 'index'])
    return df

df = read_data(df = "data/DataScientist (1).csv")
print(df.shape)
df.sample(5)

(3909, 15)


Unnamed: 0,job_title,salary_estimate,job_description,rating,company_name,location,headquarters,size,founded,type_of_ownership,industry,sector,revenue,competitors,easy_apply
2285,Data Engineer,$71K-$122K (Glassdoor est.),"ABOUT CUREMETRIX\n\nDelivering CAD that Works,...",4.0,Curemetrix\n4.0,"San Diego, CA","La Jolla, CA",1 to 50 employees,-1,Company - Private,-1,-1,Unknown / Non-Applicable,-1,-1
1484,Data Engineer,$47K-$73K (Glassdoor est.),Position: Data Engineer\nLocation: currently r...,4.3,PeopleNTech LLC\n4.3,"Chandler, AZ","Vienna, VA",1 to 50 employees,-1,Contract,IT Services,Information Technology,$1 to $5 million (USD),-1,-1
623,"Senior Research Scientist I, Data Analytics",$37K-$75K (Glassdoor est.),Kite is continuing to hire for all open roles....,3.4,Kite Pharma\n3.4,"Santa Monica, CA","Santa Monica, CA",1001 to 5000 employees,2008,Company - Public,Biotech & Pharmaceuticals,Biotech & Pharmaceuticals,$10 to $25 million (USD),-1,-1
3336,Machine Learning Engineer (Imaging),$38K-$68K (Glassdoor est.),Position SummarySamsung Austin Semiconductor i...,2.9,Samsung Electronics America Inc\n2.9,"Austin, TX","Ridgefield Park, NJ",1001 to 5000 employees,1938,Subsidiary or Business Segment,Computer Hardware & Software,Information Technology,$10+ billion (USD),-1,-1
1422,Data Engineer,$92K-$144K (Glassdoor est.),Candidate Description Blue Rose Technologies (...,3.2,Blue Rose Technologies LLC\n3.2,"Tempe, AZ","Bengaluru, India",201 to 500 employees,2010,Company - Private,IT Services,Information Technology,Unknown / Non-Applicable,-1,-1


--- 

## 2. Data Preprocessing
- This entails data cleaning, extracting features, imputing missing values, remove noise etc

In [3]:
def clean_salary_estimates(df, column_name='salary_estimate'):
    
    df[column_name] = df[column_name].str.replace(r'\(.*\)', '', regex=True)  # Clean the salary_estimate column by removing extra text in parentheses,remove extra text

    # Remove '$' and 'K', then split by '-'
    df[['min_salary_estimate', 'max_salary_estimate']] = df[column_name].str.replace(r'[^\d-]', '', regex=True) \
                                                     .str.split('-', expand=True)
    # Replace 'K' with '000' to convert to actual values
    df['min_salary_estimate'] = df['min_salary_estimate'].str.replace('K', '').astype(int) * 1000
    df['max_salary_estimate'] = df['max_salary_estimate'].str.replace('K', '').astype(int) * 1000

    return df

In [4]:
def split_revenue(revenue):
    if "to" in revenue:
        return revenue.replace(" (USD)", "").split(" to ")
    elif "Less than" in revenue:
        return ["0", revenue.replace("Less than ", "").replace(" (USD)", "")]
    elif "Unknown" in revenue or revenue == "-1":
        return [None, None]
    elif "$10+ billion" in revenue:
        return ["$10 billion", None]  # Assuming $10+ billion is minimum at 10B
    else:
        return [None, None]

# Apply function to create min_revenue and max_revenue columns
df[["min_revenue", "max_revenue"]] = pd.DataFrame(df["revenue"].apply(split_revenue).tolist(), index=df.index)

In [5]:
df[["job_title","revenue","min_revenue","max_revenue"]].sample(10)

Unnamed: 0,job_title,revenue,min_revenue,max_revenue
3625,Senior Business Intelligence Analyst,$10 to $25 million (USD),$10,$25 million
585,Billingual Chinese - Data Analyst,Unknown / Non-Applicable,,
3095,Investigational Computational Biologist,Unknown / Non-Applicable,,
2215,Senior Data Scientist,$1 to $5 million (USD),$1,$5 million
450,Research Scientist,$2 to $5 billion (USD),$2,$5 billion
3018,Machine Learning Engineer,Unknown / Non-Applicable,,
3401,Data Engineer,Unknown / Non-Applicable,,
1011,Senior Data Analyst,$1 to $2 billion (USD),$1,$2 billion
259,Data Analyst,Less than $1 million (USD),0,$1 million
2205,Research Scientist,-1,,


In [6]:
df = df[~df['sector'].str.contains(r'\d', na=False)] # Remove numbers on feature sector
df = clean_salary_estimates(df)
df.sample(4)

Unnamed: 0,job_title,salary_estimate,job_description,rating,company_name,location,headquarters,size,founded,type_of_ownership,industry,sector,revenue,competitors,easy_apply,min_revenue,max_revenue,min_salary_estimate,max_salary_estimate
483,Data Engineer,$76K-$125K,Title: Data Engineer\nJob ID: TJ3682741021\nLo...,4.7,TEEMA\n4.7,"Los Angeles, CA","Litchfield Park, AZ",201 to 500 employees,2008,Company - Private,Staffing & Outsourcing,Business Services,$50 to $100 million (USD),-1,-1,$50,$100 million,76000,125000
3742,Data Analyst,$100K-$163K,Olive is healthcare’s first intelligent digita...,3.5,Olive\n3.5,"Columbus, OH","Columbus, OH",201 to 500 employees,2012,Company - Private,Enterprise Software & Network Solutions,Information Technology,Unknown / Non-Applicable,-1,-1,,,100000,163000
202,Lead Data Scientist,$119K-$147K,"At Rockstar Games, we create the games we woul...",4.1,Rockstar Games\n4.1,"New York, NY","New York, NY",1001 to 5000 employees,1998,Subsidiary or Business Segment,Video Games,Media,$10 to $25 million (USD),-1,-1,$10,$25 million,119000,147000
2092,Epidemiologist - Data Analyst,$52K-$91K,Instructs all levels of Enterprise-Wide inform...,4.0,"CICONIX, LLC\n4.0","San Antonio, TX","Annapolis, MD",1 to 50 employees,-1,Company - Private,Federal Agencies,Government,Unknown / Non-Applicable,-1,-1,,,52000,91000


---

<div style="text-align:center; font-size:30px; font-weight:bold;"> Derive insights of strategic value</div>

#### Get the avg min salary estimate by sector and location

In [7]:

avg_min_salary_by_sector = df.groupby(['sector','location'])[['min_salary_estimate']]\
                             .mean()\
                             .reset_index()\
                             .sort_values(by=['min_salary_estimate'], ascending=False)\
                             .round(2)
	

#print(avg_min_salary_by_sector.head(10)) # Top 10 of most paying sectors by sector and location

fig = px.bar(data_frame=avg_min_salary_by_sector.head(10).sort_values(by='sector', ascending=False),
              x = "sector",
              y = "min_salary_estimate",
              color="location",
              title="Average min salary by sector and location", 
              template=PLOTLY_THEME)
fig.update_layout(title_x=0.5)
fig.show()

## 1. High-Paying Sectors and Locations
- Finance stands out as the sector with the highest minimum salary estimates, particularly in Santa Clara, CA.
- Information Technology (IT), a major sector for data science roles, shows a strong salary offering, particularly in Redwood City, CA.
- Real Estate, Health Care, and Business Services also offer competitive salaries, indicating potential demand for data analytics and AI-driven decision-making.


## 2. Data Science Job Monetization Opportunity
- The Finance and IT sectors are particularly lucrative for data science professionals, making them prime targets for recruitment solutions.
- Santa Clara and Redwood City, CA, seem to be key locations offering high-paying jobs, meaning job postings from these locations could be prioritized for insights and monetization.

---

### Company Size Analysis

In [8]:
def classify_size(size):
    """Classify company size based on employee count."""
    if isinstance(size, str):  # Ensure it's a string
        size = size.replace(" employees", "").strip()
        
        if size == "-1":
            return "Unknown"
        elif "10000+" in size:
            return "Large"
        elif " to " in size:
            min_size, max_size = map(int, size.split(" to "))
            if max_size <= 200:
                return "Small"
            elif max_size <= 1000:
                return "Medium"
            else:
                return "Large"
            
        elif size.isdigit():  # Single number case
            num = int(size)
            if num <= 200:
                return "Small"
            elif num <= 1000:
                return "Medium"
            else:
                return "Large"
    
    return "Unknown"

# Apply company size classification
df["company_size_category"] = df["size"].apply(classify_size)


In [9]:
df[["size", "company_size_category"]].sample(10)

Unnamed: 0,size,company_size_category
961,501 to 1000 employees,Medium
3256,1 to 50 employees,Small
1986,1 to 50 employees,Small
3537,10000+ employees,Large
1453,10000+ employees,Large
2994,1 to 50 employees,Small
1941,10000+ employees,Large
1992,501 to 1000 employees,Medium
1197,201 to 500 employees,Medium
2661,201 to 500 employees,Medium


#### Demand for Data Science Roles by Company Size

In [None]:
demand_by_company_size = df.groupby(['industry', 'company_size_category'])[['job_title']]\
                      .count()\
                      .reset_index()\
                      .sort_values(by = "job_title", ascending=False)

demand_by_company_size.head(10)

In [None]:
#Salary Trends Based on Company Size

avg_min_salary_by_comany_size = df.groupby(['company_size_category'])[['max_salary_estimate']]\
                                  .median()\
                                  .reset_index()\
                                  .round(2)


avg_min_salary_by_comany_size

In [None]:
fig = px.bar(data_frame=avg_min_salary_by_comany_size,
             x='company_size_category', 
             y='max_salary_estimate')
fig.show()

### How many Job Posting by Industry and Location?

In [None]:
roles_by_industry = df.groupby(['industry', 'company_size_category'])[['job_title']]\
                      .count()\
                      .reset_index()\
                      .sort_values(by = "job_title", ascending=False)

In [None]:
fig = px.bar(data_frame=roles_by_industry.head(10), x = "industry",
                                                    y = "job_title",
                                                    color = "company_size_category",
                                                    title="Number of Job Posting by location and industry", 
                                                    template=PLOTLY_THEME)
fig.update_layout(title_x=0.5)
fig.show()