In [2]:
import pandas as pd
import matplotlib.pyplot as plt
from plotly import express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [3]:
df=pd.read_csv('Data Science Jobs preprocessed5000.csv')
df.drop("Unnamed: 0",axis=1,inplace=True)
df.head()

Unnamed: 0,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Size,Founded,Type of ownership,Industry,...,Machine Learning,SQL,Computer Science,Deep Learning,Big Data/Spark,Visualization Tool,Data Science,Experience,Grade Required,Company Age
0,Data Analytics,$38.00 - $45.00 Per Hour (Employer est.),Minimum three years experience in data science...,3.9,J & S Consulting,"Tucker, GA",51 to 200 Employees,--,Company - Private,Information Technology Support Services,...,0,1,0,0,0,0,1,3,0,-1
1,Data Scientist,$120K - $130K (Employer est.),Data science: 1 year (Preferred). Expertise in...,4.0,element technologies,Remote,51 to 200 Employees,2000,Company - Private,Information Technology Support Services,...,1,0,0,0,0,0,1,1,0,24
2,Data Scientist,$66.82 - $80.54 Per Hour (Employer est.),End-to-end data and model pipeline deployment ...,4.4,Boston Technology,"Malvern, PA",51 to 200 Employees,2004,Company - Private,Information Technology Support Services,...,1,0,0,0,1,0,0,-1,0,20
3,Data Engineer,$105K - $160K (Employer est.),Communicates technical concepts to non-technic...,3.9,Costco Wholesale,"Dallas, TX",10000+ Employees,1976,Company - Public,General Merchandise & Superstores,...,1,0,0,0,0,0,0,-1,0,48
4,Data Science Co-Op,$84K - $109K (Glassdoor est.),Work with source control tools like GIT to sav...,4.1,Hunter Engineering Company,"Bridgeton, MO",1001 to 5000 Employees,1946,Company - Private,Machinery Manufacturing,...,1,0,1,0,0,0,0,-1,1,78


In [4]:
df.columns

Index(['Job Title', 'Salary Estimate', 'Job Description', 'Rating',
       'Company Name', 'Location', 'Size', 'Founded', 'Type of ownership',
       'Industry', 'Sector', 'Revenue', 'URL', 'Glassdoor Estimate',
       'Employeer Estimate', 'Per Hour', 'Minimum Salary', 'Maximum Salary',
       'Average Salary', 'State', 'Refined Job Title', 'Seniority', 'Analysis',
       'Machine Learning', 'SQL', 'Computer Science', 'Deep Learning',
       'Big Data/Spark', 'Visualization Tool', 'Data Science', 'Experience',
       'Grade Required', 'Company Age'],
      dtype='object')

In [5]:
df.drop(["Job Title","Salary Estimate","Job Description","Location","Founded","URL"],axis=1,inplace=True)

In [6]:
df.head()

Unnamed: 0,Rating,Company Name,Size,Type of ownership,Industry,Sector,Revenue,Glassdoor Estimate,Employeer Estimate,Per Hour,...,Machine Learning,SQL,Computer Science,Deep Learning,Big Data/Spark,Visualization Tool,Data Science,Experience,Grade Required,Company Age
0,3.9,J & S Consulting,51 to 200 Employees,Company - Private,Information Technology Support Services,Information Technology,$5 to $25 million (USD),0,1,1,...,0,1,0,0,0,0,1,3,0,-1
1,4.0,element technologies,51 to 200 Employees,Company - Private,Information Technology Support Services,Information Technology,Unknown / Non-Applicable,0,1,0,...,1,0,0,0,0,0,1,1,0,24
2,4.4,Boston Technology,51 to 200 Employees,Company - Private,Information Technology Support Services,Information Technology,$5 to $25 million (USD),0,1,1,...,1,0,0,0,1,0,0,-1,0,20
3,3.9,Costco Wholesale,10000+ Employees,Company - Public,General Merchandise & Superstores,Retail & Wholesale,Unknown / Non-Applicable,0,1,0,...,1,0,0,0,0,0,0,-1,0,48
4,4.1,Hunter Engineering Company,1001 to 5000 Employees,Company - Private,Machinery Manufacturing,Manufacturing,$500 million to $1 billion (USD),1,0,0,...,1,0,1,0,0,0,0,-1,1,78


# Salary And Job

In [7]:
min_salary = df.groupby("Refined Job Title")["Minimum Salary"].mean().sort_values(ascending=False)
avg_salary = df.groupby("Refined Job Title")["Average Salary"].mean().sort_values(ascending=False)
max_salary = df.groupby("Refined Job Title")["Maximum Salary"].mean().sort_values(ascending=False)

fig = make_subplots(rows=1, cols=3, subplot_titles=("Starting Salaries", "Average Salaries", "Maximum Salaries Being Offered"))
fig.add_trace(
    go.Bar(x=min_salary.index, y=min_salary.values, marker_color="blue", name="Starting Salaries"),
    row=1, col=1
)

fig.add_trace(
    go.Bar(x=avg_salary.index, y=avg_salary.values, marker_color="green", name="Average Salaries"),
    row=1, col=2
)

fig.add_trace(
    go.Bar(x=max_salary.index, y=max_salary.values, marker_color="red", name="Maximum Salaries Being Offered"),
    row=1, col=3
)
fig.update_layout(
    height=500,
    width=1200,
    showlegend=False,
    title_text="Salary Comparisons by Job Title",
    title_x=0.5
)
fig.update_xaxes(tickangle=45)
fig.show()

# Salary And State 

In [8]:
state_avg_salary = df.groupby("State")["Average Salary"].mean().sort_values(ascending=False)
fig = go.Figure(
    data=[
        go.Bar(
            x=state_avg_salary.index,
            y=state_avg_salary.values,
            marker_color="blue",
            name="Average Salary"
        )
    ]
)
fig.update_layout(
    title="Average Salary by State",
    xaxis_title="State",
    yaxis_title="Average Salary",
    height=500,
    width=800,
    title_x=0.5
)
fig.update_xaxes(tickangle=45)
fig.show()

# Number of Jobs and States

In [14]:
temp = df[df["State"] != "United States"]
grouped_data = temp.groupby("State")["Average Salary"].count().sort_values(ascending=False)
fig = go.Figure(
    data=[
        go.Scatter(
            x=grouped_data.index,
            y=grouped_data.values,
            mode="lines+markers",
            line=dict(color="black"),
            marker=dict(symbol="circle"),
            name="Job Rate"
        )
    ]
)
fig.update_layout(
    title="Job Rate",
    xaxis_title="State",
    yaxis_title="Count",
    height=500,
    width=1000,
    title_x=0.5
)
fig.update_xaxes(tickangle=90)
fig.show()

# Experience And Salary

In [10]:
temp = df[df["Experience"] != -1]
grouped_data = temp.groupby("Experience")["Average Salary"].mean()
fig = go.Figure(
    data=[
        go.Bar(
            x=grouped_data.index,
            y=grouped_data.values,
            marker_color="blue",
            marker_line_color="black",
            marker_line_width=1
        )
    ]
)
fig.update_layout(
    title="Average Salary by Experience",
    xaxis_title="Experience",
    yaxis_title="Average Salary",
    height=500,
    width=800,
    title_x=0.5
)
fig.show()


# Grade demand??

In [11]:
NoGrade = len(df[df["Grade Required"] == 0])
Grade = len(df) - NoGrade
fig = go.Figure(
    data=[
        go.Pie(
            labels=["No Grade", "Grade Required"],
            values=[NoGrade, Grade],
            textinfo="label+percent",  # Displays labels and percentages
        )
    ]
)
fig.update_layout(
    title="Distribution of Grade Required vs No Grade for Job",
    title_x=0.5
)
fig.show()

# Age and Experience

In [19]:
import numpy as np
temp = df[(df["Experience"] != -1) & (df["Company Age"] != -1)]
hist, x_edges, y_edges = np.histogram2d(temp["Experience"], temp["Company Age"], bins=10)

fig = go.Figure(
    data=go.Heatmap(
        z=hist.T,
        x=x_edges,
        y=y_edges,
        colorscale="Blues",
        colorbar=dict(title="Count")
    )
)

fig.update_layout(
    height=600,
    width=600,
    title="2D Histogram of Experience vs Company Age",
    xaxis_title="Experience (Years)",
    yaxis_title="Company Age (Years)",
    title_x=0.5
)

fig.show()


# Age and Salary

In [24]:
temp = df[df["Company Age"] != -1]
bins = 10
temp['Company Age Binned'] = pd.cut(temp['Company Age'], bins=bins)
grouped_data = temp.groupby('Company Age Binned')['Average Salary'].mean()
fig = go.Figure(
    data=[
        go.Bar(
            x=grouped_data.index.astype(str),
            y=grouped_data.values,
            marker_color="gray"
        )
    ]
)

fig.update_layout(
    title="Average Salary by Binned Company Age",
    xaxis_title="Company Age (Binned)",
    yaxis_title="Average Salary",
    title_x=0.5,
    height=600,
    width=700,
    xaxis_tickangle=90,
    xaxis_tickfont_size=8
)
fig.show()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy





# Seniority and Salary

In [25]:
temp = df[df["Seniority"] != "Not Applicable"]
grouped_data = temp.groupby("Seniority")["Average Salary"].mean().sort_values()

fig = go.Figure(
    data=[
        go.Bar(
            x=grouped_data.index,
            y=grouped_data.values,
            marker_color="blue"
        )
    ]
)
fig.update_layout(
    title="Seniority and Salary",
    xaxis_title="Seniority",
    yaxis_title="Average Salary",
    height=600,
    width=700,
    title_x=0.5
)
fig.show()

# Seniority and Size

In [None]:
df["Size"].unique()

array(['51 to 200 Employees', '10000+ Employees',
       '1001 to 5000 Employees', '1 to 50 Employees', 'Unknown',
       '501 to 1000 Employees', '201 to 500 Employees', '-1',
       '5001 to 10000 Employees'], dtype=object)

In [30]:
temp = df[(df["Seniority"] != "Not Applicable") & (df["Seniority"] != "Not Unknown") & (df["Seniority"] != "-1")& (df["Size"] != "-1")]
counts = pd.crosstab(temp['Seniority'], temp['Size'])

fig = go.Figure(
    data=go.Heatmap(
        z=counts.values,
        x=counts.columns,
        y=counts.index,
        colorscale='Blues',
        colorbar=dict(title='Count'),
        showscale=True
    )
)

fig.update_layout(
    title="Distribution of Seniority vs Company Size",
    xaxis_title="Company Size",
    yaxis_title="Seniority",
    height=500,
    width=700,
    title_x=0.5,
    xaxis=dict(tickangle=45)
)
fig.show()


# Skills

In [31]:
top_skills=pd.read_csv('top_skills.csv')#already done
fig = go.Figure(
    data=go.Bar(
        x=top_skills["Feature"],
        y=top_skills["TF-IDF Score"],
        marker_color='blue'
    )
)

fig.update_layout(
    title="Top Skills",
    xaxis_title="Skills",
    yaxis_title="Average TF-IDF Score",
    height=600,
    width=700,
    xaxis_tickangle=70,
    title_x=0.5
)
fig.show()

# Job posting frequency with respect to company size

In [33]:
temp=df[(df["Rating"]!=-1) & ((df["Size"]!=-1) &(df["Size"]!="Unknown") )]

In [34]:
temp=temp.groupby("Size")["Rating"].count()

In [None]:
dict(temp)

{'1 to 50 Employees': 96,
 '10000+ Employees': 3424,
 '1001 to 5000 Employees': 856,
 '201 to 500 Employees': 224,
 '5001 to 10000 Employees': 480,
 '501 to 1000 Employees': 248,
 '51 to 200 Employees': 192}

In [37]:
temp_dict = dict(temp)
fig = go.Figure(
    data=go.Bar(
        x=list(temp_dict.keys()),
        y=list(temp_dict.values()),
        marker_color='orange'
    )
)
fig.update_layout(
    title="Size to Job Posting Frequency",
    xaxis_title="Company Size",
    yaxis_title="Job Posting Frequency",
    height=700,
    width=600,
    title_x=0.5,
    xaxis_tickangle=90
)
fig.show()


# Most Job offering Companies

In [38]:
dict(df["Company Name"].value_counts().sort_values(ascending=False).head(15))

{'Microsoft': 248,
 'Apple': 232,
 'Amazon.com Services LLC': 200,
 'Google': 160,
 'TikTok': 152,
 'JPMorgan Chase & Co': 120,
 'Tesla': 80,
 'Snapchat': 80,
 'Adobe': 72,
 'Hewlett Packard': 72,
 'Meta': 64,
 'Uber': 56,
 'Amex': 48,
 'Salesforce': 48,
 'Pinterest': 40}

In [39]:
temp=dict(df["Company Name"].value_counts().sort_values(ascending=False).head(15))

In [40]:
fig = go.Figure(
    data=go.Bar(
        x=list(temp.keys()),  # Company names
        y=list(temp.values()),  # Job posting frequency
        marker_color='green'
    )
)
fig.update_layout(
    title="Top 15 Job Offering Companies",
    xaxis_title="Company Name",
    yaxis_title="Job Posting Frequency",
    height=600,
    width=700,
    title_x=0.5,
    xaxis_tickangle=90
)
fig.show()