### ANALYSIS OF GLASSDOOR DATA SCIENCE JOB POSTINGS
* The dataset used is [Data Science Job Posting on Glassdoor](https://www.kaggle.com/datasets/rashikrahmanpritom/data-science-job-posting-on-glassdoor), uploaded by Rashik Rahman on Kaggle.

In [1]:
# IMPORTING LIBRARIES
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# LOAD THE DATASET
# Read the cleaned Glassdoor postings CSV (path is relative to the notebook's
# working directory) and preview the first five rows.
df = pd.read_csv(
    filepath_or_buffer="../input/data-science-job-posting-on-glassdoor/Cleaned_DS_Jobs.csv"
)
df.head(n=5)

In [3]:
# Function for getting unique col counts
def get_unique_col_count(col, data=None):
    """Return a sentence stating how many distinct values a column holds.

    Parameters
    ----------
    col : hashable
        Label of the column to inspect. Non-string labels (e.g. integers)
        are accepted; the label is rendered with its ``str()`` form.
    data : pandas.DataFrame, optional
        Frame to inspect. When omitted, the notebook-level ``df`` is used,
        which keeps every existing one-argument call working unchanged.

    Returns
    -------
    str
        ``"There are <n> unique values for <col>"`` where ``<n>`` is
        ``nunique()`` of the column (missing values are not counted).
    """
    # Fall back to the global frame only when no frame is supplied, so the
    # helper can be reused (and tested) without relying on kernel state.
    frame = df if data is None else data
    return f"There are {frame[col].nunique()} unique values for {col}"

In [4]:
# Print a concise summary of the frame (columns, non-null counts, dtypes).
df.info()

In [5]:
# Get an idea of the number of unique values
# One row per column of df: the column's name and its distinct-value count.
uniques = df.nunique().reset_index()
uniques.columns = ['Column', 'Unique Values']

# Apply the theme BEFORE creating the figure: axes pick up style settings
# (grid, background) at creation time, so setting the theme afterwards would
# leave this first figure unstyled while still affecting every later plot.
sns.set_theme(style="whitegrid")
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(15, 5))

# Draw on the axes we just created instead of relying on the implicit
# "current axes".
sns.barplot(y='Column', x='Unique Values', data=uniques, ax=ax)
plt.xticks(rotation=45);

In [6]:
# UNIVARIATE ANALYSIS

# Job Title: report how many distinct titles appear in the postings.
print(get_unique_col_count(col='Job Title'))

In [7]:
# Salary Estimate: distinct-value count, then how often each value occurs.
print(get_unique_col_count(col='Salary Estimate'))

# A 1x1 grid is the subplots default, so only the canvas size is specified.
fig, ax = plt.subplots(figsize=(15, 5))
sns.histplot(data=df['Salary Estimate'], ax=ax);
plt.xticks(rotation=45);

In [8]:
# Job Description: report how many distinct descriptions appear.
print(get_unique_col_count(col='Job Description'))

In [9]:
# Rating: distinct-value count, then a histogram with a KDE curve overlaid.
print(get_unique_col_count(col='Rating'))
sns.histplot(data=df['Rating'], kde=True);

In [10]:
# Company Name: report how many distinct companies appear.
print(get_unique_col_count(col='Company Name'))

In [15]:
# Location
print(get_unique_col_count('Location'))

# Split the state from the rest of the text.
# Split on the LAST ", " only, so a comma inside the leading part stays with
# the city. `n` must be passed by keyword: pandas 2.0 made every argument of
# str.split/str.rsplit after `pat` keyword-only, so rsplit(", ", 1) raises
# TypeError there.
split_data = df["Location"].str.rsplit(", ", n=1)
data = split_data.to_list()

# Each split result becomes one row: first piece -> city, second -> state.
locations = pd.DataFrame(data, columns=["city", "state"])
print(str(locations['state'].nunique()) + " states and " + str(locations['city'].nunique()) + " cities are represented")

# Bars ordered from the most to the least frequent state; drawn explicitly on
# the axes created here rather than on the implicit current axes.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(14, 8))
sns.countplot(x='state', data=locations, order=locations['state'].value_counts().index, ax=ax);
ax.set(title='Job listings by state');

In [12]:
# Count listings per (city, state) pair, then keep only the pairs that occur
# more than once.
dupe_rows = (
    locations
    .groupby(list(locations.columns), as_index=False)
    .size()
    .loc[lambda counts: counts['size'] > 1]
)

# The median below is taken over the repeated pairs only.
print(f"{len(dupe_rows)} cities have more than one listing, with the median count per city being {dupe_rows['size'].median()}")

In [13]:
# Headquarters: report how many distinct headquarters values appear.
print(get_unique_col_count(col='Headquarters'))

In [16]:
# Size
print(get_unique_col_count('Size'))

# Horizontal bars, one per company-size value, most frequent first. The plot
# is bound to an explicit axes and titled so the figure stands on its own.
fig, ax = plt.subplots()
sns.countplot(y='Size', data=df, order=df['Size'].value_counts().index, ax=ax);
ax.set(title='Job listings by company size');

In [17]:
# Type of ownership: distinct-value count, then listings per ownership type
# with the most frequent type at the top.
print(get_unique_col_count(col='Type of ownership'))
sns.countplot(
    data=df,
    y='Type of ownership',
    order=df['Type of ownership'].value_counts().index,
);

In [18]:
# Industry: distinct-value count, then listings per industry, most frequent
# first, on a tall canvas so every label gets its own row.
print(get_unique_col_count(col='Industry'))
fig, ax = plt.subplots(figsize=(6, 14))
sns.countplot(
    data=df,
    y='Industry',
    order=df['Industry'].value_counts().index,
    ax=ax,
);

In [19]:
# Sector: distinct-value count, then listings per sector, most frequent first.
print(get_unique_col_count(col='Sector'))
sns.countplot(
    data=df,
    y='Sector',
    order=df['Sector'].value_counts().index,
);

In [20]:
# Revenue: distinct-value count, then listings per revenue value, most
# frequent first.
print(get_unique_col_count(col='Revenue'))
sns.countplot(
    data=df,
    y='Revenue',
    order=df['Revenue'].value_counts().index,
);

In [21]:
# min_salary, avg_salary and max_salary
print(get_unique_col_count('min_salary'))
print(get_unique_col_count('avg_salary'))
print(get_unique_col_count('max_salary'))

fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(15, 5))

# Plot every posting's salary, not just the distinct values: calling
# .unique() first drops how often each salary occurs, so the median,
# quartiles and outliers drawn would not describe the actual distribution.
sns.boxplot(y=df['min_salary'], ax=ax[0]);
sns.boxplot(y=df['avg_salary'], ax=ax[1]);
sns.boxplot(y=df['max_salary'], ax=ax[2]);

ax[0].set(title='Min Salary');
ax[1].set(title='Avg Salary');
ax[2].set(title='Max Salary');

In [23]:
# Job_State: distinct-value count, then listings per state, most frequent
# first.
print(get_unique_col_count(col='job_state'))
fig, ax = plt.subplots(figsize=(14, 6))
sns.countplot(
    data=df,
    x='job_state',
    order=df['job_state'].value_counts().index,
    ax=ax,
);
ax.set_title('Jobs by State');

In [24]:
# same_state: distinct-value count, then the distribution of its values.
print(get_unique_col_count(col='same_state'))
sns.displot(df['same_state']);

In [25]:
# company_age: distinct-value count, then a histogram with a KDE curve.
print(get_unique_col_count(col='company_age'))
sns.displot(df['company_age'], kde=True);

In [27]:
# excel, hadoop, spark, aws, tableau, big_data
# One histogram panel per skill column, laid out side by side in list order.
skill_columns = ['excel', 'hadoop', 'spark', 'aws', 'tableau', 'big_data']

fig, ax = plt.subplots(nrows=1, ncols=len(skill_columns), figsize=(14, 4))
for panel, skill in zip(ax, skill_columns):
    sns.histplot(data=df[skill], ax=panel)
fig.tight_layout()

In [28]:
# job_simp
print(get_unique_col_count('job_simp'))

# Plain counts only, no KDE overlay.
# NOTE(review): job_simp presumably holds job-title category labels (TODO
# confirm dtype). A kernel density estimate over such labels is fitted to
# their arbitrary axis positions and implies an ordering and continuity the
# data does not have, so it is dropped here.
sns.displot(data=df['job_simp']);
plt.xticks(rotation=45);

In [29]:
# seniority: distribution of the seniority values across postings.
sns.displot(data=df['seniority']);

## Multivariate analysis

In [30]:
#Multivariate Analysis

# Look at the correlations between features
# Restrict to numeric/boolean columns first: from pandas 2.0 on, DataFrame.corr
# no longer drops non-numeric columns silently and raises ValueError on the
# text columns in this frame. select_dtypes works on older pandas too, where
# the `numeric_only` keyword of corr does not exist yet.
df.select_dtypes(include=["number", "bool"]).corr()

In [32]:
# Put the correlations into a heatmap to better visualize
fig, ax = plt.subplots(figsize=(12, 12))

# Correlate numeric/boolean columns only: on pandas >= 2.0 a bare df.corr()
# raises ValueError because the frame also contains text columns.
sns.heatmap(
    df.select_dtypes(include=["number", "bool"]).corr(),
    ax=ax,
    annot=True,        # print the coefficient inside every cell
    linewidths=0.05,
    fmt='.2f',
    cmap="magma",
);

In [33]:
# How skewed is each column
# Skewness is only defined for numeric data. From pandas 2.0 on, DataFrame.skew
# no longer skips non-numeric columns by default and raises on the text
# columns in this frame, so select the numeric/boolean columns explicitly.
df.select_dtypes(include=["number", "bool"]).skew()

In [34]:
# Pairwise relationships between the frame's variables, to spot anything that
# stands out.
sns.pairplot(df);