 Analysis of source data


In [2]:
# Environment check: print the path of spark-submit (if on PATH) and the value of SPARK_HOME.
!which spark-submit
!echo $SPARK_HOME

/opt/spark


In [3]:
import findspark
# Must run before `import pyspark`: findspark.init() locates the Spark
# installation and makes the pyspark package importable in this kernel.
findspark.init()
import pyspark
# Record the PySpark version this notebook was executed with.
print(pyspark.__version__)

2.4.5


In [1]:
from datetime import datetime

from pyspark.sql import SparkSession
from pyspark.sql import DataFrame, Window
from pyspark.sql.functions import *
from pyspark.sql.types import *

ModuleNotFoundError: No module named 'pyspark'

In [None]:

    
# Start (or reuse) the Spark session used by every cell below.
spark = (
    SparkSession.builder
    .appName("Newyork_jobs_analysis")
    .getOrCreate()
)

# Load the raw job postings: the header row supplies the column names and
# Spark infers the column types from the data.
# NOTE(review): if free-text columns contain embedded newlines or quotes the
# reader may need the multiLine/escape options -- confirm against the file.
df = spark.read.csv("/dataset/nyc-jobs.csv", header=True, inferSchema=True)
df.printSchema()
df.show(5)
# describe() only builds a lazy summary DataFrame; without show() the cell
# would display just its repr instead of the actual statistics.
df.describe().show()
                

DATA CLEANING FUNCTIONS

In [None]:
def clean_columns(df):
    """Return ``df`` with every column name lower-cased and spaces replaced by ``_``.

    Only the column names change; the data and the column order are untouched.
    """
    normalized_names = [name.lower().replace(" ", "_") for name in df.columns]
    # toDF renames all columns positionally in a single projection.
    return df.toDF(*normalized_names)

# Normalize the column names (lower case, spaces -> underscores) so later cells
# can reference them as e.g. "job_category" and "salary_range_from".
df = clean_columns(df)

PROCESSING DATE FUNCTION:

In [None]:
def process_dates(df):
    """Parse ``posting_date`` and derive ``year`` and ``month`` columns from it.

    ``posting_date`` is parsed with the ``MM/dd/yyyy`` pattern and replaced by
    the resulting date; ``year`` and ``month`` are extracted from the parsed
    value. Returns the DataFrame with these three columns set.
    """
    parsed_posting_date = to_date(col("posting_date"), "MM/dd/yyyy")
    return (
        df.withColumn("posting_date", parsed_posting_date)
          .withColumn("year", year(col("posting_date")))
          .withColumn("month", month(col("posting_date")))
    )


df = process_dates(df)

Average salary calculation

In [None]:
def process_salary(df):
    """Add an ``avg_salary`` column holding the midpoint of the salary range.

    avg_salary = (salary_range_from + salary_range_to) / 2

    Returns the input DataFrame with the extra column appended.

    NOTE(review): the raw data also carries a "Salary Frequency" column; the
    midpoint is not normalized by it -- confirm whether rates with different
    frequencies should be made comparable before aggregating.
    """
    range_low = col("salary_range_from")
    range_high = col("salary_range_to")
    midpoint = (range_low + range_high) / 2
    return df.withColumn("avg_salary", midpoint)


df = process_salary(df)

KPI 1  -- NUMBER OF JOBS PER CATEGORY

In [None]:
# Count postings per job category and keep the ten largest categories.
top_jobs_category = (
    df.groupBy("job_category")
      .count()
      .sort(col("count").desc())
      .limit(10)
)

top_jobs_category.show()

KPI 2 -- SALARY DISTRIBUTION PER JOB CATEGORY

In [None]:
# Mean of avg_salary per job category, highest-paying categories first.
# The grouping column is "job_category" (the same column KPI 1 groups by);
# the previous spelling "job_categrory" does not exist in the DataFrame.
salary_by_category = (
    df.groupBy("job_category")
      .agg(avg("avg_salary").alias("avg_salary"))
      .orderBy(desc("avg_salary"))
)
# Backward-compatible alias for the original (misspelled) variable name.
salary_by_cateogry = salary_by_category
salary_by_category.show()

KPI 3 -- Correlation Between Degree & Salary

In [None]:
def extract_degree(column="minimum_qual_requirements"):
    """Build a Column expression classifying the highest degree mentioned.

    The text of ``column`` is lower-cased and searched for the keywords
    "phd", "master" and "bachelor" (in that priority order); rows matching
    none of them -- including rows where the text is null -- are labelled
    "No Degree".

    Args:
        column: Name of the free-text qualifications column. Defaults to
            "minimum_qual_requirements", which is what the raw header
            "Minimum Qual Requirements" becomes after column-name cleaning.

    Returns:
        A Column yielding "PhD", "Masters", "Bachelors" or "No Degree".

    NOTE(review): postings may phrase degrees differently (e.g. "Ph.D.",
    "baccalaureate") -- confirm keyword coverage against the data.
    """
    qualifications = lower(col(column))
    return (
        when(qualifications.contains("phd"), "PhD")
        .when(qualifications.contains("master"), "Masters")
        .when(qualifications.contains("bachelor"), "Bachelors")
        .otherwise("No Degree")
    )

# Label each posting with the highest degree its qualifications text mentions.
df = df.withColumn("degree_level", extract_degree())

# Ordinal encoding of the label: PhD=3, Masters=2, Bachelors=1, anything else=0.
degree_label = col("degree_level")
degree_rank = (
    when(degree_label == "PhD", 3)
    .when(degree_label == "Masters", 2)
    .when(degree_label == "Bachelors", 1)
    .otherwise(0)
)
df = df.withColumn("degree_encoded", degree_rank)

# Correlation coefficient between the encoded degree level and the average salary.
df.corr("degree_encoded", "avg_salary")

KPI 4 -- job posting having the highest salary per agency

In [None]:
# Window over each agency's postings, ordered from highest to lowest avg_salary.
# This must be the Window class from pyspark.sql: the lowercase `window` exported
# by pyspark.sql.functions is the time-bucketing function and has no partitionBy.
w = Window.partitionBy("agency").orderBy(desc("avg_salary"))

# Keep the top-ranked posting of every agency. row_number() assigns distinct
# ranks, so exactly one row per agency survives even when salaries tie.
highest_salary_per_agency = (
    df.withColumn("rank", row_number().over(w))
      .filter(col("rank") == 1)
      .select("agency", "business_title", "avg_salary")
)

highest_salary_per_agency.show()

KPI 5 -- Average salary of job postings per agency for the last 2 years

In [None]:
# Cut off exactly 24 months before today, compared against the posting date
# itself. Comparing calendar years (year >= this_year - 2) would admit anything
# from 1 January two years back, i.e. a window of between two and three years.
cutoff_date = add_months(current_date(), -24)

df_recent = df.filter(col("posting_date") >= cutoff_date)

# Mean of avg_salary per agency over the recent postings, highest first.
avg_salary_last_2_years = (
    df_recent.groupBy("agency")
             .agg(avg("avg_salary").alias("avg_salary"))
             .orderBy(desc("avg_salary"))
)

avg_salary_last_2_years.show()

KPI 6 -- highest paid skills in the US market

In [None]:
# This KPI can be derived with regular expressions. The data has no dedicated
# skills column, so skills would be extracted from two columns -- preferred skills and job description -- by splitting and exploding their values.

FEATURE REMOVAL

In [None]:
# The raw salary bounds are dropped from the final output.
columns_to_drop = ["salary_range_from", "salary_range_to"]

# drop() takes the names as separate arguments, hence the unpacking.
df_final = df.drop(*columns_to_drop)

WRITE TO FINAL TABLE

In [None]:
# Persist the processed postings as Parquet, replacing any previous output.
df_final.write.parquet("nyc_jobs_processed", mode="overwrite")

In [None]:
# Re-read the raw CSV with its original header names and no type inference.
# This rebinds `df`, replacing the processed DataFrame built in the cells above.
df = spark.read.option("header", True).csv("/dataset/nyc-jobs.csv")
df.printSchema()

root
 |-- Job ID: string (nullable = true)
 |-- Agency: string (nullable = true)
 |-- Posting Type: string (nullable = true)
 |-- # Of Positions: string (nullable = true)
 |-- Business Title: string (nullable = true)
 |-- Civil Service Title: string (nullable = true)
 |-- Title Code No: string (nullable = true)
 |-- Level: string (nullable = true)
 |-- Job Category: string (nullable = true)
 |-- Full-Time/Part-Time indicator: string (nullable = true)
 |-- Salary Range From: string (nullable = true)
 |-- Salary Range To: string (nullable = true)
 |-- Salary Frequency: string (nullable = true)
 |-- Work Location: string (nullable = true)
 |-- Division/Work Unit: string (nullable = true)
 |-- Job Description: string (nullable = true)
 |-- Minimum Qual Requirements: string (nullable = true)
 |-- Preferred Skills: string (nullable = true)
 |-- Additional Information: string (nullable = true)
 |-- To Apply: string (nullable = true)
 |-- Hours/Shift: string (nullable = true)
 |-- Work Locatio

### Sample function

In [48]:
def get_salary_frequency(df: "DataFrame", column: str = 'Salary Frequency') -> list:
    """Return the distinct values of ``column`` as a deterministically ordered list.

    Args:
        df: DataFrame containing ``column``.
        column: Column whose distinct values are collected; defaults to the
            raw 'Salary Frequency' column.

    Returns:
        The distinct values sorted ascending, with a null value (if present)
        placed last. An empty DataFrame yields an empty list.
    """
    row_list = df.select(column).distinct().collect()
    values = [row[column] for row in row_list]
    # distinct() gives no ordering guarantee, so sort to make the result stable;
    # the key keeps None out of value comparisons and pushes it to the end.
    return sorted(values, key=lambda value: (value is None, "" if value is None else value))

### Example of test function

In [65]:
# Fixture rows as (id, salary frequency) tuples, matching the test's default schema.
mock_data = [('A', 'Annual'), ('B', 'Daily')]
# Distinct salary-frequency values expected back for mock_data.
expected_result = ['Annual', 'Daily']

In [66]:
def test_get_salary_frequency(mock_data: list,
                              expected_result: list,
                              schema: list = ['id', 'Salary Frequency']):
    """Check get_salary_frequency against a small in-memory DataFrame.

    Args:
        mock_data: Rows used to build the mock DataFrame.
        expected_result: Distinct salary-frequency values expected back.
        schema: Column names for the mock DataFrame.

    Raises:
        AssertionError: If the returned values differ from ``expected_result``.

    Relies on the notebook-level ``spark`` session.
    """
    # Pass a copy so the shared default list can never be mutated downstream.
    mock_df = spark.createDataFrame(data=mock_data, schema=list(schema))
    actual = get_salary_frequency(mock_df)
    # Distinct values come back in no guaranteed order, so compare sorted lists
    # (this still catches missing, extra or duplicated values).
    assert sorted(actual) == sorted(expected_result), (
        f"expected {sorted(expected_result)}, got {sorted(actual)}"
    )