In [None]:
from pyspark.sql import *;
from pyspark.sql.functions import *;
from pyspark.sql.window import *;
import pandas as pd;

In [None]:
# Create the Spark session (or reuse one that is already running) under the
# application name "Q3"; every later cell reads through this `spark` handle.
spark = (
    SparkSession.builder
    .appName("Q3")
    .getOrCreate()
)

In [None]:
# Load the job postings CSV: the first row supplies the column names and
# Spark infers each column's type from the data.
df = spark.read.csv(
    "Cleaned_DS_Jobs.csv",
    header=True,
    inferSchema=True,
)

# Print the inferred schema, then preview the leading rows.
df.printSchema()
df.show()

root
 |-- Job Title: string (nullable = true)
 |-- Salary Estimate: string (nullable = true)
 |-- Rating: double (nullable = true)
 |-- Location: string (nullable = true)
 |-- Size: string (nullable = true)
 |-- Type of ownership: string (nullable = true)
 |-- Industry: string (nullable = true)
 |-- Sector: string (nullable = true)
 |-- job_state: string (nullable = true)
 |-- company_age: integer (nullable = true)
 |-- python: integer (nullable = true)
 |-- spark: integer (nullable = true)
 |-- tableau: integer (nullable = true)

+--------------------+---------------+------+-----------------+--------------------+--------------------+--------------------+--------------------+---------+-----------+------+-----+-------+
|           Job Title|Salary Estimate|Rating|         Location|                Size|   Type of ownership|            Industry|              Sector|job_state|company_age|python|spark|tableau|
+--------------------+---------------+------+-----------------+------------------

In [None]:
# For every column, count the rows whose value is null or NaN.
# `when` without `otherwise` yields null for non-matching rows, and `count`
# skips nulls, so only the missing entries are tallied.
missing_counts = [
    count(when(isnull(name) | isnan(name), name)).alias(name)
    for name in df.columns
]
df.select(missing_counts).show()

+---------+---------------+------+--------+----+-----------------+--------+------+---------+-----------+------+-----+-------+
|Job Title|Salary Estimate|Rating|Location|Size|Type of ownership|Industry|Sector|job_state|company_age|python|spark|tableau|
+---------+---------------+------+--------+----+-----------------+--------+------+---------+-----------+------+-----+-------+
|        0|              0|     0|       0|  27|               27|      71|    71|        0|          0|     0|    0|      0|
+---------+---------------+------+--------+----+-----------------+--------+------+---------+-----------+------+-----+-------+



In [None]:
# Derive integer salary bounds from the "low-high" text in "Salary Estimate".
#
# Values in this dataset carry trailing whitespace (e.g. "137-171 "), so each
# piece is trimmed explicitly before the cast rather than relying on the
# string-to-int cast to tolerate padding; Spark releases before 3.0 return
# null for padded input, which would silently blank out max_salary.
# The split expression is built once and reused for both bounds.
salary_parts = split(col("Salary Estimate"), "-")
df = (
    df
    .withColumn("min_salary", trim(salary_parts[0]).cast("int"))
    .withColumn("max_salary", trim(salary_parts[1]).cast("int"))
)
df.show()

+--------------------+---------------+------+-----------------+--------------------+--------------------+--------------------+--------------------+---------+-----------+------+-----+-------+----------+----------+
|           Job Title|Salary Estimate|Rating|         Location|                Size|   Type of ownership|            Industry|              Sector|job_state|company_age|python|spark|tableau|min_salary|max_salary|
+--------------------+---------------+------+-----------------+--------------------+--------------------+--------------------+--------------------+---------+-----------+------+-----+-------+----------+----------+
|   Sr Data Scientist|       137-171 |   3.1|     New York, NY|1001 to 5000 empl...|Nonprofit Organiz...|  Insurance Carriers|           Insurance|       NY|         27|     0|    0|      0|       137|       171|
|      Data Scientist|       137-171 |   4.2|    Chantilly, VA|5001 to 10000 emp...|    Company - Public|Research & Develo...|   Business Services| 

In [None]:
# Midpoint of the salary range: (min_salary + max_salary) / 2.
# Division by 2 produces a fractional (double) column, e.g. 154.0.
df = df.withColumn(
    "average_salary",
    (df["min_salary"] + df["max_salary"]) / 2,
)
df.show()

+--------------------+---------------+------+-----------------+--------------------+--------------------+--------------------+--------------------+---------+-----------+------+-----+-------+----------+----------+--------------+
|           Job Title|Salary Estimate|Rating|         Location|                Size|   Type of ownership|            Industry|              Sector|job_state|company_age|python|spark|tableau|min_salary|max_salary|average_salary|
+--------------------+---------------+------+-----------------+--------------------+--------------------+--------------------+--------------------+---------+-----------+------+-----+-------+----------+----------+--------------+
|   Sr Data Scientist|       137-171 |   3.1|     New York, NY|1001 to 5000 empl...|Nonprofit Organiz...|  Insurance Carriers|           Insurance|       NY|         27|     0|    0|      0|       137|       171|         154.0|
|      Data Scientist|       137-171 |   4.2|    Chantilly, VA|5001 to 10000 emp...|    

In [None]:
# Ratings recorded as -1 or 0 are overwritten with 1; every other rating
# (including null, which matches neither value) passes through unchanged.
rating_fixed = when(col("Rating").isin(-1, 0), 1).otherwise(col("Rating"))
df = df.withColumn("Rating", rating_fixed)
df.show()

+--------------------+---------------+------+-----------------+--------------------+--------------------+--------------------+--------------------+---------+-----------+------+-----+-------+----------+----------+--------------+
|           Job Title|Salary Estimate|Rating|         Location|                Size|   Type of ownership|            Industry|              Sector|job_state|company_age|python|spark|tableau|min_salary|max_salary|average_salary|
+--------------------+---------------+------+-----------------+--------------------+--------------------+--------------------+--------------------+---------+-----------+------+-----+-------+----------+----------+--------------+
|   Sr Data Scientist|       137-171 |   3.1|     New York, NY|1001 to 5000 empl...|Nonprofit Organiz...|  Insurance Carriers|           Insurance|       NY|         27|     0|    0|      0|       137|       171|         154.0|
|      Data Scientist|       137-171 |   4.2|    Chantilly, VA|5001 to 10000 emp...|    

In [None]:
# Columns that reported missing values in the earlier per-column count.
null_columns = ["Size","Type of ownership","Industry","Sector"]

# Fill all of them in a single call by mapping each column name to the
# sentinel -1 (shown as "-1" in these string-typed columns).
df = df.fillna({name: -1 for name in null_columns})

# Re-run the null/NaN count to confirm nothing is missing any more.
remaining_missing = [
    count(when(isnull(name) | isnan(name), name)).alias(name)
    for name in df.columns
]
df.select(remaining_missing).show()

+---------+---------------+------+--------+----+-----------------+--------+------+---------+-----------+------+-----+-------+----------+----------+--------------+
|Job Title|Salary Estimate|Rating|Location|Size|Type of ownership|Industry|Sector|job_state|company_age|python|spark|tableau|min_salary|max_salary|average_salary|
+---------+---------------+------+--------+----+-----------------+--------+------+---------+-----------+------+-----+-------+----------+----------+--------------+
|        0|              0|     0|       0|   0|                0|       0|     0|        0|          0|     0|    0|      0|         0|         0|             0|
+---------+---------------+------+--------+----+-----------------+--------+------+---------+-----------+------+-----+-------+----------+----------+--------------+



In [None]:
# Mean of the per-posting salary midpoint for each job title, rounded to two
# decimals. `round` and `avg` here are the Spark SQL column functions brought
# in by the wildcard import, not Python built-ins.
mean_salary_by_title = round(avg("average_salary"), 2).alias("Average Salary")
average_salary = (
    df
    .groupBy("Job Title")
    .agg(mean_salary_by_title)
)
average_salary.show()

+--------------------+--------------+
|           Job Title|Average Salary|
+--------------------+--------------+
|Senior Data Scien...|         99.33|
|Clinical Data Ana...|         164.5|
|Senior Business I...|          90.0|
|Data Analyst/Engi...|         115.5|
|Staff BI and Data...|         107.0|
|Intelligence Data...|         90.75|
|Report Writer-Dat...|          92.5|
|Hydrogen/Tritium ...|         148.0|
|Business Intellig...|        109.25|
|        Data Modeler|         154.0|
|Scientist / Group...|         197.5|
|Senior Research S...|         105.0|
|Software Engineer...|         164.5|
|   Sr Data Scientist|        126.75|
|COMPUTER SCIENTIS...|         271.5|
|Data Scientist/Ma...|         125.5|
|Data Scientist - ...|         120.5|
|  Decision Scientist|          94.5|
|Data Scientist - ...|         97.75|
|Data Scientist / ...|         128.5|
+--------------------+--------------+
only showing top 20 rows



In [None]:
# Mean of the per-posting salary midpoint for each company-size bucket,
# rounded to two decimals. The "-1" bucket holds rows whose Size was
# originally null and was filled with the sentinel.
mean_salary_by_size = round(avg("average_salary"), 2).alias("Average Salary")
company_average = (
    df
    .groupBy("Size")
    .agg(mean_salary_by_size)
)
company_average.show()

+--------------------+--------------+
|                Size|Average Salary|
+--------------------+--------------+
|                  -1|         130.8|
|5001 to 10000 emp...|        126.66|
|             Unknown|        143.38|
| 51 to 200 employees|        127.83|
|1001 to 5000 empl...|        121.75|
|501 to 1000 emplo...|        120.94|
|201 to 500 employees|         119.2|
|    10000+ employees|         122.8|
|   1 to 50 employees|         120.0|
+--------------------+--------------+

