In [1]:
import pyspark
import pandas as pd
import numpy as np
from pydataset import data as pydata
from vega_datasets import data
from pyspark.sql.functions import *

In [2]:
spark = pyspark.sql.SparkSession.builder.getOrCreate()

In [3]:
pd_df = pd.DataFrame(['c++','python','cobol','c#'],
                    columns=(['language']))
pd_df

Unnamed: 0,language
0,c++
1,python
2,cobol
3,c#


In [4]:
sp_df = spark.createDataFrame(pd_df)
sp_df.show()

+--------+
|language|
+--------+
|     c++|
|  python|
|   cobol|
|      c#|
+--------+



In [5]:
sp_df.describe().show()

+-------+--------+
|summary|language|
+-------+--------+
|  count|       4|
|   mean|    null|
| stddev|    null|
|    min|      c#|
|    max|  python|
+-------+--------+



In [6]:
sp_df.printSchema()

root
 |-- language: string (nullable = true)



In [7]:
sp_df.count()

4

In [8]:
mpg = spark.createDataFrame(pydata('mpg'))
mpg.show(5)

+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|        audi|   a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|   a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|   a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|   a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|   a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
only showing top 5 rows



In [9]:
message1 = concat(lit('The '), col('year'), lit(' '),
col('manufacturer'), lit(' '), col('model'), 
lit(' has a '), col('cyl'), lit(" cylinder engine."))

In [10]:
mpg.select(message1.alias('message')).show(5, truncate=False)

+-----------------------------------------+
|message                                  |
+-----------------------------------------+
|The 1999 audi a4 has a 4 cylinder engine.|
|The 1999 audi a4 has a 4 cylinder engine.|
|The 2008 audi a4 has a 4 cylinder engine.|
|The 2008 audi a4 has a 4 cylinder engine.|
|The 1999 audi a4 has a 6 cylinder engine.|
+-----------------------------------------+
only showing top 5 rows



In [12]:
mpg.select(substring_index('trans', '(', 1).alias('tran')).show(5)

+------+
|  tran|
+------+
|  auto|
|manual|
|manual|
|  auto|
|  auto|
+------+
only showing top 5 rows



In [13]:
tips = spark.createDataFrame(pydata('tips'))
tips.show(5)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
+----------+----+------+------+---+------+----+
only showing top 5 rows



In [14]:
tips.where(tips.smoker =='Yes').count() / tips.count()

0.38114754098360654

In [15]:
tips.groupBy('smoker').count().withColumn('percent', 
            concat(round(col('count')/tips.count()*100,0)
                   .cast('int'), lit('%'))).show()

+------+-----+-------+
|smoker|count|percent|
+------+-----+-------+
|    No|  151|    62%|
|   Yes|   93|    38%|
+------+-----+-------+



In [16]:
smoker_prop = tips.groupBy('smoker').count()

smoker_prop.withColumn('percent', col('count')/tips.count()).show()

+------+-----+-------------------+
|smoker|count|            percent|
+------+-----+-------------------+
|    No|  151| 0.6188524590163934|
|   Yes|   93|0.38114754098360654|
+------+-----+-------------------+



In [18]:
tip_percent = round((tips.tip / tips.total_bill), 4)\
                .alias('tip_percent')
tips.withColumn('tip_percent', tip_percent).show(9)

+----------+----+------+------+---+------+----+-----------+
|total_bill| tip|   sex|smoker|day|  time|size|tip_percent|
+----------+----+------+------+---+------+----+-----------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|     0.0594|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|     0.1605|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|     0.1666|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|     0.1398|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|     0.1468|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|     0.1862|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|     0.2281|
|     26.88|3.12|  Male|    No|Sun|Dinner|   4|     0.1161|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|     0.1303|
+----------+----+------+------+---+------+----+-----------+
only showing top 9 rows



In [20]:
tips.groupBy('sex','smoker').agg(round(avg(tip_percent), 4)
                    .alias('tip_percent')).show(5)

+------+------+-----------+
|   sex|smoker|tip_percent|
+------+------+-----------+
|  Male|    No|     0.1607|
|  Male|   Yes|     0.1528|
|Female|    No|     0.1569|
|Female|   Yes|     0.1821|
+------+------+-----------+



In [21]:
weather = data.seattle_weather().assign(date=
                                lambda df: df.date.astype(str))
weather = spark.createDataFrame(weather)
weather.show(6)

+----------+-------------+--------+--------+----+-------+
|      date|precipitation|temp_max|temp_min|wind|weather|
+----------+-------------+--------+--------+----+-------+
|2012-01-01|          0.0|    12.8|     5.0| 4.7|drizzle|
|2012-01-02|         10.9|    10.6|     2.8| 4.5|   rain|
|2012-01-03|          0.8|    11.7|     7.2| 2.3|   rain|
|2012-01-04|         20.3|    12.2|     5.6| 4.7|   rain|
|2012-01-05|          1.3|     8.9|     2.8| 6.1|   rain|
|2012-01-06|          2.5|     4.4|     2.2| 2.2|   rain|
+----------+-------------+--------+--------+----+-------+
only showing top 6 rows



In [40]:
C_max = round((weather.temp_max * 9/5) + 32, 2)
C_min = round((weather.temp_min * 9/5) + 32, 2)
# weather.select(C_max.alias('C_max') , C_min.alias('C_min')).show(5)
weather.withColumn('temp_max', C_max)\
        .withColumn('temp_min', C_min).show()

+----------+-------------+--------+--------+----+-------+
|      date|precipitation|temp_max|temp_min|wind|weather|
+----------+-------------+--------+--------+----+-------+
|2012-01-01|          0.0|   55.04|    41.0| 4.7|drizzle|
|2012-01-02|         10.9|   51.08|   37.04| 4.5|   rain|
|2012-01-03|          0.8|   53.06|   44.96| 2.3|   rain|
|2012-01-04|         20.3|   53.96|   42.08| 4.7|   rain|
|2012-01-05|          1.3|   48.02|   37.04| 6.1|   rain|
|2012-01-06|          2.5|   39.92|   35.96| 2.2|   rain|
|2012-01-07|          0.0|   44.96|   37.04| 2.3|   rain|
|2012-01-08|          0.0|    50.0|   37.04| 2.0|    sun|
|2012-01-09|          4.3|   48.92|    41.0| 3.4|   rain|
|2012-01-10|          1.0|   42.98|   33.08| 3.4|   rain|
|2012-01-11|          0.0|   42.98|   30.02| 5.1|    sun|
|2012-01-12|          0.0|   42.98|   28.94| 1.9|    sun|
|2012-01-13|          0.0|    41.0|   26.96| 1.3|    sun|
|2012-01-14|          4.1|   39.92|   33.08| 5.3|   snow|
|2012-01-15|  

In [42]:
weather.withColumn('month', month('date')).groupBy('month')\
        .agg(sum('precipitation').alias('avg_rain'))\
        .sort('avg_rain', ascending=False).show()

+-----+------------------+
|month|          avg_rain|
+-----+------------------+
|   11|             642.5|
|   12|             622.7|
|    3|             606.2|
|   10|             503.4|
|    1|465.99999999999994|
|    2|             422.0|
|    4|             375.4|
|    9|235.49999999999997|
|    5|             207.5|
|    8|             163.7|
|    6|             132.9|
|    7|              48.2|
+-----+------------------+



In [41]:
weather.withColumn('year', year('date')).groupBy('year')\
        .agg(sum('wind').alias('avg_wind'))\
        .sort('avg_wind', ascending=False).show()

+----+------------------+
|year|          avg_wind|
+----+------------------+
|2012|1244.6999999999998|
|2014|1236.5000000000005|
|2015|            1153.3|
|2013|1100.8000000000002|
+----+------------------+



In [44]:
weather.where(month('date') == 1).select('weather')\
        .groupBy('weather').count().sort('count', ascending=False)\
        .show()

+-------+-----+
|weather|count|
+-------+-----+
|    fog|   38|
|   rain|   35|
|    sun|   33|
|drizzle|   10|
|   snow|    8|
+-------+-----+



In [28]:
weather.show(5)

+----------+-------------+--------+--------+----+-------+
|      date|precipitation|temp_max|temp_min|wind|weather|
+----------+-------------+--------+--------+----+-------+
|2012-01-01|          0.0|    12.8|     5.0| 4.7|drizzle|
|2012-01-02|         10.9|    10.6|     2.8| 4.5|   rain|
|2012-01-03|          0.8|    11.7|     7.2| 2.3|   rain|
|2012-01-04|         20.3|    12.2|     5.6| 4.7|   rain|
|2012-01-05|          1.3|     8.9|     2.8| 6.1|   rain|
+----------+-------------+--------+--------+----+-------+
only showing top 5 rows



In [None]:
# (weather
#     .filter(month('date') == 7
#     .filter(year('date') > 2012
#     .filter(year('date') < 2015)
#     .filter(col('weather') == lit('sun')
#            .agg(avg)))))

In [30]:
q3_2015_rainy_days = \
            weather.where((quarter('date') == 3) & \
                          (year('date') == 2015))\
                   .where(weather.weather =='rain').count()
q3_2015_days = weather.where((month('date').isin([7,8,9])) & \
                             (year('date') == 2015)).count()
q3_2015_rainy_days / q3_2015_days

0.021739130434782608

In [47]:
(weather
    .filter(year('date') == 2015)
    .filter(quarter('date') == 3)
    .select(when(col('weather') == 'rain', 1).otherwise(0)
    .alias('rain'))
    .agg(mean('rain')).show())

+--------------------+
|           avg(rain)|
+--------------------+
|0.021739130434782608|
+--------------------+



In [32]:
weather.where(weather.precipitation > 0).groupBy(year('date'))\
            .count().sort('year(date)').show()

+----------+-----+
|year(date)|count|
+----------+-----+
|      2012|  177|
|      2013|  152|
|      2014|  150|
|      2015|  144|
+----------+-----+



In [49]:
weather.where(weather.precipitation > 0).groupBy(year('date'))\
            .count().sort('year(date)')\
            .withColumn('percent', round(col('count')/365, 3))\
            .show()

+----------+-----+-------+
|year(date)|count|percent|
+----------+-----+-------+
|      2012|  177|  0.485|
|      2013|  152|  0.416|
|      2014|  150|  0.411|
|      2015|  144|  0.395|
+----------+-----+-------+



In [None]:
# (weather
#     .withColumn('rain', when(col('precipitation') > 0, 1)
#     .otherwise(0)))
#     .groupBy(year)

In [34]:
# rainy = weather.when()

In [None]:
# days_rained = weather.where(weather.precipitation > 0)\
#                     .groupBy(F.year('date')).count()
# weather.select(days_rained)