## Spark 101 Exercises
### Corey Solitaire
`11.24.2020`

In [2]:
import pyspark.sql.functions as F
import pyspark
import pandas as pd
import numpy as np

# NOTE: both imports below bind the name `data`; the second one (vega_datasets)
# rebinds it, so pydataset's `data` is no longer reachable under that name.
from pydataset import data
from vega_datasets import data

# NOTE: `sum`, `min`, `max`, and `round` imported here rebind the Python
# builtins of the same names for every cell that runs after this one.
from pyspark.sql.functions import col, expr, concat, sum, avg, min, max, count, mean, round
from pyspark.sql.functions import lit, regexp_extract, regexp_replace, when,asc, desc, month, year, quarter  

***
## Method to create spark session

In [4]:
# getOrCreate() hands back the already-active SparkSession when there is one,
# so re-running this cell does not start a second session.
spark = (
    pyspark.sql.SparkSession
    .builder
    .getOrCreate()
)

***
## Reference Material

In [6]:
# pandas-vs-Spark cheat sheet: one row per task, holding the pandas snippet and
# its Spark counterpart as plain strings (nothing in the table is executed).
#
# The table is assembled in a single DataFrame constructor call.
# `DataFrame.append` was deprecated in pandas 1.4 and removed in pandas 2.0,
# so growing the frame with repeated `.append(...)` calls fails on current pandas.
#
# Each entry is (row label, pandas snippet, spark snippet).
reference_rows = [
    # --- reading files ---
    ('csv',
     'pd.read_csv("myfile.csv")',
     'spark.read.load("myfile.csv", format = "csv", sep = ",")'),
    ('json',
     'pd.read_json("myfile.json")',
     'spark.read.load("myfile.json", format = "json") OR spark.read.json("myfile.json")'),

    # --- inspecting a dataframe ---
    ('1st n rows', 'pd_df.head()', 'sp_df.show(), .head(), .take()'),
    ('1st row', 'pd_df.head(1)', 'sp_df.first()'),
    ('summary statistics', 'pd_df.describe()', 'sp_df.describe()'),
    ('column names', 'pd_df.columns', 'sp_df.columns'),
    ('# rows', 'len(pd_df)', 'sp_df.count()'),
    ('# distinct rows', 'len(pd_df.drop_duplicates())', 'sp_df.distinct().count()'),
    ('df schema info', 'pd_df.info()', 'sp_df.printSchema()'),

    # --- selecting and filtering ---
    ('select columns',
     'pd_df[["col1", "col2"]]',
     'sp_df.select(sp_df.col1, sp_df.col2)'),
    ('conditional filtering',
     'pd_df[pd_df.c1 > 0]',
     'sp_df.filter(df.c1 > 0), sp_df.where(df.c1 > 0)'),

    # --- conditional assignment ---
    # NOTE(review): the two-argument np.where form shown in the next row is not a
    # valid call (numpy requires both or neither of x and y) -- confirm whether
    # the snippet should carry an explicit "else" value.
    ('conditional assigning',
     'np.where(pd_df.c1.array > 0, "positive")',
     'sp_df.select(df.c1, when(df.c1 > 0, "positive").alias("number_sign"))'),
    ('conditional assigning with else',
     'np.where(pd_df.c1.array > 0, "pos", "neg")',
     'sp_df.select(df.c1, when(df.c1 > 0, "pos").otherwise("neg").alias("number_sign"))'),

    # --- sorting ---
    ('sort 1 col asc',
     'pd_df.sort_values(by=["c1"])',
     'sp_df.sort(sp_df.c1)'),
    ('sort 2+ cols asc',
     'pd_df.sort_values(by=["c1","c2"])',
     'sp_df.sort(sp_df.c1, sp_df.c2)'),
    ('sort 2+ cols desc/asc',
     'pd_df.sort_values(by=["c1","c2"], ascending=[False, True])',
     'sp_df.sort(sp_df.c1.desc(), sp_df.c2)'),
    ('sort 2+ cols desc',
     'pd_df.sort_values(by=["c1","c2"], ascending=False)',
     'sp_df.sort(desc("c1"), desc("c2")) OR sp_df.sort(col("c1").desc(), col("c2").desc())'),
]

pd_v_spark = pd.DataFrame(
    [[pandas_snippet, spark_snippet] for _, pandas_snippet, spark_snippet in reference_rows],
    index=[label for label, _, _ in reference_rows],
    columns=['pandas', 'spark'],
)
pd_v_spark

Unnamed: 0,pandas,spark
csv,"pd.read_csv(""myfile.csv"")","spark.read.load(""myfile.csv"", format = ""csv"", sep = "","")"
json,"pd.read_json(""myfile.json"")","spark.read.load(""myfile.json"", format = ""json"") OR spark.read.json(""myfile.json"")"
1st n rows,pd_df.head(),"sp_df.show(), .head(), .take()"
1st row,pd_df.head(1),sp_df.first()
summary statistics,pd_df.describe(),sp_df.describe()
column names,pd_df.columns,sp_df.columns
# rows,len(pd_df),sp_df.count()
# distinct rows,len(pd_df.drop_duplicates()),sp_df.distinct().count()
df schema info,pd_df.info(),sp_df.printSchema()
select columns,"pd_df[[""col1"", ""col2""]]","sp_df.select(sp_df.col1, sp_df.col2)"


***
## 1. Create a spark data frame that contains your favorite programming languages.

### -The name of the column should be language

In [None]:
# Create pandas dataframe by columns using dictionary-like object.
# Each dict key becomes a column; each list holds that column's values in
# row order. Cell values are labelled r<row>c<col> so positions are easy to read.
#
# TODO: the exercise heading asks for a Spark dataframe with a single
# `language` column; this cell only builds a placeholder pandas frame.

pd_df = pd.DataFrame({'col1': ['r1c1', 'r2c1', 'r3c1'],
                      'col2': ['r1c2', 'r2c2', 'r3c2'],
                      'col3': ['r1c3', 'r2c3', 'r3c3']  # row 2 was mislabelled 'r3c3'
                        }, 
                     index = [1, 2, 3])
pd_df

### -View the schema of the dataframe

### -Output the shape of the dataframe

### -Show the first 5 records in the dataframe

***
## 2. Load the mpg dataset as a spark dataframe.

### - A. Create 1 column of output that contains a message like the one below:
   `The 1999 audi a4 has a 4 cylinder engine.`

### - B. For each vehicle:
- Transform the trans column so that it only contains either manual or auto.
 

***
## 3. Load the tips dataset as a spark dataframe.

### - A. What percentage of observations are smokers?
### - B. Create a column that contains the tip percentage
### - C. Calculate the average tip percentage for each combination of sex and smoker.

***
## 4. Use the seattle weather dataset referenced in the lesson to answer the questions below.

### -Convert the temperatures to Fahrenheit.
### -Which month has the most rain, on average?
### -Which year was the windiest?
### -What is the most frequent type of weather in January?
### -What is the average high and low temperature on sunny days in July in 2013 and 2014?
### -What percentage of days were rainy in q3 of 2015?
### -For each year, find what percentage of days it rained (had non-zero precipitation).

