## Demo - Working with PySpark Dataframes

#### First - read the data we uploaded into Databricks into a PySpark dataframe

In [3]:
from pyspark.sql.functions import *
from pyspark.sql.dataframe import *
from pyspark.sql.types import *

In [4]:
# File location and type
file_location = "/FileStore/tables/winemag_data_first150k-8c60b.csv"
file_type = "csv"

# The applied options are for CSV files. For other file types, these will be ignored.
# The reader format comes from file_type so the setting above is actually honoured.
# inferSchema is off, so every column is loaded as a string.
wine_df = spark.read.format(file_type) \
  .option("inferSchema", "false") \
  .option("header", "true") \
  .option("sep", ",") \
  .load(file_location)

In [5]:
# Preview the raw wine data: first 20 rows, long values truncated
wine_df.show(n=20, truncate=True)

### Explore the dataset - using .describe(), .printSchema(), and .count()

In [7]:
# Check out the dataframe using .describe()
# describe() only builds a new dataframe of summary statistics - call .show() to actually print them
wine_df.describe().show()

In [8]:
# Print the schema (column names and their data types) of the wine dataframe
wine_df.printSchema()

In [9]:
# How many rows do we have? count() returns the number of rows in the dataframe
wine_df.count()

### Let's select the columns we are interested in...

In [11]:
# Keep only the columns we care about: country, designation, points, price, province, variety
new_wine_df = wine_df.select(
    'country',
    'designation',
    'points',
    'price',
    'province',
    'variety',
)

In [12]:
# Preview the trimmed-down dataframe: first 20 rows, long values truncated
new_wine_df.show(n=20, truncate=True)

## I'm not that familiar with wine variety names - can we work with wine colours instead?

In [14]:
# We want a reference table that maps each wine variety to a wine colour.
# This is its schema: two string columns, added one field at a time.
schema = (
    StructType()
    .add('Variety', StringType())
    .add('Colour', StringType())
)

In [15]:
# Now - let's fill our reference table, mapping wine varieties to wine colours
# Each tuple is (variety, colour). Pinot Noir and Cabernet Sauvignon are red grapes.
vals = [('Chardonnay','White'),
        ('Pinot Noir','Red'),
        ('Cabernet Sauvignon','Red'),
        ('Riesling','White'),
        ('Merlot','Red'),
        ('Zinfandel','Red'),
        ('Malbec','Red'),
        ('Shiraz','Red'),
        ('Sangiovese','Red')]

In [16]:
# Turn the (variety, colour) pairs into a dataframe that follows our schema
colour_mapping_df = spark.createDataFrame(data=vals, schema=schema)

In [17]:
# Preview the variety-to-colour mapping: first 20 rows, long values truncated
colour_mapping_df.show(n=20, truncate=True)

In [18]:
# Now - let's finally join our two dataframes to get a wine colour column added to our original dataset
# First, create aliases so each side of the join can be referenced unambiguously
new = new_wine_df.alias('new')
mapping = colour_mapping_df.alias('mapping')

# A left join keeps every wine; varieties missing from the mapping table get NULL as their colour.
# Selecting 'new.*' plus the colour (renamed to lower case) leaves out the mapping table's
# duplicate 'Variety' column.
wine_df_with_colours = new.join(mapping, col('new.variety') == col('mapping.Variety'), how = "left") \
.select('new.*', col('mapping.Colour').alias('colour'))

In [19]:
# Let's take a look at the schema (column names and data types) of our new dataframe
wine_df_with_colours.printSchema()

### There are lots of nulls in the colour column now - let's remove those

In [21]:
# Only some wine varieties are in our mapping table, so the remaining rows have NULL in the colour column
wine_df_with_colours.filter(col('colour').isNull()).show()

In [22]:
# Keep only the rows that were matched to a colour
transformed_wine_df = wine_df_with_colours.filter(col('colour').isNotNull())
transformed_wine_df.show()

### I don't like US wine - let's filter that out!

In [24]:
# Keep only the rows whose country is something other than US
# (rows with a NULL country are dropped too, because the comparison evaluates to NULL)
non_us_wines_df = transformed_wine_df.filter(col('country') != 'US')
non_us_wines_df.show()

### What is the average, max and min price of each wine colour?

In [26]:
# Mean price per wine colour
avg_price_by_colour = (
    non_us_wines_df
    .groupBy('colour')
    .agg(mean('price').alias('avg_price'))
)
avg_price_by_colour.show()

In [27]:
# How much is the cheapest bottle of each colour of wine?
# price was loaded as a string (inferSchema is off), so cast it to a number first -
# otherwise min() compares text and '100' would rank below '20'.
# Note: min here is pyspark.sql.functions.min (from the star import), not Python's builtin.
min_price_by_colour = non_us_wines_df.groupBy(non_us_wines_df.colour).agg(min(non_us_wines_df.price.cast('double')).alias('min_price'))
min_price_by_colour.show()

In [28]:
# How much is the most expensive bottle of each colour of wine?
# price was loaded as a string (inferSchema is off), so cast it to a number first -
# otherwise max() compares text and '99' would rank above '2300'.
# Note: max here is pyspark.sql.functions.max (from the star import), not Python's builtin.
max_price_by_colour = non_us_wines_df.groupBy(non_us_wines_df.colour).agg(max(non_us_wines_df.price.cast('double')).alias('max_price'))
max_price_by_colour.show()

### Having individual points scores isn't that useful - shall we turn this into a categorical variable?

In [30]:
# Points are stored as strings: add an integer copy called points_int, then drop the original column
non_us_wines_df_transformed = (
    non_us_wines_df
    .withColumn('points_int', col('points').cast('int'))
    .drop('points')
)
non_us_wines_df_transformed.show()

In [31]:
# Create udf to assign the correct rating to each row - 'user defined function'
def _rating_for_points(points):
    """Map a points score to a rating category.

    Returns 'low-rated' for scores up to 85, 'mid-rated' for 86 to 95,
    'high-rated' above 95, and None when the score is missing.
    """
    # A NULL score arrives as None; comparing None with a number would raise TypeError
    if points is None:
        return None
    if points <= 85:
        return 'low-rated'
    # Everything above 85 up to 95 is mid-rated, so a score of exactly 86 no longer falls through
    if points <= 95:
        return 'mid-rated'
    return 'high-rated'

assign_rating = udf(_rating_for_points, StringType())

In [32]:
# Add the low / mid / high rating category, then give the integer points column its original name back
final_df = (
    non_us_wines_df_transformed
    .withColumn('point_rating_category', assign_rating(col('points_int')))
    .withColumnRenamed('points_int', 'points')
)

In [33]:
# Preview the final dataframe: first 20 rows, long values truncated
final_df.show(n=20, truncate=True)