# Set up packages and dataframes

In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import os

In [2]:
# Reuse the SparkSession that is already running, or start a new one if none exists.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

In [3]:
# Directory (relative to the notebook) that holds the input CSV files.
DATA_FOLDER = 'data/'


def _read_csv(file_name):
    """Load one CSV file from DATA_FOLDER into a Spark DataFrame.

    The first row is used as the header and column types are inferred
    by Spark (``header=True``, ``inferSchema=True``).

    Args:
        file_name: Name of the CSV file inside DATA_FOLDER.

    Returns:
        The Spark DataFrame read from that file.
    """
    return spark.read.options(
            header=True,
            inferSchema=True
        ).csv(
            os.path.join(DATA_FOLDER, file_name),
        )


# Every input is read with identical options, so they all go through one helper.
transactions = _read_csv('sales_train.csv')
items = _read_csv('items.csv')
item_categories = _read_csv('item_categories.csv')
shops = _read_csv('shops.csv')
test = _read_csv('test.csv')

# Reproduce previous_value_benchmark

Make a copy of the transactions dataframe in which the 'date' column is parsed from the `dd.MM.yyyy` format into a date type.

In [4]:
# Parse the 'date' column with the day.month.year pattern; the result is bound
# to a new name, so `transactions` itself is left unchanged.
trans = transactions.withColumn(
    'date',
    F.to_date(F.col('date'), 'dd.MM.yyyy'),
)

Extract the day, month and year from the date.

In [5]:
# Split the parsed date into its calendar components.
# 'day' is the day of the month: F.dayofmonth is used rather than F.dayofyear,
# which would return the ordinal day within the year instead.
trans = (
    trans
    .withColumn('day', F.dayofmonth(F.col('date')))
    .withColumn('month', F.month(F.col('date')))
    .withColumn('year', F.year(F.col('date')))
)

Aggregate total sales for the previous month (October 2015).

In [6]:
# Keep only the rows whose year is 2015 and whose month is 10 (October).
oct_2015 = trans.filter((F.col('year') == 2015) & (F.col('month') == 10))

# Register the filtered rows as a temp view so they can be queried with SQL.
oct_2015.createOrReplaceTempView('oct_2015')

# Sum the daily counts for every (shop_id, item_id) pair.
right = spark.sql(
    'SELECT shop_id, item_id, SUM(item_cnt_day) AS item_cnt_month '
    ' FROM oct_2015 '
    ' GROUP BY shop_id, item_id'
)

Create previous_value_benchmark by left joining the test dataframe with the total October 2015 sales on `shop_id` and `item_id`; test rows without a match get a null `item_cnt_month`.

In [8]:
# Both sides of the join must be visible to Spark SQL as temp views.
test.createOrReplaceTempView('test')
right.createOrReplaceTempView('right')

# LEFT JOIN keeps every test row; rows without a matching (shop_id, item_id)
# in `right` come back with a null item_cnt_month.
previous_value_benchmark = spark.sql(
    'SELECT test.ID, right.item_cnt_month '
    ' FROM test '
    ' LEFT JOIN right '
    '  ON test.shop_id = right.shop_id '
    '  AND test.item_id = right.item_id '
)

Fill null values with 0 and clip the values into the [0,20] range.

In [None]:
# Replace nulls with 0 before clipping, so the CASE below never sees a null.
previous_value_benchmark = previous_value_benchmark.fillna(0)

# The temp view exposes the null-filled (not yet clipped) frame to Spark SQL.
previous_value_benchmark.createOrReplaceTempView('previous_value_benchmark')

# Clip item_cnt_month into [0, 20]. The lower bound is enforced explicitly:
# capping only the upper bound would let negative monthly sums through.
previous_value_benchmark = spark.sql(
    'SELECT ID, '
    '  CASE '
    '    WHEN item_cnt_month < 0 THEN 0 '
    '    WHEN item_cnt_month > 20 THEN 20 '
    '    ELSE item_cnt_month '
    '  END AS item_cnt_month '
    ' FROM previous_value_benchmark '
)

Write the result to `csv`.

In [9]:
# Convert the result to a pandas DataFrame and write it as CSV. The path is
# built from DATA_FOLDER, the same way the input paths are, so changing the
# data directory in one place moves both inputs and output.
previous_value_benchmark.toPandas().to_csv(
    os.path.join(DATA_FOLDER, 'previous_value_benchmark.csv'),
    index=False,
)