# Tutorial: Taming Big Data With Apache Spark and Python - Hands On!
## Assignment 2 - Total Spent Sorted

### Setup

FindSpark

This circumvents many issues with your system finding Spark.

In [1]:
# Run this cell before the pyspark import in the next cell.
import findspark
# NOTE(review): machine-specific Spark install path — change it to match your own system.
findspark.init('c:/users/andy/spark')

Load Libraries

In [2]:
import os

from pyspark import SparkConf, SparkContext

Set the file path

In [3]:
# Folder that holds the course data files. The default is the original author's
# location; set the SPARK_COURSE_DATA environment variable to point somewhere
# else without editing the notebook.
data_folder = os.environ.get(
    "SPARK_COURSE_DATA",
    "C:/Users/Andy/Dropbox/FactoryFloor/Repositories/Tutorial_Udemy_SparkPython/Course_Resources/",
)

# File paths are built from this value by plain string concatenation, so make
# sure it always ends with a path separator.
if not data_folder.endswith(("/", "\\")):
    data_folder += "/"

Create the Spark Context

In [4]:
# Configure the Spark context: the master node is the local machine and the
# application is named "TotalSpentSorted".
conf = SparkConf().setMaster("local").setAppName("TotalSpentSorted")

# Reuse the running context if this cell is executed again; constructing a
# second SparkContext in the same kernel raises an error. An already-running
# context is returned as-is, so `conf` only takes effect when a new context
# is actually created.
sc = SparkContext.getOrCreate(conf=conf)

### Load the Data

In [5]:
# Full path of the customer orders file inside the data folder.
file_to_open = f"{data_folder}customer-orders.csv"

# Read the file as text: each row of the file becomes one value in the RDD.
# NOTE(review): the name `input` shadows Python's builtin of the same name; it is
# kept because later cells refer to it.
input = sc.textFile(file_to_open)

Inspect the RDD

*CustomerID, ItemID, AmountSpent*

In [6]:
# Show the five largest rows (not the first five). The rows are still plain
# strings at this point, so they are compared as text.
input.top(5)

['99,9562,9.72',
 '99,9559,4.14',
 '99,9474,99.74',
 '99,9398,19.43',
 '99,9332,72.35']

Function To Parse RDD

In [7]:
def parseLine(line):
    """Turn one comma-separated row into a (customerID, amount spent) pair.

    Args:
        line: A text row whose first field is the customer ID and whose
            third field is the amount spent.

    Returns:
        A tuple ``(customerID, amount)`` with the ID as an int and the
        amount as a float. The second field of the row is ignored.
    """
    columns = line.split(',')
    return (int(columns[0]), float(columns[2]))

### Transformations

Split each line on commas and return the customerID and the amount spent.

In [8]:
# Convert every raw text row into a (customerID, amount) pair using parseLine.
rdd = input.map(parseLine)

# Peek at the five largest pairs.
rdd.top(5)

[(99, 99.74), (99, 99.12), (99, 98.72), (99, 98.58), (99, 98.12)]

Reduce on key (i.e., customerID) and aggregate (i.e., sum) values.

In [9]:
# Add up the amounts of all pairs that share a customerID, leaving one
# (customerID, total spent) pair per customer.
totalsByID = rdd.reduceByKey(lambda runningTotal, amount: runningTotal + amount)

# Peek at the five largest (customerID, total) pairs.
totalsByID.top(5)

[(99, 4172.289999999998),
 (98, 4297.260000000001),
 (97, 5977.189999999995),
 (96, 3924.230000000001),
 (95, 4876.840000000002)]

Flip, so total spent is key. Sort.

In [10]:
# Swap each pair to (total spent, customerID) so the total becomes the key,
# then sort the pairs by that key.
totalsByIDSorted = (
    totalsByID
    .map(lambda idAndTotal: (idAndTotal[1], idAndTotal[0]))
    .sortByKey()
)

# Peek at the five biggest spenders.
totalsByIDSorted.top(5)

[(6375.449999999997, 68),
 (6206.199999999999, 73),
 (6193.109999999999, 39),
 (6065.389999999999, 54),
 (5995.660000000003, 71)]

### Actions

Collect the results

In [11]:
# Gather the sorted (total spent, customerID) pairs so they can be looped over below.
results = totalsByIDSorted.collect()

Print each customer ID with its total amount spent, in ascending order of total.

In [13]:
# Each result is a (total spent, customerID) pair.
for total, customerID in results:
    # Print every customer unconditionally: a truthiness check on the ID
    # would silently skip a customer whose ID is 0.
    print(customerID, str(total))

45 3309.38
79 3790.570000000001
96 3924.230000000001
23 4042.6499999999987
99 4172.289999999998
75 4178.500000000001
36 4278.049999999997
98 4297.260000000001
47 4316.299999999999
77 4327.729999999999
13 4367.62
48 4384.33
49 4394.599999999999
94 4475.569999999999
67 4505.79
50 4517.27
78 4524.509999999999
5 4561.069999999999
57 4628.4
83 4635.799999999997
91 4642.259999999999
74 4647.129999999999
84 4652.939999999999
3 4659.63
12 4664.589999999998
66 4681.919999999999
56 4701.019999999999
21 4707.41
80 4727.860000000001
14 4735.030000000001
37 4735.200000000002
7 4755.070000000001
44 4756.8899999999985
31 4765.05
82 4812.489999999998
4 4815.050000000002
10 4819.700000000001
88 4830.549999999999
20 4836.859999999999
89 4851.479999999999
95 4876.840000000002
38 4898.460000000002
76 4904.209999999999
86 4908.81
27 4915.889999999999
18 4921.27
53 4945.299999999999
1 4958.600000000001
51 4975.22
16 4979.06
30 4990.72
28 5000.709999999998
22 5019.449999999999
29 5032.529999999999
17 5032.67