## Задача 1. Средний рейтинг
```
Реализуйте подсчет среднего рейтинга продуктов. Результат сохранить в HDFS в файле "avg_rating.csv".
Формат каждой записи: ProdId,Rating
```

In [2]:
rdd_products = sc.textFile('/user/cloudera/hw_part_1/spark/task_1/data/samples_100.json')
# rdd_products = sc.textFile('/user/cloudera/hw_part_1/spark/task_1/data/Electronics.json')

In [5]:
import json

def get_product_rating(product_json):
    """Extract the product id and rating from one JSON-encoded line.

    Args:
        product_json: A JSON object serialized as a string.

    Returns:
        An ``(asin, overall)`` tuple; a key that is absent from the object
        yields ``None`` in its position.
    """
    review = json.loads(product_json)
    return tuple(review.get(key) for key in ('asin', 'overall'))

# Turn every input line into an (asin, overall) pair and preview the first ten.
rdd_product_rating = rdd_products.map(get_product_rating)
rdd_product_rating.take(10)

[('0528881469', 5.0),
 ('0528881469', 1.0),
 ('0528881469', 3.0),
 ('0528881469', 2.0),
 ('0528881469', 1.0),
 ('0594451647', 5.0),
 ('0594451647', 2.0),
 ('0594451647', 5.0),
 ('0594451647', 4.0),
 ('0594451647', 5.0)]

In [15]:
# Average rating per product id. The accumulator is a (rating_sum, count)
# pair; the sum is seeded with a float so that the final division is a true
# division even when ratings are integers on a Python 2 interpreter.
rdd_product_avg_rating = (
    rdd_product_rating
        .aggregateByKey(
                (0.0, 0),
                # Fold one rating into the accumulator.
                lambda sum_count, rating: (sum_count[0] + rating, sum_count[1] + 1),
                # Merge two accumulators built for the same key.
                lambda sum_count_x, sum_count_y: (sum_count_x[0] + sum_count_y[0],
                                                  sum_count_x[1] + sum_count_y[1])
            )
        .mapValues(lambda sum_count: sum_count[0] / sum_count[1])
)
print(rdd_product_avg_rating.collect())

def to_csv_line(data):
    """Serialize an iterable of values into one CSV record.

    Each item is converted with ``str``. A field that contains the
    delimiter, a double quote, or a line break is wrapped in double quotes,
    and every double quote inside it is doubled (RFC 4180 style), so the
    record can be parsed back unambiguously.

    Args:
        data: Iterable of values forming one record.

    Returns:
        The comma-separated record as a string, without a line terminator.
    """
    delimiter = ','
    quote = '"'
    fields = []
    for item in data:
        field = str(item)
        # Quote whenever the raw text could be mistaken for CSV structure;
        # otherwise the field is emitted unchanged.
        if any(ch in field for ch in (delimiter, quote, '\n', '\r')):
            field = quote + field.replace(quote, quote * 2) + quote
        fields.append(field)
    return delimiter.join(fields)

print(rdd_product_avg_rating.map(to_csv_line).collect())

[('0972683275', 4.390243902439025), ('0528881469', 2.4), ('0594451647', 4.2), ('0594481813', 4.0)]
['0972683275,4.390243902439025', '0528881469,2.4', '0594451647,4.2', '0594481813,4.0']


```python
# product_avg_rating.py
import json
import sys
from pyspark import SparkContext, SparkConf

# Configure the Spark application name and create the context.
conf = SparkConf().setAppName('ProductAvgRating')
sc = SparkContext(conf=conf)

# First command-line argument: path of the input data, loaded as text lines.
rdd_products = sc.textFile(sys.argv[1])

def get_product_rating(product_json):
    """Extract the product id and rating from one JSON-encoded line.

    Args:
        product_json: A JSON object serialized as a string.

    Returns:
        An ``(asin, overall)`` tuple; a key that is absent from the object
        yields ``None`` in its position.
    """
    review = json.loads(product_json)
    return tuple(review.get(key) for key in ('asin', 'overall'))

# Turn every input line into an (asin, overall) pair.
rdd_product_rating = rdd_products.map(get_product_rating)

# Average rating per product id. The accumulator is a (rating_sum, count)
# pair; the sum is seeded with a float so that the final division is a true
# division even when ratings are integers on a Python 2 interpreter.
rdd_product_avg_rating = (
    rdd_product_rating
        .aggregateByKey(
                (0.0, 0),
                # Fold one rating into the accumulator.
                lambda sum_count, rating: (sum_count[0] + rating, sum_count[1] + 1),
                # Merge two accumulators built for the same key.
                lambda sum_count_x, sum_count_y: (sum_count_x[0] + sum_count_y[0],
                                                  sum_count_x[1] + sum_count_y[1])
            )
        .mapValues(lambda sum_count: sum_count[0] / sum_count[1])
)

def to_csv_line(data):
    """Serialize an iterable of values into one CSV record.

    Each item is converted with ``str``. A field that contains the
    delimiter, a double quote, or a line break is wrapped in double quotes,
    and every double quote inside it is doubled (RFC 4180 style), so the
    record can be parsed back unambiguously.

    Args:
        data: Iterable of values forming one record.

    Returns:
        The comma-separated record as a string, without a line terminator.
    """
    delimiter = ','
    quote = '"'
    fields = []
    for item in data:
        field = str(item)
        # Quote whenever the raw text could be mistaken for CSV structure;
        # otherwise the field is emitted unchanged.
        if any(ch in field for ch in (delimiter, quote, '\n', '\r')):
            field = quote + field.replace(quote, quote * 2) + quote
        fields.append(field)
    return delimiter.join(fields)

# Format every (id, average) pair as a CSV record and write the records to
# the output path given as the second command-line argument.
rdd_product_avg_rating.map(to_csv_line).saveAsTextFile(sys.argv[2])
```

## Задача 2. Добавление наименования продукта
```
Напишите программу, которая каждому ProdId из "avg_rating.csv" ставит в соответствие название продукта.
Результат сохранить в HDFS в файле "prodname_avg_rating.csv": ProdId,Name,Rating
```

In [11]:
rdd_products = sc.textFile('/user/cloudera/hw_part_1/spark/task_2/data/avg_rating.csv')
#rdd_products_meta = sc.textFile('/user/cloudera/hw_part_1/spark/task_2/data/samples_100_meta.json')
rdd_products_meta = sc.textFile('/user/cloudera/hw_part_1/spark/task_2/data/Electronics_meta.json')

In [29]:
rdd_product_avg_rating = rdd_products.map(lambda csv_line: csv_line.split(','))
rdd_product_avg_rating.persist()
rdd_product_avg_rating.take(5)

[['B00ARSNT7Q', '4.29411764706'],
 ['B001UHMVC2', '4.55555555556'],
 ['B001WMFXJ8', '4.5'],
 ['B002YNY8GI', '4.46511627907'],
 ['B008JH59FC', '4.22222222222']]

In [30]:
import ast
import json

def get_product_name(product_json):
    """Extract the product id and title from one metadata line.

    The line is evaluated as a Python literal instead of being parsed as
    JSON, as a workaround for "JSONDecodeError: Expecting property name
    enclosed in double quotes".

    Args:
        product_json: One metadata record serialized as a dict literal.

    Returns:
        An ``(asin, title)`` tuple; a key that is absent from the record
        yields ``None`` in its position.
    """
    metadata = ast.literal_eval(product_json)
    return tuple(metadata.get(key) for key in ('asin', 'title'))

# Parse each metadata line into an (asin, title) pair and keep only the
# pairs in which both elements are truthy.
rdd_product_name = rdd_products_meta.map(get_product_name).filter(
    lambda asin_title: all(asin_title)
)
rdd_product_name.persist()
rdd_product_name.take(5)

[('0132793040',
  'Kelby Training DVD: Mastering Blend Modes in Adobe Photoshop CS5 By Corey Barker'),
 ('0321732944',
  'Kelby Training DVD: Adobe Photoshop CS5 Crash Course By Matt Kloskowski'),
 ('0439886341', 'Digital Organizer and Messenger'),
 ('0511189877', 'CLIKR-5 Time Warner Cable Remote Control UR5U-8780L'),
 ('0528881469',
  'Rand McNally 528881469 7-inch Intelliroute TND 700 Truck GPS')]

In [37]:
rdd_product_name_rating = rdd_product_avg_rating.join(rdd_product_name)
# print(rdd_product_name_rating.take(5))
# [('B008JH59FC', ('4.22222222222', 'Logitech 910-002951 M325 Wireless Mouse - Coral Fan')),]

def to_csv_line(data):
    """Serialize an iterable of values into one CSV record.

    Each item is converted with ``str``. A field that contains the
    delimiter, a double quote, or a line break is wrapped in double quotes,
    and every double quote inside it is doubled (RFC 4180 style), so the
    record can be parsed back unambiguously.

    Args:
        data: Iterable of values forming one record.

    Returns:
        The comma-separated record as a string, without a line terminator.
    """
    delimiter = ','
    quote = '"'
    fields = []
    for item in data:
        field = str(item)
        # Quote whenever the raw text could be mistaken for CSV structure;
        # otherwise the field is emitted unchanged.
        if any(ch in field for ch in (delimiter, quote, '\n', '\r')):
            field = quote + field.replace(quote, quote * 2) + quote
        fields.append(field)
    return delimiter.join(fields)

print(
    rdd_product_name_rating
        # Reorder (id, (rating, title)) into (id, title, rating).
        .map(lambda joined: (joined[0], joined[1][1], joined[1][0]))
        .map(to_csv_line)
        .take(5)
)


['B008JH59FC,4.22222222222,Logitech 910-002951 M325 Wireless Mouse - Coral Fan', 'B00C5R8A6W,4.66666666667,"Leef Spark USB 2.0 32GB High-speed USB Flash Drive with Magnet Cap, LED, and PrimeGrade Memory (Black)"', 'B0083E7X64,2.83333333333,"HHI Anti-Fingerprint, Anti-Glare, Matte Finished Screen Protector for Toshiba Excite 10 (Package include a HandHelditems Sketch Stylus Pen)"', 'B003OBUJGW,3.66666666667,Sony ECMSST1 Compact Stereo Microphone for NEX-3/NEX-5 Camera', 'B004LPW2LA,3.71428571429,SANOXY&reg; high speed Mini USB Wireless Lan Adaptor 802.11n']


```python
# product_name_rating.py
import ast
import json
import sys
from pyspark import SparkContext, SparkConf

# Configure the Spark application name and create the context.
conf = SparkConf().setAppName('ProductNameRating')
sc = SparkContext(conf=conf)

# argv[1]: average-rating CSV (presumably the "ProdId,Rating" output of
# task 1); argv[2]: product metadata, one record per line.
rdd_products = sc.textFile(sys.argv[1])
rdd_products_meta = sc.textFile(sys.argv[2])

# Split each CSV line on commas into a list of its fields.
rdd_product_avg_rating = rdd_products.map(lambda csv_line: csv_line.split(','))
rdd_product_avg_rating.persist()

def get_product_name(product_json):
    """Extract the product id and title from one metadata line.

    The line is evaluated as a Python literal instead of being parsed as
    JSON, as a workaround for "JSONDecodeError: Expecting property name
    enclosed in double quotes".

    Args:
        product_json: One metadata record serialized as a dict literal.

    Returns:
        An ``(asin, title)`` tuple; a key that is absent from the record
        yields ``None`` in its position.
    """
    metadata = ast.literal_eval(product_json)
    return tuple(metadata.get(key) for key in ('asin', 'title'))

# Parse each metadata line into an (asin, title) pair and keep only the
# pairs in which both elements are truthy.
rdd_product_name = rdd_products_meta.map(get_product_name).filter(
    lambda asin_title: all(asin_title)
)
rdd_product_name.persist()

def to_csv_line(data):
    """Serialize an iterable of values into one CSV record.

    Each item is converted with ``str``. A field that contains the
    delimiter, a double quote, or a line break is wrapped in double quotes,
    and every double quote inside it is doubled (RFC 4180 style), so the
    record can be parsed back unambiguously.

    Args:
        data: Iterable of values forming one record.

    Returns:
        The comma-separated record as a string, without a line terminator.
    """
    delimiter = ','
    quote = '"'
    fields = []
    for item in data:
        field = str(item)
        # Quote whenever the raw text could be mistaken for CSV structure;
        # otherwise the field is emitted unchanged.
        if any(ch in field for ch in (delimiter, quote, '\n', '\r')):
            field = quote + field.replace(quote, quote * 2) + quote
        fields.append(field)
    return delimiter.join(fields)

# Join ratings with titles on the product id.
rdd_product_name_rating = rdd_product_avg_rating.join(rdd_product_name)
rdd_product_name_rating.map(
    # Reorder (id, (rating, title)) into (id, title, rating).
    lambda joined: (joined[0], joined[1][1], joined[1][0])
).map(to_csv_line).saveAsTextFile(sys.argv[3])
```

## Задача 3. Поиск среднего рейтинга по названию продукта
```
Напишите программу, которая выводит средние рейтинги всех продуктов из "prodname_avg_rating.csv",
в названии которых встречается введенное при запуске слово: ProdId,Name,Rating
```

In [2]:
rdd_products = sc.textFile('/user/cloudera/hw_part_1/spark/task_3/data/prodname_avg_rating.csv')
rdd_products.persist()

/user/cloudera/hw_part_1/spark/task_3/data/prodname_avg_rating.csv MapPartitionsRDD[3] at textFile at NativeMethodAccessorImpl.java:0

In [5]:
filter_word = 'Nook'

def filter_by_title(title):
    """Tell whether ``title`` contains ``filter_word``, ignoring case."""
    needle = filter_word.lower()
    return title.lower().find(needle) != -1

(
    rdd_products
        # A title may itself contain commas (it is then quoted), so it is
        # taken as everything between the first and the last comma of the
        # line rather than as the second comma-separated token; otherwise a
        # word located after a comma inside the title would never match.
        .filter(lambda id_title_rating_csv_line: filter_by_title(
            id_title_rating_csv_line.split(',', 1)[1].rsplit(',', 1)[0]))
        .take(5)
)

['B00C2L6MAW,GMYLE(TM) Light Blue PU Leather Slim Folio Magnetic Flip Stand Case Cover with Wake Up Sleep Function for Barnes &amp; Noble Nook HD+ Plus 9 &quot; inches Tablet,4.71428571429',
 'B007EWTZ8U,"Devicewear Dante 360 Degree Rotating Case for Nook Tablet/Nook Color, Pink (DAN-NT-PNK)",4.5',
 'B00CPV9YOU,GMYLE(R) Black 360 Degree Rotating PU leather Folio Stand Case Cover for Nook HD+ Plus 9 inches Barnes &amp; Noble e-book Reader Tablet (Multi Angle- Vertical / Horizontal and Wake up Sleep Function),4.23333333333',
 'B004GIKP1G,rooCASE (Black) Leather Case Cover with 22 Angle Adjustable Stand for Barnes and Noble NOOK Tablet / NOOKcolor Nook Color eBook Reader - MV Series (NOT Compatible with NOOK HD),4.35849056604',
 'B00AAVEUCS,"MoKo(TM) 360 Degree Rotating Cover Case for Barnes &amp; Noble Nook Full HD 7&quot; Inch tablet, Black (with Vertical and Horizontal Stand, and Smart Cover Auto Sleep/Wake Function)--Lifetime Warranty",4.5']

```python
# product_filtered.py
import sys
from pyspark import SparkContext, SparkConf

# Configure the Spark application name and create the context.
conf = SparkConf().setAppName('ProductFiltered')
sc = SparkContext(conf=conf)

# First command-line argument: path of the input CSV, loaded as text lines.
rdd_products = sc.textFile(sys.argv[1])
rdd_products.persist()

# Third command-line argument: the word to look for in product titles
# (the second argument is the output path used when saving the result).
filter_word = sys.argv[3]

def filter_by_title(title):
    """Tell whether ``title`` contains ``filter_word``, ignoring case."""
    needle = filter_word.lower()
    return title.lower().find(needle) != -1

(
    rdd_products
        # A title may itself contain commas (it is then quoted), so it is
        # taken as everything between the first and the last comma of the
        # line rather than as the second comma-separated token; otherwise a
        # word located after a comma inside the title would never match.
        .filter(lambda id_title_rating_csv_line: filter_by_title(
            id_title_rating_csv_line.split(',', 1)[1].rsplit(',', 1)[0]))
        .saveAsTextFile(sys.argv[2])
)
```

### Пример diff'а между map_reduce и spark результатами
```
< B00B20N9GO,Poetic Slimline Portfolio Case for Barnes &amp; Noble Nook Full HD 7&quot; Inch Tablet Black (3 Year Manufacturer Warranty From Poetic),4.027027027027027
< B00B25I8DS,AC Adapter Charger Power Cord for Nook Color Tablet ONLY!,4.111111111111111
---
> B00B20N9GO,Poetic Slimline Portfolio Case for Barnes &amp; Noble Nook Full HD 7&quot; Inch Tablet Black (3 Year Manufacturer Warranty From Poetic),4.02702702703
> B00B25I8DS,AC Adapter Charger Power Cord for Nook Color Tablet ONLY!,4.11111111111
```