## Вариант 

In [1]:
surname = 'Телышев'  # Ваша фамилия

alphabet = 'абвгдеёжзийклмнопрстуфхцчшщъыьэюя'
variants = (1, 42, 21, 21, 34,  6, 44, 26, 18, 44, 38, 26, 14, 43,  4, 49,
            45, 7, 42, 29,  4,  9, 36, 34, 31, 29,  5, 30,  4, 19, 28, 25, 33)

char_num_map = dict(zip(alphabet, variants))
variant =  sum([char_num_map[char] for char in surname.lower()]) % 40 + 1

print("Задача № 1: ", variant % 3 + 1)
print("Задача № 2: ", variant % 2 + 1 )

Задача № 1:  1
Задача № 2:  1


## Задача 1. Анализ датасета (2 балла)
### Вариант 1. Animation, Romance, Documentary

In [2]:
from pyspark import SparkContext
from pyspark.sql import SQLContext

sc = SparkContext.getOrCreate()
sqlContext = SQLContext(sc)

In [3]:
import pyspark.sql.functions as F

from pyspark.sql import Row
from pyspark.sql.types import StructType, StructField, StringType, LongType, DoubleType

# movieId,title,genres
# 1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
MovieSchema = StructType((
    StructField('Id', LongType(), nullable=False),
    StructField('Title', StringType(), nullable=False),
    StructField('Genres', StringType(), nullable=True),   
))


# userId,movieId,rating,timestamp
# 1,1,4.0,964982703
RatingSchema = StructType((
    StructField('UserId', LongType(), nullable=False),
    StructField('MovieId', LongType(), nullable=False),
    StructField('Rating', DoubleType(), nullable=False),  
    StructField('Timestamp', LongType(), nullable=True),  # TODO(a.telyshev): TimestampType doesn't working
))


class MovieAnalyzer:
    GENRES = ('Animation', 'Romance', 'Documentary')  # Вариант 1
    
    def __init__(self, movies_data_path, ratings_data_path, genres=GENRES):
        print('*' * 50)
        self.movies_df = self._get_df_from_csv(movies_data_path, MovieSchema)
        self.movies_df.persist()
        self.movies_df.printSchema()
        self.movies_df.show(5)

        self.ratings_df = self._get_df_from_csv(ratings_data_path, RatingSchema)
        self.ratings_df.persist()
        self.ratings_df.printSchema()
        self.ratings_df.show(5)
        print('*' * 50)
        
        self._genres = genres
    
    def __del__(self):
        self.movies_df.unpersist()
        self.ratings_df.unpersist()
    
    @staticmethod
    def _get_df_from_csv(path, schema):
        return sqlContext.read.load(
            path=path,
            format='csv',
            schema=schema,
            header='true',
            inferSchema='false',
            sep=',',
            nullValue='null',
            mode='DROPMALFORMED'
        )
    
    def _join_movies_and_ratings(self):
        return (
            self.movies_df
                .alias('Movie')
                .join(self.ratings_df.alias('Rating'), F.col('Movie.Id') == F.col('Rating.MovieId'))
        )
        
    def print_count_of_films_by_genre(self):
        """
        $ for genre in 'Animation' 'Romance' 'Documentary' \
        > do \
        >     printf "$genre\t" && grep $genre ~/Desktop/ml_data/ml_latest_small/movies.csv | wc -l \
        > done
            Animation   611
            Romance     1599
            Documentary 440
        """
        for genre in self._genres:
            count = (
                self.movies_df
                    .filter(F.col('Genres').contains(genre))
                    .select(F.countDistinct('Id'))
                    .take(1)[0][0]
            )
            print(f'{genre} {count}')
    
    def print_top_of_films_by_rating(self, count=10):
        for genre in self._genres:
            print(genre)
            df = self._join_movies_and_ratings()
            (df
             .filter(F.col('Genres').contains(genre))
             .groupBy('Id')
             .count()
             .sort(F.desc('count'))
             .join(self.movies_df, 'Id')
             .select('Id', 'Title', 'Genres', F.col('count').alias('Ratings count'))
             .show(count))
    
    def print_losers_of_films_by_ratings_count(self, count=10, min_ratings_count=10):
        for genre in self._genres:
            print(genre)
            df = self._join_movies_and_ratings()
            (df
             .filter(F.col('Genres').contains(genre))
             .groupBy('Id')
             .count()
             .filter(f'`count` > {min_ratings_count}')
             .sort(F.asc('count'))
             .join(self.movies_df, 'Id')
             .select('Id', 'Title', 'Genres', F.col('count').alias('Ratings count'))
             .show(count))
    
    def print_top_of_films_by_rating_avg(self, count=10, min_ratings_count=10):
        return self._print_films_by_rating_avg(count, min_ratings_count)
    
    def print_losers_of_films_by_rating_avg(self, count=10, min_ratings_count=10):
        return self._print_films_by_rating_avg(count, min_ratings_count, sort_func=F.asc)
    
    def _print_films_by_rating_avg(self, count=10, min_ratings_count=10, sort_func=F.desc):
        for genre in self._genres:
            print(genre)
            df = self._join_movies_and_ratings()
            (df
             .filter(F.col('Genres').contains(genre))
             .groupBy('Id')
             .agg(
                 F.count('Rating').alias('count'),
                 F.avg('Rating').alias('avg')
             )
             .filter(f'`count` > {min_ratings_count}')
             .sort(sort_func('avg'))
             .join(self.movies_df, 'Id')
             .select(
                 'Id', 'Title', 'Genres',
                 F.col('avg').alias('Rating avg'),
                 F.col('count').alias('Ratings count')
             )
             .show(count))

In [4]:
import os

DATA_DIR = 'file:///home/cloudera/Desktop/ml_data'

small_data_analyzer = MovieAnalyzer(
    movies_data_path=os.path.join(DATA_DIR, 'ml_latest_small', 'movies.csv'),
    ratings_data_path=os.path.join(DATA_DIR, 'ml_latest_small', 'ratings.csv')
)
big_data_analyzer = MovieAnalyzer(
    movies_data_path=os.path.join(DATA_DIR, 'ml_latest_big', 'movies.csv'),
    ratings_data_path=os.path.join(DATA_DIR, 'ml_latest_big', 'ratings.csv')
)

**************************************************
root
 |-- Id: long (nullable = true)
 |-- Title: string (nullable = true)
 |-- Genres: string (nullable = true)

+---+--------------------+--------------------+
| Id|               Title|              Genres|
+---+--------------------+--------------------+
|  1|    Toy Story (1995)|Adventure|Animati...|
|  2|      Jumanji (1995)|Adventure|Childre...|
|  3|Grumpier Old Men ...|      Comedy|Romance|
|  4|Waiting to Exhale...|Comedy|Drama|Romance|
|  5|Father of the Bri...|              Comedy|
+---+--------------------+--------------------+
only showing top 5 rows

root
 |-- UserId: long (nullable = true)
 |-- MovieId: long (nullable = true)
 |-- Rating: double (nullable = true)
 |-- Timestamp: long (nullable = true)

+------+-------+------+---------+
|UserId|MovieId|Rating|Timestamp|
+------+-------+------+---------+
|     1|      1|   4.0|964982703|
|     1|      3|   4.0|964981247|
|     1|      6|   4.0|964982224|
|     1|     47|   

### 1. Выведите данные, сопоставляющие жанры и количество фильмов

In [5]:
small_data_analyzer.print_count_of_films_by_genre()

Animation 611
Romance 1596
Documentary 440


In [6]:
big_data_analyzer.print_count_of_films_by_genre()

Animation 2663
Romance 7412
Documentary 5118


### 2. Выведите первые 10 фильмов с наибольшим количеством рейтингов для каждого жанра в соотвествии с вариантом 

In [7]:
small_data_analyzer.print_top_of_films_by_rating()

Animation
+-----+--------------------+--------------------+-------------+
|   Id|               Title|              Genres|Ratings count|
+-----+--------------------+--------------------+-------------+
|    1|    Toy Story (1995)|Adventure|Animati...|          215|
|  588|      Aladdin (1992)|Adventure|Animati...|          183|
|  364|Lion King, The (1...|Adventure|Animati...|          172|
| 4306|        Shrek (2001)|Adventure|Animati...|          170|
|  595|Beauty and the Be...|Animation|Childre...|          146|
| 6377| Finding Nemo (2003)|Adventure|Animati...|          141|
| 4886|Monsters, Inc. (2...|Adventure|Animati...|          132|
| 8961|Incredibles, The ...|Action|Adventure|...|          125|
|68954|           Up (2009)|Adventure|Animati...|          105|
|60069|       WALL·E (2008)|Adventure|Animati...|          104|
+-----+--------------------+--------------------+-------------+
only showing top 10 rows

Romance
+----+--------------------+--------------------+------------

In [8]:
big_data_analyzer.print_top_of_films_by_rating()

Animation
+-----+--------------------+--------------------+-------------+
|   Id|               Title|              Genres|Ratings count|
+-----+--------------------+--------------------+-------------+
|    1|    Toy Story (1995)|Adventure|Animati...|        68469|
|  588|      Aladdin (1992)|Adventure|Animati...|        51827|
|  364|Lion King, The (1...|Adventure|Animati...|        47305|
| 4306|        Shrek (2001)|Adventure|Animati...|        46826|
|  595|Beauty and the Be...|Animation|Childre...|        42400|
| 4886|Monsters, Inc. (2...|Adventure|Animati...|        37112|
| 6377| Finding Nemo (2003)|Adventure|Animati...|        37000|
| 8961|Incredibles, The ...|Action|Adventure|...|        31857|
| 3114|  Toy Story 2 (1999)|Adventure|Animati...|        29820|
|60069|       WALL·E (2008)|Adventure|Animati...|        28116|
+-----+--------------------+--------------------+-------------+
only showing top 10 rows

Romance
+----+--------------------+--------------------+------------

### 3. Выведите первые 10 фильмов с наименьшим количеством рейтингов (но больше 10) для каждого жанра в соотвествии с вариантом

In [9]:
small_data_analyzer.print_losers_of_films_by_ratings_count()

Animation
+-----+--------------------+--------------------+-------------+
|   Id|               Title|              Genres|Ratings count|
+-----+--------------------+--------------------+-------------+
| 8965|Polar Express, Th...|Adventure|Animati...|           11|
|49274|   Happy Feet (2006)|Adventure|Animati...|           11|
|97225|Hotel Transylvani...|Animation|Childre...|           11|
|52435|How the Grinch St...|Animation|Comedy|...|           11|
|65261|Ponyo (Gake no ue...|Adventure|Animati...|           11|
|  631|All Dogs Go to He...|Adventure|Animati...|           11|
|55442|   Persepolis (2007)|     Animation|Drama|           11|
|52287|Meet the Robinson...|Action|Adventure|...|           11|
|  709|Oliver & Company ...|Adventure|Animati...|           11|
|36708|Family Guy Presen...|Adventure|Animati...|           12|
+-----+--------------------+--------------------+-------------+
only showing top 10 rows

Romance
+-----+--------------------+--------------------+-----------

In [10]:
big_data_analyzer.print_losers_of_films_by_ratings_count()

Animation
+------+--------------------+--------------------+-------------+
|    Id|               Title|              Genres|Ratings count|
+------+--------------------+--------------------+-------------+
|119305|The Nutcracker Pr...|Adventure|Animati...|           11|
| 74271|Space Pirate Capt...|Action|Adventure|...|           11|
|140707|Curious George 2:...|  Animation|Children|           11|
|173293|The Mad Scientist...|    Action|Animation|           11|
|184405| Garden Party (2017)|           Animation|           11|
|107061|Urusei Yatsura Mo...|Animation|Comedy|...|           11|
|184289|Bilal: A New Bree...|Action|Adventure|...|           11|
|131132|Kleines Arschloch...|    Animation|Comedy|           11|
|113254|       Zarafa (2012)|           Animation|           11|
|145192|Alpine Climbers (...|           Animation|           11|
+------+--------------------+--------------------+-------------+
only showing top 10 rows

Romance
+------+--------------------+-----------------

### 4. Выведите первые 10 фильмов с наибольшим средним рейтингом при количестве рейтингов больше 10 для каждого жанра в соотвествии с вариантом

In [11]:
small_data_analyzer.print_top_of_films_by_rating_avg()

Animation
+-----+--------------------+--------------------+-----------------+-------------+
|   Id|               Title|              Genres|       Rating avg|Ratings count|
+-----+--------------------+--------------------+-----------------+-------------+
| 3429|Creature Comforts...|    Animation|Comedy|             4.25|           12|
|55442|   Persepolis (2007)|     Animation|Drama|4.181818181818182|           11|
| 5690|Grave of the Fire...| Animation|Drama|War|          4.15625|           16|
| 5618|Spirited Away (Se...|Adventure|Animati...|4.155172413793103|           87|
|  741|Ghost in the Shel...|    Animation|Sci-Fi|4.148148148148148|           27|
| 3213|Batman: Mask of t...|  Animation|Children|4.115384615384615|           13|
|78499|  Toy Story 3 (2010)|Adventure|Animati...|4.109090909090909|           55|
|  720|Wallace & Gromit:...|Adventure|Animati...|4.092592592592593|           27|
| 1223|Grand Day Out wit...|Adventure|Animati...|4.089285714285714|           28|
|72226

In [12]:
big_data_analyzer.print_top_of_films_by_rating_avg()

Animation
+------+--------------------+--------------------+------------------+-------------+
|    Id|               Title|              Genres|        Rating avg|Ratings count|
+------+--------------------+--------------------+------------------+-------------+
|172577|Last Year's Snow ...|Animation|Childre...| 4.261904761904762|          126|
|163809|Over the Garden W...|Adventure|Animati...| 4.244031830238727|          377|
|  5618|Spirited Away (Se...|Adventure|Animati...|4.2076678004047015|        23227|
|170777|There Once Was a ...|Animation|Childre...|       4.169921875|          256|
|138835|Return to Treasur...|Adventure|Animati...| 4.161764705882353|           68|
|  5971|My Neighbor Totor...|Animation|Childre...| 4.148407904167093|         9767|
|160718|        Piper (2016)|           Animation|4.1392657621707905|         1253|
|179173|Rabbit of Seville...|    Animation|Comedy| 4.133333333333334|           30|
|  1148|Wallace & Gromit:...|Animation|Childre...|4.12753071465651

### 5. Выведите первые 10 фильмов с наименьшим средним рейтингом при количестве рейтингов больше 10 для каждого жанра в соотвествии с вариантом

In [13]:
small_data_analyzer.print_losers_of_films_by_rating_avg()

Animation
+-----+--------------------+--------------------+------------------+-------------+
|   Id|               Title|              Genres|        Rating avg|Ratings count|
+-----+--------------------+--------------------+------------------+-------------+
| 8907|   Shark Tale (2004)|Animation|Childre...|2.3461538461538463|           13|
|69644|Ice Age: Dawn of ...|Action|Adventure|...| 2.607142857142857|           14|
|49274|   Happy Feet (2006)|Adventure|Animati...|2.6818181818181817|           11|
| 2123|All Dogs Go to He...|Animation|Childre...|               2.7|           15|
|  673|    Space Jam (1996)|Adventure|Animati...| 2.707547169811321|           53|
| 1030|Pete's Dragon (1977)|Adventure|Animati...|2.7666666666666666|           15|
| 1920|Small Soldiers (1...|Animation|Childre...|2.8333333333333335|           18|
| 1405|Beavis and Butt-H...|Adventure|Animati...| 2.935483870967742|           31|
|  239|Goofy Movie, A (1...|Animation|Childre...|               3.0|         

In [15]:
big_data_analyzer.print_losers_of_films_by_rating_avg()

Animation
+------+--------------------+--------------------+------------------+-------------+
|    Id|               Title|              Genres|        Rating avg|Ratings count|
+------+--------------------+--------------------+------------------+-------------+
|120222|   Foodfight! (2012)|Action|Animation|...|               0.8|           20|
|145096|Barbie & Her Sist...|           Animation| 1.108695652173913|           69|
|  6371|Pokémon Heroes (2...|  Animation|Children| 1.378238341968912|          386|
|151313|Norm of the North...|Adventure|Animati...|1.4142857142857144|           35|
|  5672|Pokemon 4 Ever (a...|Adventure|Animati...|1.4734042553191489|          564|
|  8811|    Yu-Gi-Oh! (2004)|Action|Adventure|...| 1.648972602739726|          292|
|175475|The Emoji Movie (...|Animation|Childre...|             1.755|          100|
|  4241|Pokémon 3: The Mo...|  Animation|Children|1.7554945054945055|          728|
|133131|Barbie Diaries (2...|  Animation|Children|1.83333333333333