In [1]:
import pandas as pd
import numpy as np
import datetime

## Первая часть практики Python

### 1. Загрузка данных и создание датафреймов ratings и movies

In [2]:
# список имен столбцов датафрейма ratings
ratings_names = ['user_id', 'movie_id', 'rating', 'timestamp']

In [3]:
ratings = pd.read_csv('datas_abd/u.data.csv',
                      names = ratings_names,
                      sep = '\t')

In [4]:
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [5]:
# список имен столбцов датафрейма movies
movies_names = ['movie_id', 'movie_title', 'release_date', 'video_release_date',
                'IMDb_URL', 'unknown', 'Action', 'Adventure', 'Animation', 'Children',
                'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film_Noir',
                'Horror', 'Musical', 'Mystery', 'Romance', 'Sci_Fi', 'Thriller', 'War', 'Western']

In [6]:
movies = pd.read_csv('datas_abd/u.item.csv',
                     names = movies_names,
                     index_col = 0,
                     sep = '|',
                     encoding = 'latin-1'
                    )

In [7]:
movies.head()

Unnamed: 0_level_0,movie_title,release_date,video_release_date,IMDb_URL,unknown,Action,Adventure,Animation,Children,Comedy,...,Fantasy,Film_Noir,Horror,Musical,Mystery,Romance,Sci_Fi,Thriller,War,Western
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


### 2. Ищем id пользователя, поставившего больше всего оценок

In [8]:
ratings.groupby('user_id').count().sort_values('movie_id', ascending=False).head(1)

Unnamed: 0_level_0,movie_id,rating,timestamp
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
405,737,737,737


Пользователь с **id = 405** поставил 737 оценок, что является наибольшим значением среди всех пользователей.

### 3. Создадим датафрейм ratings_405 с фильмами, которые оценил пользователь с id = 405

In [9]:
ratings_405 = ratings.loc[ratings['user_id'] == 405].sort_values('movie_id')
ratings_405

Unnamed: 0,user_id,movie_id,rating,timestamp
25105,405,2,1,885547953
68151,405,4,4,885547314
33738,405,5,4,885545070
44158,405,8,4,885545015
13347,405,11,4,885545263
...,...,...,...,...
26243,405,1588,1,885549789
22974,405,1589,1,885549745
26303,405,1590,1,885549789
76286,405,1591,1,885549943


Получили 737 фильмов, которые оценил пользователь с di = 405

### 4. Добавляем к датафрейму новые столбцы

#### 4.1 Добавляем столбцы с жанрами

Для этого обогащаем датафрейм ratings_405 данными из датафрейма movies путем объединения таблиц

In [10]:
joined_ratings_405 = ratings_405.merge(movies, on='movie_id', how='left')
joined_ratings_405.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,movie_title,release_date,video_release_date,IMDb_URL,unknown,Action,...,Fantasy,Film_Noir,Horror,Musical,Mystery,Romance,Sci_Fi,Thriller,War,Western
0,405,2,1,885547953,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,...,0,0,0,0,0,0,0,1,0,0
1,405,4,4,885547314,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,...,0,0,0,0,0,0,0,0,0,0
2,405,5,4,885545070,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,...,0,0,0,0,0,0,0,1,0,0
3,405,8,4,885545015,Babe (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Babe%20(1995),0,0,...,0,0,0,0,0,0,0,0,0,0
4,405,11,4,885545263,Seven (Se7en) (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Se7en%20(1995),0,0,...,0,0,0,0,0,0,0,1,0,0


#### 4.2 Добавляем столбцы с общим количеством оценок от всех пользователей на фильм и суммарной оценкой от всех пользователей

Для этого сгруппируем датафрейм rating по признаку movie_id и посчитаем количество оценок и суммарную оценку для каждого фильма

In [11]:
grouped_ratings = ratings.groupby('movie_id').agg({'rating': ['count', 'sum']})
grouped_ratings.head()

Unnamed: 0_level_0,rating,rating
Unnamed: 0_level_1,count,sum
movie_id,Unnamed: 1_level_2,Unnamed: 2_level_2
1,452,1753
2,131,420
3,90,273
4,209,742
5,86,284


Обогатим датафрейм joined_ratings_405 значениями датафрейма grouped_ratings

In [12]:
result_df = joined_ratings_405.merge(grouped_ratings, on='movie_id', how='left')
result_df

  result_df = joined_ratings_405.merge(grouped_ratings, on='movie_id', how='left')


Unnamed: 0,user_id,movie_id,rating,timestamp,movie_title,release_date,video_release_date,IMDb_URL,unknown,Action,...,Horror,Musical,Mystery,Romance,Sci_Fi,Thriller,War,Western,"(rating, count)","(rating, sum)"
0,405,2,1,885547953,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,...,0,0,0,0,0,1,0,0,131,420
1,405,4,4,885547314,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,...,0,0,0,0,0,0,0,0,209,742
2,405,5,4,885545070,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,...,0,0,0,0,0,1,0,0,86,284
3,405,8,4,885545015,Babe (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Babe%20(1995),0,0,...,0,0,0,0,0,0,0,0,219,875
4,405,11,4,885545263,Seven (Se7en) (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Se7en%20(1995),0,0,...,0,0,0,0,0,1,0,0,236,908
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
732,405,1588,1,885549789,Salut cousin! (1996),21-Feb-1997,,http://us.imdb.com/M/title-exact?Salut%20cousi...,0,0,...,0,0,0,0,0,0,0,0,2,4
733,405,1589,1,885549745,Schizopolis (1996),23-May-1997,,http://us.imdb.com/Title?Schizopolis+(1996),0,0,...,0,0,0,0,0,0,0,0,4,12
734,405,1590,1,885549789,"To Have, or Not (1995)",06-Jun-1997,,http://us.imdb.com/M/title-exact?En%20avoir%20...,0,0,...,0,0,0,0,0,0,0,0,2,4
735,405,1591,1,885549943,Duoluo tianshi (1995),21-Jan-1998,,http://us.imdb.com/M/title-exact?imdb-title-11...,0,0,...,0,0,0,0,0,0,0,0,6,19


Добавим столбец с годом выхода фильма

In [13]:
result_df['release_date'] = result_df['release_date'].astype("datetime64[ns]")

In [14]:
result_df["release_year"] = result_df['release_date'].dt.to_period("Y")

In [15]:
result_df['release_year_int'] = result_df['release_date'].dt.strftime('%Y')

In [16]:
result_df.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,movie_title,release_date,video_release_date,IMDb_URL,unknown,Action,...,Mystery,Romance,Sci_Fi,Thriller,War,Western,"(rating, count)","(rating, sum)",release_year,release_year_int
0,405,2,1,885547953,GoldenEye (1995),1995-01-01,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,...,0,0,0,1,0,0,131,420,1995,1995
1,405,4,4,885547314,Get Shorty (1995),1995-01-01,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,...,0,0,0,0,0,0,209,742,1995,1995
2,405,5,4,885545070,Copycat (1995),1995-01-01,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,...,0,0,0,1,0,0,86,284,1995,1995
3,405,8,4,885545015,Babe (1995),1995-01-01,,http://us.imdb.com/M/title-exact?Babe%20(1995),0,0,...,0,0,0,0,0,0,219,875,1995,1995
4,405,11,4,885545263,Seven (Se7en) (1995),1995-01-01,,http://us.imdb.com/M/title-exact?Se7en%20(1995),0,0,...,0,0,0,1,0,0,236,908,1995,1995


Оставим в датафрейме необходимые для построения модели признаки:
- Год выхода
- Жанры
- Общее количество оценок
- Суммарную оценку

In [17]:
result_df.columns

Index([           'user_id',           'movie_id',             'rating',
                'timestamp',        'movie_title',       'release_date',
       'video_release_date',           'IMDb_URL',            'unknown',
                   'Action',          'Adventure',          'Animation',
                 'Children',             'Comedy',              'Crime',
              'Documentary',              'Drama',            'Fantasy',
                'Film_Noir',             'Horror',            'Musical',
                  'Mystery',            'Romance',             'Sci_Fi',
                 'Thriller',                'War',            'Western',
        ('rating', 'count'),    ('rating', 'sum'),       'release_year',
         'release_year_int'],
      dtype='object')

In [18]:
result_df = result_df[['movie_id', 'release_year_int', 'rating', ('rating', 'count'), ('rating', 'sum'),
                       'unknown', 'Action', 'Adventure', 'Animation',
                       'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
                       'Film_Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci_Fi',
                       'Thriller', 'War', 'Western']]

In [19]:
result_df = result_df.rename(columns = {('rating', 'count'):'rating_count', ('rating', 'sum'):'rating_sum'})

Посмотрим, что в результате получилось:

In [20]:
result_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 737 entries, 0 to 736
Data columns (total 24 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   movie_id          737 non-null    int64 
 1   release_year_int  737 non-null    object
 2   rating            737 non-null    int64 
 3   rating_count      737 non-null    int64 
 4   rating_sum        737 non-null    int64 
 5   unknown           737 non-null    int64 
 6   Action            737 non-null    int64 
 7   Adventure         737 non-null    int64 
 8   Animation         737 non-null    int64 
 9   Children          737 non-null    int64 
 10  Comedy            737 non-null    int64 
 11  Crime             737 non-null    int64 
 12  Documentary       737 non-null    int64 
 13  Drama             737 non-null    int64 
 14  Fantasy           737 non-null    int64 
 15  Film_Noir         737 non-null    int64 
 16  Horror            737 non-null    int64 
 17  Musical         

Изменим тип значений в поле 'release_year_int'

In [21]:
result_df['release_year_int'] = result_df['release_year_int'].astype(int)

In [22]:
result_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 737 entries, 0 to 736
Data columns (total 24 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   movie_id          737 non-null    int64
 1   release_year_int  737 non-null    int64
 2   rating            737 non-null    int64
 3   rating_count      737 non-null    int64
 4   rating_sum        737 non-null    int64
 5   unknown           737 non-null    int64
 6   Action            737 non-null    int64
 7   Adventure         737 non-null    int64
 8   Animation         737 non-null    int64
 9   Children          737 non-null    int64
 10  Comedy            737 non-null    int64
 11  Crime             737 non-null    int64
 12  Documentary       737 non-null    int64
 13  Drama             737 non-null    int64
 14  Fantasy           737 non-null    int64
 15  Film_Noir         737 non-null    int64
 16  Horror            737 non-null    int64
 17  Musical           737 non-null    i

In [23]:
result_df.groupby('rating').agg({'movie_id':'count'})

Unnamed: 0_level_0,movie_id
rating,Unnamed: 1_level_1
1,485
2,73
3,63
4,48
5,68


In [24]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import accuracy_score
import sklearn.metrics as metrics

### 5. Формируем обучающую и тестовую выборки

In [25]:
data = result_df[['release_year_int', 'rating_count', 'rating_sum',
                       'unknown', 'Action', 'Adventure', 'Animation',
                       'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
                       'Film_Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci_Fi',
                       'Thriller', 'War', 'Western']].to_numpy()

In [26]:
data

array([[1995,  131,  420, ...,    1,    0,    0],
       [1995,  209,  742, ...,    0,    0,    0],
       [1995,   86,  284, ...,    1,    0,    0],
       ...,
       [1997,    2,    4, ...,    0,    0,    0],
       [1998,    6,   19, ...,    0,    0,    0],
       [1998,    5,   18, ...,    0,    0,    0]])

In [27]:
data_class = result_df['rating'].to_numpy()

In [28]:
data_class

array([1, 4, 4, 4, 4, 5, 5, 5, 3, 1, 4, 4, 1, 1, 1, 1, 2, 2, 1, 5, 1, 2,
       1, 1, 1, 1, 1, 1, 5, 1, 1, 5, 1, 1, 2, 2, 1, 4, 1, 1, 1, 1, 1, 1,
       3, 5, 1, 5, 5, 1, 4, 3, 1, 3, 5, 2, 3, 1, 2, 5, 1, 3, 4, 1, 4, 1,
       1, 3, 1, 4, 2, 1, 5, 3, 3, 2, 4, 5, 1, 1, 1, 5, 5, 5, 3, 3, 2, 1,
       5, 1, 1, 1, 1, 1, 1, 5, 5, 5, 1, 1, 1, 3, 1, 3, 5, 1, 1, 1, 4, 5,
       5, 1, 1, 2, 4, 5, 4, 1, 5, 1, 4, 2, 1, 2, 1, 4, 1, 5, 3, 1, 1, 5,
       3, 5, 1, 1, 2, 4, 5, 2, 1, 5, 5, 2, 1, 1, 1, 2, 3, 4, 1, 5, 5, 3,
       1, 2, 5, 4, 1, 1, 4, 4, 5, 1, 4, 1, 1, 5, 5, 2, 1, 1, 3, 1, 1, 1,
       2, 1, 1, 5, 1, 4, 1, 2, 1, 1, 1, 3, 1, 3, 1, 4, 2, 1, 5, 4, 3, 1,
       4, 1, 1, 1, 1, 3, 5, 4, 1, 2, 2, 2, 5, 4, 5, 1, 1, 5, 2, 1, 5, 1,
       5, 1, 3, 3, 4, 3, 1, 1, 1, 1, 1, 1, 1, 1, 4, 3, 4, 1, 4, 4, 1, 1,
       5, 5, 3, 3, 2, 1, 1, 1, 1, 4, 3, 1, 1, 4, 3, 3, 2, 1, 1, 2, 1, 1,
       1, 1, 1, 3, 1, 2, 2, 4, 1, 2, 1, 1, 1, 5, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 5, 1,

In [29]:
X_train, X_test, y_train, y_test = train_test_split(data, data_class, random_state=42)

### 6. Обучаем модель

#### 6.1 Модель логистической регрессии

In [30]:
lr = LogisticRegression()

In [31]:
lr.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


#### 6.2 Модель линейной регрессии

In [32]:
lm = LinearRegression()

In [33]:
lm.fit(X_train, y_train)

### 7. Оценка качества модели

#### 7.1 Оцетка качества модели логичтической регрессии

In [34]:
y_pred = lr.predict(X_test)

In [35]:
accuracy_score(y_test, y_pred)

0.6486486486486487

#### 7.2 Оценка качества модели линейной регрессии

In [36]:
# Делаем прогноз целевой переменной y_pred:
y_pred = lm.predict(X_test)

In [37]:
# Рассчитаем коэффициент детерминации
print(f'R2: {metrics.r2_score(y_test, y_pred)}')

R2: 0.3411888950363978


**Вывод:** В наших данных присутствует проблема сбалансированности выборки ('1' составляет более половины оценок пользователя), это могло повлиять на качество построенных моделей.

## Вторая часть практики Python

In [40]:
#!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting py4j==0.10.9.7 (from pyspark)
  Using cached py4j-0.10.9.7-py2.py3-none-any.whl (200 kB)
Building wheels for collected packages: pyspark
  Building wheel for pyspark (pyproject.toml) ... [?25ldone
[?25h  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425344 sha256=35daa974142cee32e088e92e463bc2c3245b85bd0fe345c82a4d0ea517bb34f8
  Stored in directory: /home/andrew/.cache/pip/wheels/38/df/61/8c121f50c3cffd77f8178180dd232d90b3b99d1bd61fb6d6be
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4

In [127]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.functions import sum,count

In [43]:
spark = SparkSession.builder.getOrCreate()

In [44]:
spark

### 8. Загружаем данные в Spark

In [55]:
ratings_df = spark.read.csv('datas_abd/u.data.csv', inferSchema=True, header=False, sep='\t')

In [58]:
ratings_df = ratings_df.toDF('user_id', 'movie_id', 'rating', 'timestamp')

In [60]:
ratings_df.show(5)

+-------+--------+------+---------+
|user_id|movie_id|rating|timestamp|
+-------+--------+------+---------+
|    196|     242|     3|881250949|
|    186|     302|     3|891717742|
|     22|     377|     1|878887116|
|    244|      51|     2|880606923|
|    166|     346|     1|886397596|
+-------+--------+------+---------+
only showing top 5 rows



In [61]:
movies_df = spark.read.csv('datas_abd/u.item.csv', inferSchema=True, header=False, sep='|')

In [65]:
movies_df = movies_df.toDF('movie_id', 'movie_title', 'release_date', 'video_release_date',
                'IMDb_URL', 'unknown', 'Action', 'Adventure', 'Animation', 'Children',
                'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film_Noir',
                'Horror', 'Musical', 'Mystery', 'Romance', 'Sci_Fi', 'Thriller', 'War', 'Western')

In [66]:
movies_df.show(1, vertical=True)

-RECORD 0----------------------------------
 movie_id           | 1                    
 movie_title        | Toy Story (1995)     
 release_date       | 01-Jan-1995          
 video_release_date | null                 
 IMDb_URL           | http://us.imdb.co... 
 unknown            | 0                    
 Action             | 0                    
 Adventure          | 0                    
 Animation          | 1                    
 Children           | 1                    
 Comedy             | 1                    
 Crime              | 0                    
 Documentary        | 0                    
 Drama              | 0                    
 Fantasy            | 0                    
 Film_Noir          | 0                    
 Horror             | 0                    
 Musical            | 0                    
 Mystery            | 0                    
 Romance            | 0                    
 Sci_Fi             | 0                    
 Thriller           | 0         

### 9. Выводим среднюю оценку для каждого фильма

In [83]:
movie_avg = ratings_df.groupBy('movie_id').avg('rating')
movies_titles = movies_df.select('movie_id', 'movie_title')
movie_avg = movie_avg.join(movies_titles, 'movie_id', how='left').orderBy(F.col('avg(rating)').desc())
movie_avg.show()

+--------+-----------------+--------------------+
|movie_id|      avg(rating)|         movie_title|
+--------+-----------------+--------------------+
|    1467|              5.0|Saint of Fort Was...|
|    1653|              5.0|Entertaining Ange...|
|    1500|              5.0|Santa with Muscle...|
|    1201|              5.0|Marlene Dietrich:...|
|    1599|              5.0|Someone Else's Am...|
|    1122|              5.0|They Made Me a Cr...|
|    1189|              5.0|  Prefontaine (1997)|
|    1293|              5.0|     Star Kid (1997)|
|    1536|              5.0|Aiqing wansui (1994)|
|     814|              5.0|Great Day in Harl...|
|    1449|            4.625|Pather Panchali (...|
|    1642|              4.5|Some Mother's Son...|
|    1398|              4.5|         Anna (1996)|
|    1594|              4.5|      Everest (1998)|
|     119|              4.5|Maya Lin: A Stron...|
|     408|4.491071428571429|Close Shave, A (1...|
|     318|4.466442953020135|Schindler's List ...|


### 10. Считаем сруднюю оценку для каждого жанра

In [113]:
joined_df = movies_df.join(ratings_df, 'movie_id', how='left')

In [111]:
genres = ['unknown', 'Action', 'Adventure', 'Animation', 'Children',
          'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film_Noir',
          'Horror', 'Musical', 'Mystery', 'Romance', 'Sci_Fi', 'Thriller', 'War', 'Western']

In [112]:
for i in genres:
    joined_df.where((joined_df[i] == 1)).groupBy(i).avg('rating').show()

+-------+-----------+
|unknown|avg(rating)|
+-------+-----------+
|      1|        3.2|
+-------+-----------+

+------+-----------------+
|Action|      avg(rating)|
+------+-----------------+
|     1|3.480245417953027|
+------+-----------------+

+---------+-----------------+
|Adventure|      avg(rating)|
+---------+-----------------+
|        1|3.503526503308369|
+---------+-----------------+

+---------+------------------+
|Animation|       avg(rating)|
+---------+------------------+
|        1|3.5766990291262135|
+---------+------------------+

+--------+------------------+
|Children|       avg(rating)|
+--------+------------------+
|       1|3.3532442216652742|
+--------+------------------+

+------+------------------+
|Comedy|       avg(rating)|
+------+------------------+
|     1|3.3940734781442745|
+------+------------------+

+-----+------------------+
|Crime|       avg(rating)|
+-----+------------------+
|    1|3.6322780881440098|
+-----+------------------+

+-----------+-----

### 11. Датафреймы top_5 самыx популярныx и самыx непопулярных фильмов (по сумме баллов и по количеству оценок)

In [138]:
top5_max_rating_sum = joined_df.groupBy('movie_title').agg(sum('rating').alias('rating_sum'),\
                                    count('rating').alias('rating_count'))\
                                .orderBy(F.col('rating_sum').desc())
top5_max_rating_sum.show(5)

+--------------------+----------+------------+
|         movie_title|rating_sum|rating_count|
+--------------------+----------+------------+
|    Star Wars (1977)|      2541|         583|
|        Fargo (1996)|      2111|         508|
|Return of the Jed...|      2032|         507|
|      Contact (1997)|      1936|         509|
|Raiders of the Lo...|      1786|         420|
+--------------------+----------+------------+
only showing top 5 rows



In [140]:
top5_max_rating_count = joined_df.groupBy('movie_title').agg(sum('rating').alias('rating_sum'),\
                                    count('rating').alias('rating_count'))\
                                .orderBy(F.col('rating_count').desc())
top5_max_rating_count.show(5)

+--------------------+----------+------------+
|         movie_title|rating_sum|rating_count|
+--------------------+----------+------------+
|    Star Wars (1977)|      2541|         583|
|      Contact (1997)|      1936|         509|
|        Fargo (1996)|      2111|         508|
|Return of the Jed...|      2032|         507|
|    Liar Liar (1997)|      1531|         485|
+--------------------+----------+------------+
only showing top 5 rows



In [141]:
top5_min_rating_sum = joined_df.groupBy('movie_title').agg(sum('rating').alias('rating_sum'),\
                                    count('rating').alias('rating_count'))\
                                .orderBy('rating_sum')
top5_min_rating_sum.show(5)

+--------------------+----------+------------+
|         movie_title|rating_sum|rating_count|
+--------------------+----------+------------+
|Leopard Son, The ...|         1|           1|
|Girl in the Cadil...|         1|           1|
|Lashou shentan (1...|         1|           1|
|Vie est belle, La...|         1|           1|
|Tigrero: A Film T...|         1|           1|
+--------------------+----------+------------+
only showing top 5 rows

