In [60]:
import pandas as pd

from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, split, col, lower

## Задача
С помощью Spark разбейте названия фильмов на отдельные слова и посчитайте, какое слово встречается чаще всего.

## Решение с помощью pandas

In [33]:
df = pd.read_csv('movies.csv')
(
    df.Film
    .apply(lambda x: x.split())
    .explode()
    .str.lower()
    .to_frame()
    .groupby('Film')['Film']
    .count()
    .sort_values(ascending=False)
    .head(1)
)

Film
the    14
Name: Film, dtype: int64

## Решение с помощью spark

In [35]:
spark = SparkSession.builder.appName("hw_9_solution_khin").getOrCreate()

In [39]:
df = spark.read.option("header", "true").csv("movies.csv")

25/01/08 02:05:21 INFO InMemoryFileIndex: It took 1 ms to list leaf files for 1 paths.
25/01/08 02:05:21 INFO InMemoryFileIndex: It took 2 ms to list leaf files for 1 paths.
25/01/08 02:05:21 INFO FileSourceStrategy: Pushed Filters: 
25/01/08 02:05:21 INFO FileSourceStrategy: Post-Scan Filters: (length(trim(value#75, None)) > 0)
25/01/08 02:05:21 INFO MemoryStore: Block broadcast_5 stored as values in memory (estimated size 352.3 KiB, free 364.8 MiB)
25/01/08 02:05:21 INFO MemoryStore: Block broadcast_5_piece0 stored as bytes in memory (estimated size 34.9 KiB, free 364.8 MiB)
25/01/08 02:05:21 INFO BlockManagerInfo: Added broadcast_5_piece0 in memory on 172.20.10.2:65226 (size: 34.9 KiB, free: 366.2 MiB)
25/01/08 02:05:21 INFO SparkContext: Created broadcast 5 from csv at NativeMethodAccessorImpl.java:0
25/01/08 02:05:21 INFO FileSourceScanExec: Planning scan with bin packing, max size: 4199377 bytes, open cost is considered as scanning 4194304 bytes.
25/01/08 02:05:21 INFO SparkConte

In [44]:
df.show()

+--------------------+---------+--------------------+----------------+-------------+-----------------+---------------+----+
|                Film|    Genre|         Lead Studio|Audience score %|Profitability|Rotten Tomatoes %|Worldwide Gross|Year|
+--------------------+---------+--------------------+----------------+-------------+-----------------+---------------+----+
|Zack and Miri Mak...|  Romance|The Weinstein Com...|              70|  1.747541667|               64|        $41.94 |2008|
|     Youth in Revolt|   Comedy|The Weinstein Com...|              52|         1.09|               68|        $19.62 |2010|
|You Will Meet a T...|   Comedy|         Independent|              35|  1.211818182|               43|        $26.66 |2010|
|        When in Rome|   Comedy|              Disney|              44|            0|               15|        $43.04 |2010|
|What Happens in V...|   Comedy|                 Fox|              72|  6.267647029|               28|       $219.37 |2008|
| Water 

25/01/08 02:07:40 INFO FileSourceStrategy: Pushed Filters: 
25/01/08 02:07:40 INFO FileSourceStrategy: Post-Scan Filters: 
25/01/08 02:07:40 INFO MemoryStore: Block broadcast_14 stored as values in memory (estimated size 352.2 KiB, free 365.1 MiB)
25/01/08 02:07:40 INFO MemoryStore: Block broadcast_14_piece0 stored as bytes in memory (estimated size 34.9 KiB, free 365.1 MiB)
25/01/08 02:07:40 INFO BlockManagerInfo: Added broadcast_14_piece0 in memory on 172.20.10.2:65226 (size: 34.9 KiB, free: 366.2 MiB)
25/01/08 02:07:40 INFO SparkContext: Created broadcast 14 from showString at NativeMethodAccessorImpl.java:0
25/01/08 02:07:40 INFO FileSourceScanExec: Planning scan with bin packing, max size: 4199377 bytes, open cost is considered as scanning 4194304 bytes.
25/01/08 02:07:40 INFO SparkContext: Starting job: showString at NativeMethodAccessorImpl.java:0
25/01/08 02:07:40 INFO DAGScheduler: Got job 6 (showString at NativeMethodAccessorImpl.java:0) with 1 output partitions
25/01/08 02:0

In [65]:
words_df = df.select(explode(split(col("Film"), "\\s+")).alias("word"))
words_df = words_df.withColumn("word", lower(col("word")))
word_counts = words_df.groupBy("word").count()
most_common_word = word_counts.orderBy(col("count").desc()).first()

25/01/08 02:14:23 INFO FileSourceStrategy: Pushed Filters: 
25/01/08 02:14:23 INFO FileSourceStrategy: Post-Scan Filters: 
25/01/08 02:14:23 INFO CodeGenerator: Code generated in 25.676417 ms
25/01/08 02:14:23 INFO MemoryStore: Block broadcast_22 stored as values in memory (estimated size 352.2 KiB, free 365.2 MiB)
25/01/08 02:14:23 INFO MemoryStore: Block broadcast_22_piece0 stored as bytes in memory (estimated size 34.9 KiB, free 365.1 MiB)
25/01/08 02:14:23 INFO BlockManagerInfo: Added broadcast_22_piece0 in memory on 172.20.10.2:65226 (size: 34.9 KiB, free: 366.2 MiB)
25/01/08 02:14:23 INFO SparkContext: Created broadcast 22 from first at /var/folders/8x/py78bv316g35sfyx4zxjd9vm0000gn/T/ipykernel_19413/2443885891.py:4
25/01/08 02:14:23 INFO FileSourceScanExec: Planning scan with bin packing, max size: 4199377 bytes, open cost is considered as scanning 4194304 bytes.
25/01/08 02:14:23 INFO DAGScheduler: Registering RDD 55 (first at /var/folders/8x/py78bv316g35sfyx4zxjd9vm0000gn/T/ip

In [66]:
most_common_word

Row(word='the', count=14)