# Settings

## Imports

In [13]:
import numpy as np
import pyspark

from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.types import *

## Disable HDFS safe mode

In [3]:
# Ask HDFS to leave safe mode so that the later `hdfs dfs -rm` / `-put`
# commands are allowed to modify the filesystem.
!hdfs dfsadmin -safemode leave

2023-12-13 19:18:42 WARN  NativeCodeLoader:60 - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Safe mode is OFF


# SparkSession

In [4]:
# Executor sizing for this session: two executors, one core and 1 GiB each.
conf = (
    SparkConf()
    .set("spark.executor.instances", "2")
    .set("spark.executor.cores", "1")
    .set("spark.executor.memory", "1g")
)

In [5]:
# Create (or reuse) the SparkSession on YARN with the executor settings from `conf`.
spark = (
    SparkSession.builder
    .master("yarn")
    .appName("Denisov_spark")
    .config(conf=conf)
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/12/13 19:18:47 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/12/13 19:18:48 WARN Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.


In [7]:
# Echo the session object as the cell output to confirm it was created.
spark

# HDFS work

In [8]:
!hdfs dfs -rm -r ml-latest-small

2023-12-13 19:24:42 WARN  NativeCodeLoader:60 - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Deleted ml-latest-small


In [9]:
# List the HDFS working directory to confirm `ml-latest-small` is gone before re-uploading.
!hdfs dfs -ls .

2023-12-13 19:24:55 WARN  NativeCodeLoader:60 - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Found 1 items
drwxr-xr-x   - root supergroup          0 2023-12-13 19:18 .sparkStaging


In [10]:
# Upload the local `ml-latest-small` directory to the HDFS working directory (`.`).
!hdfs dfs -put ml-latest-small .

2023-12-13 19:25:04 WARN  NativeCodeLoader:60 - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [11]:
# List the HDFS working directory again to confirm the upload created `ml-latest-small`.
!hdfs dfs -ls .

2023-12-13 19:25:10 WARN  NativeCodeLoader:60 - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Found 2 items
drwxr-xr-x   - root supergroup          0 2023-12-13 19:18 .sparkStaging
drwxr-xr-x   - root supergroup          0 2023-12-13 19:25 ml-latest-small


# View datasets

In [12]:
# Show the files that make up the uploaded dataset, with their sizes.
!hdfs dfs -ls ml-latest-small

2023-12-13 19:26:55 WARN  NativeCodeLoader:60 - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Found 5 items
-rw-r--r--   3 root supergroup       8342 2023-12-13 19:25 ml-latest-small/README.txt
-rw-r--r--   3 root supergroup     197979 2023-12-13 19:25 ml-latest-small/links.csv
-rw-r--r--   3 root supergroup     494431 2023-12-13 19:25 ml-latest-small/movies.csv
-rw-r--r--   3 root supergroup    2483723 2023-12-13 19:25 ml-latest-small/ratings.csv
-rw-r--r--   3 root supergroup     118660 2023-12-13 19:25 ml-latest-small/tags.csv


In [14]:
# Explicit schema for ratings.csv: column name paired with its Spark type.
# Declaring it up front means Spark does not have to infer the column types.
ratings_schema = StructType([
    StructField(column_name, column_type)
    for column_name, column_type in (
        ("userId", IntegerType()),
        ("movieId", IntegerType()),
        ("rating", DoubleType()),
        ("timestamp", LongType()),
    )
])

In [15]:
%%time
ratings_df = spark\
    .read\
    .format("csv")\
    .option("header", "True")\
    .schema(ratings_schema)\
    .load("ml-latest-small/ratings.csv")

CPU times: user 3.57 ms, sys: 1.46 ms, total: 5.02 ms
Wall time: 650 ms


In [16]:
# The bare DataFrame repr shows only column names and types, not the data.
ratings_df

DataFrame[userId: int, movieId: int, rating: double, timestamp: bigint]

In [17]:
# Print the schema as a tree, including the nullability of each column.
ratings_df.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: long (nullable = true)



In [18]:
# Preview the first five rows to sanity-check that the CSV parsed as expected.
ratings_df.show(5)

                                                                                

+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|     1|      1|   4.0|964982703|
|     1|      3|   4.0|964981247|
|     1|      6|   4.0|964982224|
|     1|     47|   5.0|964983815|
|     1|     50|   5.0|964982931|
+------+-------+------+---------+
only showing top 5 rows



In [20]:
# Descriptive statistics (count, mean, stddev, min, quartiles, max) for every column.
ratings_df.summary().show()

23/12/13 19:28:05 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
[Stage 1:>                                                          (0 + 1) / 1]

+-------+------------------+----------------+------------------+--------------------+
|summary|            userId|         movieId|            rating|           timestamp|
+-------+------------------+----------------+------------------+--------------------+
|  count|            100836|          100836|            100836|              100836|
|   mean|326.12756356856676|19435.2957177992| 3.501556983616962|1.2059460873684695E9|
| stddev| 182.6184914635004|35530.9871987003|1.0425292390606342|2.1626103599513078E8|
|    min|                 1|               1|               0.5|           828124615|
|    25%|               177|            1199|               3.0|          1018535155|
|    50%|               325|            2991|               3.5|          1186086516|
|    75%|               477|            8092|               4.0|          1435993828|
|    max|               610|          193609|               5.0|          1537799250|
+-------+------------------+----------------+------------------+--------------------+

                                                                                

In [21]:
# Count the rows of the ratings DataFrame; the trailing note records the
# stages/tasks the author observed for this job.
ratings_df.count() # 2 stages, 2 tasks

                                                                                

100836

In [22]:
%%time
tags_df = spark.read.format("csv").option("header", "True").load("ml-latest-small/tags.csv")

CPU times: user 10.4 ms, sys: 1.85 ms, total: 12.3 ms
Wall time: 320 ms


In [23]:
# Count the rows of the tags DataFrame; the trailing note records the
# stages/tasks the author observed for this job.
tags_df.count() # 2 stages, 2 tasks

3683

# Stop the Spark session

In [12]:
# Stop the SparkSession created above now that the notebook is finished with it.
spark.stop()