In [1]:
from __future__ import print_function, division
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession

In [2]:
# Start Spark (not needed if a session is already running).
# Local master with 2 worker threads, app name "test", Hive support enabled.
spark = (
    SparkSession.builder
    .master("local[2]")
    .appName("test")
    .enableHiveSupport()
    .getOrCreate()
)

# Handle to the underlying SparkContext of the session.
sc = spark.sparkContext

## save as csv format

In [1]:
# Load the ratings CSV, taking column names from its first line.
# No schema is supplied or inferred, so every column is read as a string
# (visible in the JSON output further down, where all values are quoted).
df_csv = spark.read.option("header", True).csv("../data/ratings.csv")

In [2]:
# Preview the first 20 rows (the defaults of show(), spelled out).
df_csv.show(n=20, truncate=True)

+------+-------+------+----------+
|userid|movieid|rating|        ts|
+------+-------+------+----------+
|     3|   6539|     5|1133571238|
|     3|   7153|     4|1133571171|
|     3|   7155|   3.5|1164885564|
|     3|   8529|     4|1136075616|
|     3|   8533|   4.5|1136418593|
|     3|   8783|     5|1136075857|
|     3|  27821|   4.5|1136418616|
|     3|  33750|   3.5|1164885688|
|     3|  33750|   3.5|1164887688|
|     3|    344|  null| 844416742|
|     4|     21|     3| 844416980|
|     4|     34|     5| 844416936|
|     4|     39|     3| 844417037|
|     4|    110|     5| 844416866|
|     4|    150|     5| 844416656|
|     4|    153|     5| 844416699|
|     4|    161|     5| 844416835|
|     4|    165|     5| 844416699|
|     4|    208|     3| 844416866|
|     4|    231|     1| 844416742|
+------+-------+------+----------+
only showing top 20 rows



In [5]:
# Save the DataFrame as Parquet.
# mode='overwrite' replaces any previous output so the cell can be re-run;
# with the default save mode the write raises once the target path exists.
df_csv.write.parquet('../output/ratings_v1.parquet', mode='overwrite')

In [6]:
# Write the DataFrame as CSV with a header row, replacing any existing output.
# Other save modes include "error" and "append".
df_csv.write.mode('overwrite').option("header", True).csv("../output/ratings_1.csv")

In [7]:
# List the CSV output with sizes: the target path is a directory holding a part file and a _SUCCESS marker.
!ls -lh ../output/ratings_1.csv/

total 4.0K
-rw-r--r--. 1 root root 561 Nov  9 15:39 part-00000-18cd677a-f297-48e9-ae27-ec64786b4802-c000.csv
-rw-r--r--. 1 root root   0 Nov  9 15:39 _SUCCESS


In [8]:
# Same directory, listed by file name only.
!ls ../output/ratings_1.csv

part-00000-18cd677a-f297-48e9-ae27-ec64786b4802-c000.csv  _SUCCESS


In [10]:
!head ../output/ratings_1.csv/part-00000-18cd677a-f297-48e9-ae27-ec64786b4802-c000.csv 

userid,movieid,rating,ts
3,6539,5,1133571238
3,7153,4,1133571171
3,7155,3.5,1164885564
3,8529,4,1136075616
3,8533,4.5,1136418593
3,8783,5,1136075857
3,27821,4.5,1136418616
3,33750,3.5,1164885688
3,33750,3.5,1164887688


In [11]:
# coalesce(1) collapses the DataFrame into a single partition, so the write
# produces exactly one part file in the output directory.
# mode='overwrite' keeps the cell re-runnable; with the default save mode the
# write raises once the directory exists.
# The directory name (spelled "coalease") is kept as-is because the next cell lists that exact path.
df_csv.coalesce(1).write.csv("../output/ratings_coalease_1.csv", header=True, mode='overwrite')

In [12]:
# List the files produced by the coalesce(1) write, with sizes.
!ls -lh ../output/ratings_coalease_1.csv/

total 4.0K
-rw-r--r--. 1 root root 561 Nov  9 15:41 part-00000-98a940bf-efbc-4d4f-96d9-1c19b155d42d-c000.csv
-rw-r--r--. 1 root root   0 Nov  9 15:41 _SUCCESS


## save as json

In [13]:
# Write the DataFrame as JSON (one object per line in the part file),
# replacing any existing output at this path.
df_csv.write.mode('overwrite').json("../output/ratings.json")

In [14]:
# List the files in the JSON output directory.
!ls ../output/ratings.json/

part-00000-56b79bcb-79c1-48a2-a02e-0f797f9b46b6-c000.json  _SUCCESS


In [18]:
!head ../output/ratings.json/part-00000-56b79bcb-79c1-48a2-a02e-0f797f9b46b6-c000.json

{"userid":"3","movieid":"6539","rating":"5","ts":"1133571238"}
{"userid":"3","movieid":"7153","rating":"4","ts":"1133571171"}
{"userid":"3","movieid":"7155","rating":"3.5","ts":"1164885564"}
{"userid":"3","movieid":"8529","rating":"4","ts":"1136075616"}
{"userid":"3","movieid":"8533","rating":"4.5","ts":"1136418593"}
{"userid":"3","movieid":"8783","rating":"5","ts":"1136075857"}
{"userid":"3","movieid":"27821","rating":"4.5","ts":"1136418616"}
{"userid":"3","movieid":"33750","rating":"3.5","ts":"1164885688"}
{"userid":"3","movieid":"33750","rating":"3.5","ts":"1164887688"}
{"userid":"3","movieid":"344","ts":"844416742"}


In [19]:
# Show file sizes for the JSON output.
!ls -lh ../output/ratings.json/

total 4.0K
-rw-r--r--. 1 root root 1.7K Nov  9 15:47 part-00000-56b79bcb-79c1-48a2-a02e-0f797f9b46b6-c000.json
-rw-r--r--. 1 root root    0 Nov  9 15:47 _SUCCESS


In [21]:
# Show file sizes for the earlier CSV output again (presumably to compare with the JSON size above).
!ls -lh ../output/ratings_1.csv/

total 4.0K
-rw-r--r--. 1 root root 561 Nov  9 15:39 part-00000-18cd677a-f297-48e9-ae27-ec64786b4802-c000.csv
-rw-r--r--. 1 root root   0 Nov  9 15:39 _SUCCESS


## Save as Parquet (recommended)

In [22]:
# Write the DataFrame as Parquet, replacing any existing output at this path.
df_csv.write.mode('overwrite').parquet("../output/ratings.parquet")

In [23]:
# Show file sizes for the Parquet output.
!ls -lh ../output/ratings.parquet/

total 4.0K
-rw-r--r--. 1 root root 1.4K Nov  9 15:49 part-00000-b14d0cc3-970f-4c4a-b843-cb82dc183dee-c000.snappy.parquet
-rw-r--r--. 1 root root    0 Nov  9 15:49 _SUCCESS


## Compression
Codec names accepted by the `compression` option when writing CSV: bzip2, gzip, lz4, snappy and deflate.

In [24]:
# Write a gzip-compressed CSV with a header row, replacing any existing output.
(
    df_csv.write
    .mode('overwrite')
    .option("header", True)
    .option("compression", "gzip")
    .csv("../output/ratings_gzip.csv")
)

In [25]:
# Show file sizes for the gzip-compressed CSV output.
!ls -lh ../output/ratings_gzip.csv

total 4.0K
-rw-r--r--. 1 root root 243 Nov  9 15:49 part-00000-055ea04d-6831-4a4e-b676-7bd165a5ed87-c000.csv.gz
-rw-r--r--. 1 root root   0 Nov  9 15:49 _SUCCESS


## Save as a table

In [26]:
# List the tables currently registered in the catalog (before saving anything).
(
    spark
    .sql("show tables")
    .show()
)

+--------+---------+-----------+
|database|tableName|isTemporary|
+--------+---------+-----------+
+--------+---------+-----------+



In [27]:
# Persist the DataFrame as a table named ratings_csv.
# mode='overwrite' replaces the table if it already exists, so the cell can be
# re-run; with the default save mode the call raises once the table exists.
df_csv.write.saveAsTable('ratings_csv', mode='overwrite')

In [28]:
# List the catalog tables again; ratings_csv should now appear.
(
    spark
    .sql("show tables")
    .show()
)

+--------+-----------+-----------+
|database|  tableName|isTemporary|
+--------+-----------+-----------+
| default|ratings_csv|      false|
+--------+-----------+-----------+



In [29]:
# Query the saved table through Spark SQL and preview the rows.
# Author's note: Spark is connected to Hive, so Hive resources can be accessed
# through Spark. This speeds Hive up, because classic Hive runs on MapReduce,
# which goes to disk on every iteration.
(
    spark
    .sql("select * from ratings_csv")
    .show()
)


+------+-------+------+----------+
|userid|movieid|rating|        ts|
+------+-------+------+----------+
|     3|   6539|     5|1133571238|
|     3|   7153|     4|1133571171|
|     3|   7155|   3.5|1164885564|
|     3|   8529|     4|1136075616|
|     3|   8533|   4.5|1136418593|
|     3|   8783|     5|1136075857|
|     3|  27821|   4.5|1136418616|
|     3|  33750|   3.5|1164885688|
|     3|  33750|   3.5|1164887688|
|     3|    344|  null| 844416742|
|     4|     21|     3| 844416980|
|     4|     34|     5| 844416936|
|     4|     39|     3| 844417037|
|     4|    110|     5| 844416866|
|     4|    150|     5| 844416656|
|     4|    153|     5| 844416699|
|     4|    161|     5| 844416835|
|     4|    165|     5| 844416699|
|     4|    208|     3| 844416866|
|     4|    231|     1| 844416742|
+------+-------+------+----------+
only showing top 20 rows

