In [1]:
from __future__ import print_function, division
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession

In [6]:
# 启动 Spark （如果你已经启动就不需要）
spark = SparkSession.builder.master("local[2]") \
   .appName("test") \
   .getOrCreate()

sc = spark.sparkContext

## 讀取 csv file

In [7]:
!ls -lh ../data/

total 196M
-rw-r--r--. 1 root root  188 Nov 10  2017 json_example.json
-rw-r--r--. 1 root root 4.5K Dec 27  2017 kmeans_data.txt
-rw-r--r--. 1 root root 196M Nov  5 16:28 NASA_access_log_Jul95
-rw-r--r--. 1 root root  11K Nov 10  2017 NASA_access_log_Jul95_100
-rw-r--r--. 1 root root  561 Nov 10  2017 ratings.csv
-rw-r--r--. 1 root root 103K Dec 27  2017 sample_libsvm_data.txt
-rw-r--r--. 1 root root 2.7K Nov 10  2017 shakespear.txt
-rw-r--r--. 1 root root  28K Dec 27  2017 titanic_test.csv
-rw-r--r--. 1 root root  60K Dec 27  2017 titanic_train.csv


In [8]:
!head ../data/ratings.csv

userid,movieid,rating,ts
3,6539,5,1133571238
3,7153,4,1133571171
3,7155,3.5,1164885564
3,8529,4,1136075616
3,8533,4.5,1136418593
3,8783,5,1136075857
3,27821,4.5,1136418616
3,33750,3.5,1164885688
3,33750,3.5,1164887688


In [18]:
df_csv = spark.read.csv("../data/ratings.csv",header=True)

In [19]:
df_csv.show()

+------+-------+------+----------+
|userid|movieid|rating|        ts|
+------+-------+------+----------+
|     3|   6539|     5|1133571238|
|     3|   7153|     4|1133571171|
|     3|   7155|   3.5|1164885564|
|     3|   8529|     4|1136075616|
|     3|   8533|   4.5|1136418593|
|     3|   8783|     5|1136075857|
|     3|  27821|   4.5|1136418616|
|     3|  33750|   3.5|1164885688|
|     3|  33750|   3.5|1164887688|
|     3|    344|  null| 844416742|
|     4|     21|     3| 844416980|
|     4|     34|     5| 844416936|
|     4|     39|     3| 844417037|
|     4|    110|     5| 844416866|
|     4|    150|     5| 844416656|
|     4|    153|     5| 844416699|
|     4|    161|     5| 844416835|
|     4|    165|     5| 844416699|
|     4|    208|     3| 844416866|
|     4|    231|     1| 844416742|
+------+-------+------+----------+
only showing top 20 rows



In [20]:
df_csv.printSchema()

root
 |-- userid: string (nullable = true)
 |-- movieid: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- ts: string (nullable = true)



In [21]:
df_csv.describe().show()

+-------+------------------+-----------------+------------------+-------------------+
|summary|            userid|          movieid|            rating|                 ts|
+-------+------------------+-----------------+------------------+-------------------+
|  count|                29|               28|                27|                 28|
|   mean|3.6551724137931036|5217.678571428572| 4.018518518518518|9.410967772142857E8|
| stddev|0.4837252813149749|9923.353021491952|1.1135656648929557|1.432772681721757E8|
|    min|                 3|              110|                 1|         1133571171|
|    max|                 4|             8783|                 5|          844417070|
+-------+------------------+-----------------+------------------+-------------------+



## 讀取 Json

In [22]:
!head ../data/json_example.json

{"userid": '1', "rating": 4, "movieid": '001'}
{"userid": '1', "rating": 3, "movieid": '002'}
{"userid": '2', "movieid": '001', "rating": 4}
{"userid": '2', "movieid": '003', "rating": 2}


In [23]:
df_json = spark.read.json("../data/json_example.json")

In [24]:
df_json.show()

+-------+------+------+
|movieid|rating|userid|
+-------+------+------+
|    001|     4|     1|
|    002|     3|     1|
|    001|     4|     2|
|    003|     2|     2|
+-------+------+------+



In [25]:
df_json.describe().show()

+-------+------------------+------------------+------------------+
|summary|           movieid|            rating|            userid|
+-------+------------------+------------------+------------------+
|  count|                 4|                 4|                 4|
|   mean|              1.75|              3.25|               1.5|
| stddev|0.9574271077563381|0.9574271077563381|0.5773502691896257|
|    min|               001|                 2|                 1|
|    max|               003|                 4|                 2|
+-------+------------------+------------------+------------------+



In [26]:
df_json.printSchema()

root
 |-- movieid: string (nullable = true)
 |-- rating: long (nullable = true)
 |-- userid: string (nullable = true)



## 讀取 Parquet

In [27]:
!ls -lh ../data/ratings.parquet

ls: cannot access ../data/ratings.parquet/: No such file or directory


In [28]:
!head ../data/ratings.parquet/part-00003-1e0c22a0-279b-437f-85c0-4009c4b4e16f-c000.snappy.parquet

head: cannot open ‘../data/ratings.parquet/part-00003-1e0c22a0-279b-437f-85c0-4009c4b4e16f-c000.snappy.parquet’ for reading: No such file or directory


In [29]:
df_parquet = spark.read.parquet("../data/ratings.parquet")

AnalysisException: u'Path does not exist: file:/root/note/data/ratings.parquet;'

In [94]:
df_parquet.show()

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|     1|      2|   3.5|1112486027|
|     1|     29|   3.5|1112484676|
|     1|     32|   3.5|1112484819|
|     1|     47|   3.5|1112484727|
|     1|     50|   3.5|1112484580|
|     1|    112|   3.5|1094785740|
|     1|    151|   4.0|1094785734|
|     1|    223|   4.0|1112485573|
|     1|    253|   4.0|1112484940|
|     1|    260|   4.0|1112484826|
|     1|    293|   4.0|1112484703|
|     1|    296|   4.0|1112484767|
|     1|    318|   4.0|1112484798|
|     1|    337|   3.5|1094785709|
|     1|    367|   3.5|1112485980|
|     1|    541|   4.0|1112484603|
|     1|    589|   3.5|1112485557|
|     1|    593|   3.5|1112484661|
|     1|    653|   3.0|1094785691|
|     1|    919|   3.5|1094785621|
+------+-------+------+----------+
only showing top 20 rows



In [95]:
df_parquet.printSchema()

root
 |-- userId: string (nullable = true)
 |-- movieId: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- timestamp: string (nullable = true)



In [96]:
df_parquet.describe().show()

+-------+-----------------+------------------+------------------+--------------------+
|summary|           userId|           movieId|            rating|           timestamp|
+-------+-----------------+------------------+------------------+--------------------+
|  count|         20000263|          20000263|          20000263|            20000263|
|   mean|69045.87258292554| 9041.567330339605|3.5255285642993797|1.1009179216771033E9|
| stddev| 40038.6266531599|19789.477445413315| 1.051988919294247|1.6216942478273004E8|
|    min|                1|                 1|               0.5|          1000000065|
|    max|            99999|             99999|               5.0|           999999978|
+-------+-----------------+------------------+------------------+--------------------+

