# Đọc dữ liệu từ HDFS

In [1]:
import json
from functools import reduce
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Create the notebook's SparkSession, or reuse one that already exists.
# The session runs against the "local" master under the application name
# "Tutorial". No extra Spark options are set; add `.config(key, value)`
# pairs to the chain below if tuning is needed (always pass both a key
# and a value).
spark = (
    SparkSession.builder
    .master("local")
    .appName("Tutorial")
    .getOrCreate()
)

In [2]:
def unionAll(*dfs):
    """Combine several DataFrames into one by chaining ``DataFrame.unionAll``.

    The inputs are folded left to right: the first is unioned with the
    second, that result with the third, and so on.

    Args:
        *dfs: The DataFrames to combine, in order. At least one must be
            given; ``reduce`` raises ``TypeError`` on an empty sequence.

    Returns:
        The single input unchanged when only one DataFrame is passed,
        otherwise the accumulated ``unionAll`` of all inputs.

    NOTE(review): PySpark documents ``unionAll`` as matching columns by
    position, not by name -- callers are assumed to pass DataFrames that
    share one schema; confirm for new data sources.
    """
    return reduce(lambda stacked, nxt: DataFrame.unionAll(stacked, nxt), dfs)

In [3]:
# Load the ten JSON files data1.json ... data10.json from HDFS, producing a
# list with one DataFrame per file. `multiLine` is forwarded to the JSON
# reader as an option via load()'s keyword arguments.
df = [
    spark.read.format("json")
         .load("hdfs://namenode:9000/user/root/input/data{}.json".format(i+1), multiLine = "true")
    for i in range(10)
]

In [4]:
# Merge every per-file DataFrame into one. This rebinds `df` from the list
# of DataFrames to the combined DataFrame, so the loading cell must be run
# again before this cell can be re-run.
df = unionAll(*df)

In [5]:
# Print the schema of the combined DataFrame (no explicit schema was given
# to the reader, so this is whatever Spark inferred from the JSON files).
df.printSchema()

root
 |-- androidVersion: string (nullable = true)
 |-- category: string (nullable = true)
 |-- comments: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- contentRating: string (nullable = true)
 |-- currentVersion: string (nullable = true)
 |-- installs: long (nullable = true)
 |-- lastUpdate: long (nullable = true)
 |-- price: double (nullable = true)
 |-- ratings: long (nullable = true)
 |-- reviews: long (nullable = true)
 |-- score: double (nullable = true)
 |-- size: string (nullable = true)
 |-- title: string (nullable = true)



In [6]:
# Number of rows in the combined data, before any cleaning is applied.
df.count()

188269

In [7]:
# Basic cleaning, applied in this order:
df = (
    df.na.drop()           # 1. discard rows through the NA handler's default drop()
      .dropDuplicates()    # 2. collapse duplicate rows (no column subset given)
)

In [8]:
# Keep only the rows whose `comments` array holds 30 or more elements.
# `size` is the Spark column function brought in by the star import from
# pyspark.sql.functions, not a Python builtin.
df = df.filter(
    size(df.comments) >= 30
)

In [9]:
# Number of rows left after the null/duplicate cleanup and the
# comments-length filter.
df.count()

70745

In [10]:
# Peek at the data: head() with no argument returns the first row as a Row.
df.head()

Row(androidVersion='4.1', category='Strategy', comments=['Fun game but its not clear when something will run out of fuel and how long it will take to repair. No gameplay issues.', "The game itself is really good I was very hooked until I bought an in app purchase, I didn't even receive half of what I payed for, I followed the advice on how to fix the issue and ended up losing all my game data and purchases as well. So I contacted the game developers and ask them instead of a refund could they add to my account the things that I had payed for. I am pleased to say the developer resolved the issue and I'm back enjoying a great game whoop.", 'Wow.. I really wanted to play a game like this.. if someone wanna play a game like this .. he must train his brain for combat and counter strike and air advancing ... artillery support and defence mechanisms when getting hard to offend .. I really appreciate a game.. and the hardness to play.. um really playing A mission For nearly 1 hrs .. cuz withou

In [17]:
# Distribution of rows per `androidVersion` value, most frequent first.
# show() prints only the first 20 rows by default.
(
    df.groupby('androidVersion')
      .count()
      .sort('count', ascending=False)
      .show()
)

+--------------+-----+
|androidVersion|count|
+--------------+-----+
|           4.1|19616|
|           5.0|12915|
|           4.4|10074|
|         4.0.3| 4507|
|           4.0| 4447|
|        Varies| 4281|
|           4.2| 3594|
|           6.0| 2331|
|           2.3| 1842|
|           4.3| 1721|
|           5.1|  963|
|           7.0|  945|
|           2.2|  772|
|         2.3.3|  609|
|           3.0|  508|
|           2.1|  485|
|           1.6|  279|
|           8.0|  200|
|          4.4W|  165|
|           2.0|  155|
+--------------+-----+
only showing top 20 rows

