In [1]:
!pip install pyspark --quiet

[K     |████████████████████████████████| 281.4 MB 29 kB/s 
[K     |████████████████████████████████| 198 kB 62.6 MB/s 
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone


In [2]:
from pyspark import SparkConf, SparkContext
# Run Spark in local mode under the application name "Pyspark-1".
conf = SparkConf().setMaster("local").setAppName("Pyspark-1")
# getOrCreate reuses an already-running context, so re-executing this cell
# does not fail with "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf=conf)

In [None]:
# Convert the tab-separated ratings file (user ID, movie ID, rating, timestamp)
# into a comma-separated copy.
# Both files are opened in a `with` block so they are always closed; closing
# the output flushes it to disk before any later cell reads it.
with open(r"/content/drive/MyDrive/RTA/Pyspark/data/u.data") as file_obj, \
     open(r"/content/drive/MyDrive/RTA/Pyspark/data/u_data.txt", "w") as file_obj2:
  # The named header stays disabled; a blank first line is written instead.
  # The RDD cell further down reads that first line as its "header" and
  # filters it out.
  # file_obj2.write("user_ID,movie_ID,rating,timestamp")
  file_obj2.write("\n")
  # Stream the input line by line instead of loading it all with readlines().
  for line in file_obj:
    file_obj2.write(line.replace('\t',','))
  

# Popular-Movie - with RDD

In [3]:
# Build a tiny four-element RDD from a Python list. `rdd` is not referenced
# again in the visible cells — presumably a quick check that the SparkContext
# works; confirm before removing.
rdd = sc.parallelize([1,2,3,4])

In [4]:
# Each record: user ID, movie ID, rating, timestamp (comma-separated).
lines = sc.textFile(r"/content/drive/MyDrive/RTA/Pyspark/data/u_data.txt")
header = lines.first()  # the first line of the file is treated as the header
lines = lines.filter(lambda x: x != header)  # drop every line equal to it

# Emit (movie ID, 1) for every rating, then sum the ones per movie ID.
movies = lines.map(lambda x: (int(x.split(",")[1]), 1))
movieCounts = movies.reduceByKey(lambda x, y: x + y)

# Swap to (count, movie ID) so sortByKey can order by number of ratings.
flipped = movieCounts.map(lambda x: (x[1], x[0]))
sortedMovies = flipped.sortByKey(ascending=False)

# Only the ten most-rated movies are printed, so fetch just those rows
# instead of collecting the entire sorted RDD onto the driver.
results = sortedMovies.take(10)
for result in results:
    print(result)

(583, 50)
(509, 258)
(508, 100)
(507, 181)
(485, 294)
(481, 286)
(478, 288)
(452, 1)
(431, 300)
(429, 121)


# Popular-Movie with DataFrame

In [14]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import count
# Get the active SparkSession, or create one with the app name "Pyspark-1".
spark = (
    SparkSession.builder
    .appName("Pyspark-1")
    .getOrCreate()
)

In [15]:
# Load the comma-separated ratings file as a DataFrame. No header or schema
# is supplied, so columns get Spark's default names (the movie ID is `_c1`).
ratings_csv_path = "/content/drive/MyDrive/RTA/Pyspark/data/u_data.txt"
movies = spark.read.load(ratings_csv_path, format="csv")

In [16]:
# Number of ratings per movie ID (`_c1`), most-rated first.
# Grouping directly on `_c1` yields the same two columns (`_c1`, `count`)
# as selecting `_c1` first.
movie_counts = (
    movies
    .groupBy("_c1")
    .count()
    .orderBy("count", ascending=False)
)

In [17]:
# Print the ten movie IDs with the most ratings.
movie_counts.show(n=10)

+---+-----+
|_c1|count|
+---+-----+
| 50|  583|
|258|  509|
|100|  508|
|181|  507|
|294|  485|
|286|  481|
|288|  478|
|  1|  452|
|300|  431|
|121|  429|
+---+-----+
only showing top 10 rows



# Popular-Movie using Spark SQL

In [28]:
# Re-read the ratings CSV (default column names `_c0`, `_c1`, ...) and expose
# it to Spark SQL under the temporary view name "movies_table".
movies = spark.read.load(
    "/content/drive/MyDrive/RTA/Pyspark/data/u_data.txt",
    format="csv",
)
movies.createOrReplaceTempView("movies_table")

In [42]:
# Same ranking as the DataFrame version, expressed as a SQL query against the
# "movies_table" temporary view: ratings per movie ID, highest count first.
popular_movies = spark.sql("""
select _c1,count(*) as count
from movies_table 
group by _c1
order by count DESC
""")
popular_movies.show(10)

+---+-----+
|_c1|count|
+---+-----+
| 50|  583|
|258|  509|
|100|  508|
|181|  507|
|294|  485|
|286|  481|
|288|  478|
|  1|  452|
|300|  431|
|121|  429|
+---+-----+
only showing top 10 rows



# Average with RDD

In [20]:
# Per-key average computed with plain RDD operations.
dataRDD = sc.parallelize([
    ("A", 20), ("B", 10), ("C", 30),
    ("A", 10), ("B", 20), ("C", 10),
    ("A", 30),
])

# key -> (value, 1): carry a count alongside each value.
paired = dataRDD.map(lambda kv: (kv[0], (kv[1], 1)))
# key -> (sum of values, number of values)
totals = paired.reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1]))
# key -> sum / count
trnsfrmRDD = totals.map(lambda kv: (kv[0], kv[1][0] / kv[1][1]))

In [23]:
# Bring the per-key averages back to the driver as a list of (key, average)
# tuples; as the cell's last expression, the list is displayed as its output.
trnsfrmRDD.collect()

[('A', 20.0), ('B', 15.0), ('C', 20.0)]

# Average with DataFrame

In [25]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg

# Get (or create) the SparkSession used for the DataFrame average example.
# NOTE(review): if the session from an earlier cell is still active,
# getOrCreate presumably returns that one and the "Average" app name may not
# take effect — confirm.
spark = SparkSession.builder.appName("Average").getOrCreate()

In [26]:
# The same (key, value) pairs as the RDD example, loaded with named columns.
rows = [("A",20),("B",10),("C",30),("A",10),("B",20),("C",10),("A",30)]
data_df = spark.createDataFrame(rows, schema=["key","value"])

# Mean of `value` for each distinct `key`.
avg_df = data_df.groupBy("key").agg(avg("value"))
avg_df.show()

+---+----------+
|key|avg(value)|
+---+----------+
|  B|      15.0|
|  C|      20.0|
|  A|      20.0|
+---+----------+

