# Preparation

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from pyspark.sql import SparkSession
from pyspark.sql import Row
# from pyspark.sql.types import StringType, StructType, StructField
from pyspark.sql.types import *
# import pyspark.sql.functions as F
from pyspark.sql.functions import *

import warnings
warnings.filterwarnings('ignore')

# Q1

## (1)

In [2]:
spark = SparkSession.builder.config('spark.ui.port', 64050).appName("ass2_Q1").getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/04/09 12:00:06 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
df1_1 = spark.read.text("data/Q1_data/protein.fasta")
df1_1 = df1_1.filter(~col("value").contains(">"))

df1_1.show(5, truncate=False)

+------------------------------------------------------------+
|value                                                       |
+------------------------------------------------------------+
|MEEITQIKKRLSQTVRLEGKEDLLSKKDSITNLKTEEHVSVKKMVISEPKPEKKEDIQLK|
|KKEVVAVAKKEEVLKKEVVVPSKKDEEILPLKKEVPRPPKKEEDVMPQKKEVPRPPKKEE|
|DIVPQMRDVSLPPKEEEKIVPKKKEVPRPPKKVEEILPPKKEVHRPPKKEEDIVPQIREV|
|SLPPKKDEEIVCEKKEVAPAKEEPSKKPKVPSLPATQREDVIEEIIHKKPTAALSKFEDV|
|KEHEEKETFVVLKKEIIDAPTKKEMVTAKHVIVPQKEEIIPSPTQEEVVSFKRKQTVRTS|
+------------------------------------------------------------+
only showing top 5 rows



In [4]:
df1_1_withchars = df1_1.withColumn("chars", explode(split(col("value"), "")))
df1_1_withchars.show(5,truncate=False)

+------------------------------------------------------------+-----+
|value                                                       |chars|
+------------------------------------------------------------+-----+
|MEEITQIKKRLSQTVRLEGKEDLLSKKDSITNLKTEEHVSVKKMVISEPKPEKKEDIQLK|M    |
|MEEITQIKKRLSQTVRLEGKEDLLSKKDSITNLKTEEHVSVKKMVISEPKPEKKEDIQLK|E    |
|MEEITQIKKRLSQTVRLEGKEDLLSKKDSITNLKTEEHVSVKKMVISEPKPEKKEDIQLK|E    |
|MEEITQIKKRLSQTVRLEGKEDLLSKKDSITNLKTEEHVSVKKMVISEPKPEKKEDIQLK|I    |
|MEEITQIKKRLSQTVRLEGKEDLLSKKDSITNLKTEEHVSVKKMVISEPKPEKKEDIQLK|T    |
+------------------------------------------------------------+-----+
only showing top 5 rows



In [5]:
df1_1_withchars.groupBy("chars").count().withColumn("freq",col("count")/df1_1_withchars.count()).show(5)

+-----+-------+--------------------+
|chars|  count|                freq|
+-----+-------+--------------------+
|    K|1684031|0.047061932111059086|
|    F| 985877|0.027551319687021555|
|    Q|1422769|  0.0397607039821235|
|    E|2674664| 0.07474616297912196|
|    T|2795042| 0.07811024669472166|
+-----+-------+--------------------+
only showing top 5 rows



## (2)

In [6]:
rdd1_2 = spark.sparkContext.textFile("data/Q1_data/protein.fasta")
rdd1_2 = rdd1_2.filter(lambda x: ">" not in x)
rdd1_2.take(5)

                                                                                

['MEEITQIKKRLSQTVRLEGKEDLLSKKDSITNLKTEEHVSVKKMVISEPKPEKKEDIQLK',
 'KKEVVAVAKKEEVLKKEVVVPSKKDEEILPLKKEVPRPPKKEEDVMPQKKEVPRPPKKEE',
 'DIVPQMRDVSLPPKEEEKIVPKKKEVPRPPKKVEEILPPKKEVHRPPKKEEDIVPQIREV',
 'SLPPKKDEEIVCEKKEVAPAKEEPSKKPKVPSLPATQREDVIEEIIHKKPTAALSKFEDV',
 'KEHEEKETFVVLKKEIIDAPTKKEMVTAKHVIVPQKEEIIPSPTQEEVVSFKRKQTVRTS']

In [7]:
rdd1_2_withchars = rdd1_2.flatMap(lambda x: list(x))
rdd1_2_withchars.take(5)

['M', 'E', 'E', 'I', 'T']

In [8]:
counts = rdd1_2_withchars.countByValue()
total_count = rdd1_2_withchars.count()

frequencies = {k: v / total_count for k, v in counts.items()}
frequencies

                                                                                

{'M': 0.013064028899518616,
 'E': 0.07474616297912196,
 'I': 0.04826036842051577,
 'T': 0.07811024669472166,
 'Q': 0.0397607039821235,
 'K': 0.047061932111059086,
 'R': 0.05001252679497514,
 'L': 0.07969207419272037,
 'S': 0.0767899658206434,
 'V': 0.07715222983238408,
 'G': 0.07415264580860985,
 'D': 0.06195360153390011,
 'N': 0.03680178138989157,
 'H': 0.0175608199300819,
 'P': 0.05862931292380984,
 'A': 0.0900722250424395,
 'C': 0.015774260028317683,
 'F': 0.027551319687021555,
 'Y': 0.023002241688475027,
 'W': 0.009813685408233087,
 'X': 3.7643263427808984e-05,
 'B': 1.6767600635995093e-07,
 'Z': 5.589200211998364e-08}

In [9]:
rdd1_2_withchars.map(lambda x: (x, 1)).reduceByKey(lambda x, y: x + y).take(5)

                                                                                

[('M', 467474), ('E', 2674664), ('I', 1726915), ('Q', 1422769), ('K', 1684031)]

## (3)

In [10]:
rdd1_2.take(5)


['MEEITQIKKRLSQTVRLEGKEDLLSKKDSITNLKTEEHVSVKKMVISEPKPEKKEDIQLK',
 'KKEVVAVAKKEEVLKKEVVVPSKKDEEILPLKKEVPRPPKKEEDVMPQKKEVPRPPKKEE',
 'DIVPQMRDVSLPPKEEEKIVPKKKEVPRPPKKVEEILPPKKEVHRPPKKEEDIVPQIREV',
 'SLPPKKDEEIVCEKKEVAPAKEEPSKKPKVPSLPATQREDVIEEIIHKKPTAALSKFEDV',
 'KEHEEKETFVVLKKEIIDAPTKKEMVTAKHVIVPQKEEIIPSPTQEEVVSFKRKQTVRTS']

In [11]:
import re
rdd1_2.map(lambda x: list(re.findall("STAT",x))).filter(lambda x: len(x)!=0).flatMap(lambda x: x).count()

2052

In [12]:
spark.stop()

# Q2

In [13]:
spark = SparkSession.builder.config('spark.ui.port', 64050).appName("ass2_Q2").getOrCreate()


## (1)

In [14]:
course = spark.read.csv("data/Q2_data/courses.csv",header=True)
course = course.withColumnRenamed("title","course_title")
course = course.withColumn("created",to_timestamp("created", "yyyy-MM-dd'T'HH:mm:ss'Z'"))
course.show(5)

+-------+--------------------+--------------------+---------+-----------+----------------------+-------------------+----------------+----------------+--------------+--------------------+
|     id|        course_title|                 url|   rating|num_reviews|num_published_lectures|            created|last_update_date|        duration|instructors_id|               image|
+-------+--------------------+--------------------+---------+-----------+----------------------+-------------------+----------------+----------------+--------------+--------------------+
| 567828|The Complete Pyth...|/course/complete-...|4.5927815|     452973|                   155|2015-07-29 00:12:23|      2021-03-14|  22 total hours|       9685726|https://img-c.ude...|
|1565838|The Complete 2023...|/course/the-compl...| 4.667258|     263152|                   490|2018-02-22 12:02:33|      2023-01-20|65.5 total hours|      31334738|https://img-c.ude...|
| 625204|The Web Developer...|/course/the-web-d...|4.6961474|    

In [15]:
instructors = spark.read.csv("data/Q2_data/instructors.csv",header=True)
instructors = instructors.withColumnRenamed("title","instructor_title")
instructors.show(5)

+------+--------+--------------------+----------+--------------------+--------------------+--------------------+--------------------+--------+--------------------+
|_class|      id|    instructor_title|      name|        display_name|           job_title|         image_50x50|       image_100x100|initials|                 url|
+------+--------+--------------------+----------+--------------------+--------------------+--------------------+--------------------+--------+--------------------+
|  user| 9685726|       Jose Portilla|      Jose|       Jose Portilla|Head of Data Scie...|https://img-c.ude...|https://img-c.ude...|      JP| /user/joseportilla/|
|  user|31334738|       Dr. Angela Yu|Dr. Angela|       Dr. Angela Yu|Developer and Lea...|https://img-c.ude...|https://img-c.ude...|      DY|/user/4b4368a3-b5...|
|  user| 4466306|         Colt Steele|      Colt|         Colt Steele|Developer and Boo...|https://img-b.ude...|https://img-b.ude...|      CS|   /user/coltsteele/|
|  user|13952972

In [16]:
print(course.count(),"\n",course.columns)
print(instructors.count(),"\n",instructors.columns)


83104 
 ['id', 'course_title', 'url', 'rating', 'num_reviews', 'num_published_lectures', 'created', 'last_update_date', 'duration', 'instructors_id', 'image']
32233 
 ['_class', 'id', 'instructor_title', 'name', 'display_name', 'job_title', 'image_50x50', 'image_100x100', 'initials', 'url']


In [17]:
df2_1 = course.join(instructors, course["instructors_id"] == instructors["id"], "inner")
print(df2_1.count())
df2_1.show(3)

83094
+-------+--------------------+--------------------+---------+-----------+----------------------+-------------------+----------------+----------------+--------------+--------------------+------+--------+----------------+----------+-------------+--------------------+--------------------+--------------------+--------+--------------------+
|     id|        course_title|                 url|   rating|num_reviews|num_published_lectures|            created|last_update_date|        duration|instructors_id|               image|_class|      id|instructor_title|      name| display_name|           job_title|         image_50x50|       image_100x100|initials|                 url|
+-------+--------------------+--------------------+---------+-----------+----------------------+-------------------+----------------+----------------+--------------+--------------------+------+--------+----------------+----------+-------------+--------------------+--------------------+--------------------+--------+--

In [18]:
df2_1.select(col("instructors_id")).distinct().count()

32230

## (2)

In [22]:
df2_1.createOrReplaceTempView("df2_1")
spark.sql("select display_name, job_title from df2_1 where course_title like '%spark%' and created > '2018-01-01 00:00:00' order by rating desc LIMIT 1").show(truncate=False)
# spark.sql("select * from df2_1 where course_title like '%spark%' and created > '2018-01-01 00:00:00' order by rating desc LIMIT 1").show(truncate=False)


+------------+-------------------------------------+
|display_name|job_title                            |
+------------+-------------------------------------+
|Deby Coles  |Sewer, Artist, Crafter and Instructor|
+------------+-------------------------------------+



## (3)

In [20]:
course.createOrReplaceTempView("course")
spark.sql("select course_title  as course,round(rating,1) as rating , created from course where course_title like '%interview%' order by rating desc, created desc").show(5,truncate=False)


+------------------------------------------------------------+------+-------------------+
|course                                                      |rating|created            |
+------------------------------------------------------------+------+-------------------+
|Réaliser des interviews au rendu professionnel (PARTIE 2)   |4.9   |2022-08-12 14:54:06|
|Win your Product Management job interview with Big Tech's PM|4.8   |2022-08-26 10:43:53|
|Get your Java dream job! Beginners interview preparation    |4.8   |2017-03-25 22:54:38|
|Angular interview questions with answers                    |4.6   |2020-05-02 06:13:45|
|Software Testing Interview Masterclass: Ace the QA interview|4.6   |2019-12-14 19:54:00|
+------------------------------------------------------------+------+-------------------+
only showing top 5 rows

