In [1]:
from pyspark.sql import SparkSession
from pyspark import SparkConf, SparkContext
from IPython.display import clear_output
import pyspark.sql.types as stypes
import os
from pyspark.sql import functions as F
from math import ceil

In [2]:
_conf = SparkConf()
_conf.set('spark.driver.host', '127.0.0.1')
_conf.set('log.level', 'ERROR')
_cpu_count = os.cpu_count()
sc = SparkContext(master=f"local[{_cpu_count}]",
                  appName="Exercise 01",
                  conf=_conf).getOrCreate()
sc.setLogLevel('ERROR')
clear_output()
spark = SparkSession(sc)

In [3]:
df = spark.read.load("/home/sima/ds/user_artist_data_small.txt",
                 format="csv", sep=" ", inferSchema="true", header="false").toDF("user","artist", "time")

                                                                                

In [4]:
df.show()

+-------+-------+-----+
|   user| artist| time|
+-------+-------+-----+
|1059637|1000010|  238|
|1059637|1000049|    1|
|1059637|1000056|    1|
|1059637|1000062|   11|
|1059637|1000094|    1|
|1059637|1000112|  423|
|1059637|1000113|    5|
|1059637|1000114|    2|
|1059637|1000123|    2|
|1059637|1000130|19129|
|1059637|1000139|    4|
|1059637|1000241|  188|
|1059637|1000263|  180|
|1059637|1000289|    2|
|1059637|1000305|    1|
|1059637|1000320|   21|
|1059637|1000340|    1|
|1059637|1000427|   20|
|1059637|1000428|   12|
|1059637|1000433|   10|
+-------+-------+-----+
only showing top 20 rows



In [5]:
df1 = spark.read.load("/home/sima/ds/artist_alias_small.txt",
                 format="csv", sep="\t", inferSchema="true", header="false").toDF("artis_correctname","artist1")

In [6]:
df1.show()

+-----------------+-------+
|artis_correctname|artist1|
+-----------------+-------+
|          1027859|1252408|
|          1017615|    668|
|          6745885|1268522|
|          1018110|1018110|
|          1014609|1014609|
|          6713071|   2976|
|          1014175|1014175|
|          1008798|1008798|
|          1013851|1013851|
|          6696814|1030672|
|          1036747|1239516|
|          1278781|1021980|
|          2035175|1007565|
|          1327067|1308328|
|          2006482|1140837|
|          1314530|1237371|
|          1160800|1345290|
|          1255401|1055061|
|          1307351|1055061|
|          1234249|1005225|
+-----------------+-------+
only showing top 20 rows



In [8]:
left_join = df.join(df1, df.artist == df1.artist1,how='left')

In [9]:
left_join.show()

+-------+-------+-----+-----------------+-------+
|   user| artist| time|artis_correctname|artist1|
+-------+-------+-----+-----------------+-------+
|1059637|1000010|  238|             null|   null|
|1059637|1000049|    1|             null|   null|
|1059637|1000056|    1|             null|   null|
|1059637|1000062|   11|             null|   null|
|1059637|1000094|    1|             null|   null|
|1059637|1000112|  423|             null|   null|
|1059637|1000113|    5|          9928973|1000113|
|1059637|1000114|    2|             null|   null|
|1059637|1000123|    2|          1146111|1000123|
|1059637|1000130|19129|             null|   null|
|1059637|1000139|    4|             null|   null|
|1059637|1000241|  188|             null|   null|
|1059637|1000263|  180|             null|   null|
|1059637|1000289|    2|             null|   null|
|1059637|1000305|    1|             null|   null|
|1059637|1000320|   21|             null|   null|
|1059637|1000340|    1|             null|   null|


In [10]:
from pyspark.sql.functions import expr
df3 = left_join.withColumn("new", expr("CASE WHEN artis_correctname is null THEN artist " + 
                     "ELSE artis_correctname END"))
df3.show()

+-------+-------+-----+-----------------+-------+-------+
|   user| artist| time|artis_correctname|artist1|    new|
+-------+-------+-----+-----------------+-------+-------+
|1059637|1000010|  238|             null|   null|1000010|
|1059637|1000049|    1|             null|   null|1000049|
|1059637|1000056|    1|             null|   null|1000056|
|1059637|1000062|   11|             null|   null|1000062|
|1059637|1000094|    1|             null|   null|1000094|
|1059637|1000112|  423|             null|   null|1000112|
|1059637|1000113|    5|          9928973|1000113|9928973|
|1059637|1000114|    2|             null|   null|1000114|
|1059637|1000123|    2|          1146111|1000123|1146111|
|1059637|1000130|19129|             null|   null|1000130|
|1059637|1000139|    4|             null|   null|1000139|
|1059637|1000241|  188|             null|   null|1000241|
|1059637|1000263|  180|             null|   null|1000263|
|1059637|1000289|    2|             null|   null|1000289|
|1059637|10003

In [13]:
df4=df3.select("user","new","time")
df4.show()

+-------+-------+-----+
|   user|    new| time|
+-------+-------+-----+
|1059637|1000010|  238|
|1059637|1000049|    1|
|1059637|1000056|    1|
|1059637|1000062|   11|
|1059637|1000094|    1|
|1059637|1000112|  423|
|1059637|9928973|    5|
|1059637|1000114|    2|
|1059637|1146111|    2|
|1059637|1000130|19129|
|1059637|1000139|    4|
|1059637|1000241|  188|
|1059637|1000263|  180|
|1059637|1000289|    2|
|1059637|1000305|    1|
|1059637|1000320|   21|
|1059637|1000340|    1|
|1059637|1000427|   20|
|1059637|1000428|   12|
|1059637|1000433|   10|
+-------+-------+-----+
only showing top 20 rows

