In [9]:
from pyspark.sql import SparkSession,Row,DataFrame
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window


# Create SparkSession
spark=SparkSession.builder.master("local[1]")\
        .appName("Music")\
        .config("spark.driver.extraClassPath","/Users/eduardoalberto/opt/spark/jars/mysql-connector-j-8.2.0.jar" ) \
        .getOrCreate()
sc = spark.sparkContext
spark.sparkContext.setLogLevel("OFF") 
print('PySpark Version :'+spark.version)
print('PySpark Version :'+spark.sparkContext.version)

spark

PySpark Version :3.5.0
PySpark Version :3.5.0


In [43]:
   
def music(df_music: DataFrame ) -> DataFrame:
    df_music = df_music.selectExpr("cast(acousticness as decimal(3,2)) as acousticness",
                                    "regexp_replace(artists,'[^a-zA-Z0-9]',' ') as artists",
                                    "cast(danceability as decimal(3,2)) as danceability ",
                                    "duration_ms",
                                    "cast(energy as decimal(3,2)) as energy",
                                    "explicit",
                                    "cast(instrumentalness as decimal(3,2) ) as instrumentalness",
                                    "key",
                                    "cast (liveness as decimal(3,2) ) as liveness",
                                    "mode",
                                    "cast(loudness as decimal(3,2)) as loudness",
                                    "name",
                                    "popularity",
                                    """ case
                                            when release_date = '1928' then '1928-01-01' 
                                            else release_date
                                            end as release_date
                                    """,
                                    "cast(speechiness as decimal(5,2)) as speechiness",
                                    "cast(tempo as decimal(3,2)) as tempo",
                                    "cast(valence as decimal(3,2)) as valence",
                                    "year")
    
    df_music = df_music.withColumn("rowid", monotonically_increasing_id())
    
    return df_music


def genre(df_genre: DataFrame ) -> DataFrame:
    df_genre = df_genre.selectExpr("cast(danceability as decimal(3,2)) as danceability ",
                    "cast(energy as decimal(3,2)) as energy ",
                    "key ",
                    "cast(loudness as decimal(4,2)) as loudness ",
                    "mode",
                    "cast(speechiness as decimal(3,2)) as speechiness ",
                    "cast(acousticness as decimal(3,2)) as acousticness ",
                    "cast(instrumentalness as decimal(3,2)) as instrumentalness ",
                    "cast(liveness as decimal(3,2)) as liveness ",
                    "cast(valence as decimal(3,2)) as valence ",
                    "cast(tempo as decimal(3,2)) as tempo ",
                    "type",
                    "uri",
                    "track_href",
                    "analysis_url",
                    "duration_ms",
                    "time_signature",
                    "genre",
                    "song_name")
    df_genre = df_genre.withColumn("rowid", monotonically_increasing_id())

    return df_genre

def  artists(df_artists: DataFrame ) -> DataFrame:
    
    df_artists = df_artists.selectExpr("followers",
                    "regexp_replace(genres,'[^a-zA-Z0-9]',' ') as genres",
                    "name",
                    "popularity"
                    

    )
    df_artists = df_artists.withColumn("id", row_number().over(Window.partitionBy("name").orderBy("name")))
    df_artists = df_artists.selectExpr("id",
                    "trim(genres)",
                    "trim(regexp_replace(name, '[^a-zA-Z0-9]',' ') ) as name",
                    "trim(regexp_replace(popularity, '[^a-zA-Z0-9]',' ') ) as popularity",
                    "followers"
    )
    df_artists = df_artists.withColumn("rowid", monotonically_increasing_id())

    return df_artists

def playlists(df_playlists:DataFrame) -> DataFrame:
    df_playlists = df_playlists.select("Playlist","Genre")
    df_playlists = df_playlists.withColumn("rowid", monotonically_increasing_id())

    return df_playlists


def tracks(df_tracks: DataFrame ) -> DataFrame:
    df_tracks = df_tracks.selectExpr("id"
                    ,"name"
                    ,"popularity"
                    ,"duration_ms"
                    ,"explicit"
                    ,"regexp_replace('artists','[^a-zA-Z0-9]',' ') as artists "
                    ,"regexp_replace('id_artists','[^a-zA-Z0-9]',' ') as id_artists"

    )
    return df_tracks

    
    

    

In [42]:
txt01 = spark.read.option("header", "true").option('inferSchema', 'true').csv('/Users/eduardoalberto/LoadFile/data.csv')    
txt02 = spark.read.option("header", "true").option('inferSchema', 'true').csv('/Users/eduardoalberto/LoadFile/archive/genres_v2.csv')
txt03 = spark.read.option("header", "true").option('inferSchema', 'true').csv('/Users/eduardoalberto/LoadFile/archive/artists.csv')
txt04 = spark.read.option("header", "true").option('inferSchema', 'true').csv('/Users/eduardoalberto/LoadFile/archive/playlists.csv')
txt05 = spark.read.option("header", "true").option('inferSchema', 'true').csv('/Users/eduardoalberto/LoadFile/archive/tracks.csv')


df = music(txt01)
df.show()

df = genre(txt02)
df.show()

df = artists(txt03)
df.show()

df = playlists(txt04)
df.show()

df = tracks(txt05)
df.show()




                                                                                

+------------+--------------------+------------+-----------+------+--------+----------------+---+--------+----+--------+--------------------+----------+------------+-----------+-----+-------+----+-----+
|acousticness|             artists|danceability|duration_ms|energy|explicit|instrumentalness|key|liveness|mode|loudness|                name|popularity|release_date|speechiness|tempo|valence|year|rowid|
+------------+--------------------+------------+-----------+------+--------+----------------+---+--------+----+--------+--------------------+----------+------------+-----------+-----+-------+----+-----+
|        1.00|   Carl Woitschach  |        0.71|     158648|  0.20|       0|            0.56| 10|    0.15|   1|    NULL|Singende Bataillo...|         0|  1928-01-01|       0.05| NULL|   0.78|1928|    0|
|        0.99|  Robert Schumann...|        0.38|     282133|  0.01|       0|            0.90|  8|    0.08|   1|    NULL|Fantasiestücke, O...|         0|  1928-01-01|       0.05| NULL|   0.

[Stage 211:>                                                        (0 + 1) / 1]

+---+--------------------+----------------+--------------------+---------+-----+
| id|        trim(genres)|            name|          popularity|followers|rowid|
+---+--------------------+----------------+--------------------+---------+-----+
|  1|australian childr...| children s folk|    children s music|   8187.0|    0|
|  2|canadian children...| children s folk|    children s music|   4972.0|    1|
|  3|australian childr...| children s folk|    children s music|   8137.0|    2|
|  4|canadian children...| children s folk|    children s music|  10831.0|    3|
|  5|canadian children...| children s folk|    children s music|   3853.0|    4|
|  6|canadian children...| children s folk|    children s music|  88541.0|    5|
|  7|canadian children...| children s folk|    children s music|   3188.0|    6|
|  1|canadian children...| children s folk|          Al Simmons|    271.0|    7|
|  2|canadian children...| children s folk|        Jack Grunsky|    276.0|    8|
|  3|canadian children...| c

                                                                                

### Criaçao classe de Carga

In [56]:
from pyspark.sql import SparkSession,Row,DataFrame
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window

class RunJob():

    def __init__(self,music):
        self.__txt01  = music
    
    def music(self):

        df_music =  self.__txt01.selectExpr("cast(acousticness as decimal(3,2)) as acousticness",
                                            "regexp_replace(artists,'[^a-zA-Z0-9]',' ') as artists",
                                            "cast(danceability as decimal(3,2)) as danceability ",
                                            "duration_ms",
                                            "cast(energy as decimal(3,2)) as energy",
                                            "explicit",
                                            "cast(instrumentalness as decimal(3,2) ) as instrumentalness",
                                            "key",
                                            "cast (liveness as decimal(3,2) ) as liveness",
                                            "mode",
                                            "cast(loudness as decimal(3,2)) as loudness",
                                            "name",
                                            "popularity",
                                            """ case
                                                    when release_date = '1928' then '1928-01-01' 
                                                    else release_date
                                                end as release_date
                                            """,
                                            "cast(speechiness as decimal(5,2)) as speechiness",
                                            "cast(tempo as decimal(3,2)) as tempo",
                                            "cast(valence as decimal(3,2)) as valence",
                                            "year"
        )
        return df_music
    
    def genre(df_genre):

        df_genre = df_genre.selectExpr("cast(danceability as decimal(3,2)) as danceability ",
                        "cast(energy as decimal(3,2)) as energy ",
                        "key ",
                        "cast(loudness as decimal(4,2)) as loudness ",
                        "mode",
                        "cast(speechiness as decimal(3,2)) as speechiness ",
                        "cast(acousticness as decimal(3,2)) as acousticness ",
                        "cast(instrumentalness as decimal(3,2)) as instrumentalness ",
                        "cast(liveness as decimal(3,2)) as liveness ",
                        "cast(valence as decimal(3,2)) as valence ",
                        "cast(tempo as decimal(3,2)) as tempo ",
                        "type",
                        "uri",
                        "track_href",
                        "analysis_url",
                        "duration_ms",
                        "time_signature",
                        "genre",
                        "song_name"
        )
        df_genre = df_genre.withColumn("rowid", monotonically_increasing_id())

        return df_genre

    def  artists(df_artists ):
        
        df_artists = df_artists.selectExpr("followers",
                        "regexp_replace(genres,'[^a-zA-Z0-9]',' ') as genres",
                        "name",
                        "popularity"
                        

        )
        df_artists = df_artists.withColumn("id", row_number().over(Window.partitionBy("name").orderBy("name")))
        df_artists = df_artists.selectExpr("id",
                        "trim(genres)",
                        "trim(regexp_replace(name, '[^a-zA-Z0-9]',' ') ) as name",
                        "trim(regexp_replace(popularity, '[^a-zA-Z0-9]',' ') ) as popularity",
                        "followers"
        )
        df_artists = df_artists.withColumn("rowid", monotonically_increasing_id())

        return df_artists

    def playlists(df_playlists):
        df_playlists = df_playlists.select("Playlist","Genre")
        df_playlists = df_playlists.withColumn("rowid", monotonically_increasing_id())

        return df_playlists


    def tracks(df_tracks):
        df_tracks = df_tracks.selectExpr("id"
                        ,"name"
                        ,"popularity"
                        ,"duration_ms"
                        ,"explicit"
                        ,"regexp_replace('artists','[^a-zA-Z0-9]',' ') as artists "
                        ,"regexp_replace('id_artists','[^a-zA-Z0-9]',' ') as id_artists"

        )
        return df_tracks

    

In [55]:
txt01 = spark.read.option("header", "true").option('inferSchema', 'true').csv('/Users/eduardoalberto/LoadFile/data.csv')    

x = RunJob(txt01)
df = x.music()
df.show()  

+------------+--------------------+------------+-----------+------+--------+----------------+---+--------+----+--------+--------------------+----------+------------+-----------+-----+-------+----+
|acousticness|             artists|danceability|duration_ms|energy|explicit|instrumentalness|key|liveness|mode|loudness|                name|popularity|release_date|speechiness|tempo|valence|year|
+------------+--------------------+------------+-----------+------+--------+----------------+---+--------+----+--------+--------------------+----------+------------+-----------+-----+-------+----+
|        1.00|   Carl Woitschach  |        0.71|     158648|  0.20|       0|            0.56| 10|    0.15|   1|    NULL|Singende Bataillo...|         0|  1928-01-01|       0.05| NULL|   0.78|1928|
|        0.99|  Robert Schumann...|        0.38|     282133|  0.01|       0|            0.90|  8|    0.08|   1|    NULL|Fantasiestücke, O...|         0|  1928-01-01|       0.05| NULL|   0.08|1928|
|        0.60| 