In [10]:
from pyspark.sql import SparkSession,Row,DataFrame
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window


# Create SparkSession
spark=SparkSession.builder.master("local[1]")\
        .appName("Music")\
        .config("spark.driver.extraClassPath","/Users/eduardoalberto/opt/spark/jars/mysql-connector-j-8.2.0.jar" ) \
        .getOrCreate()
sc = spark.sparkContext
spark.sparkContext.setLogLevel("OFF") 
print('PySpark Version :'+spark.version)
print('PySpark Version :'+spark.sparkContext.version)

spark

def music(df_music: DataFrame ) -> DataFrame:
    df_music = df_music.selectExpr("cast(acousticness as decimal(3,2)) as acousticness",
                                    "regexp_replace(artists,'[^a-zA-Z0-9]',' ') as artists",
                                    "cast(danceability as decimal(3,2)) as danceability ",
                                    "duration_ms",
                                    "cast(energy as decimal(3,2)) as energy",
                                    "explicit",
                                    "cast(instrumentalness as decimal(3,2) ) as instrumentalness",
                                    "key",
                                    "cast (liveness as decimal(3,2) ) as liveness",
                                    "mode",
                                    "cast(loudness as decimal(3,2)) as loudness",
                                    "name",
                                    "popularity",
                                    """ case
                                            when release_date = '1928' then '1928-01-01' 
                                            else release_date
                                            end as release_date
                                    """,
                                    "cast(speechiness as decimal(5,2)) as speechiness",
                                    "cast(tempo as decimal(3,2)) as tempo",
                                    "cast(valence as decimal(3,2)) as valence",
                                    "year")
    
    df_music = df_music.withColumn("rowid", monotonically_increasing_id())
    
    return df_music


def genre(df_genre: DataFrame ) -> DataFrame:
    df_genre = df_genre.selectExpr("cast(danceability as decimal(3,2)) as danceability ",
                    "cast(energy as decimal(3,2)) as energy ",
                    "key ",
                    "cast(loudness as decimal(4,2)) as loudness ",
                    "mode",
                    "cast(speechiness as decimal(3,2)) as speechiness ",
                    "cast(acousticness as decimal(3,2)) as acousticness ",
                    "cast(instrumentalness as decimal(3,2)) as instrumentalness ",
                    "cast(liveness as decimal(3,2)) as liveness ",
                    "cast(valence as decimal(3,2)) as valence ",
                    "cast(tempo as decimal(3,2)) as tempo ",
                    "type",
                    "uri",
                    "track_href",
                    "analysis_url",
                    "duration_ms",
                    "time_signature",
                    "genre",
                    "song_name")
    df_genre = df_genre.withColumn("rowid", monotonically_increasing_id())

    return df_genre

def  artists(df_artists: DataFrame ) -> DataFrame:
    
    df_artists = df_artists.selectExpr("followers",
                    "regexp_replace(genres,'[^a-zA-Z0-9]',' ') as genres",
                    "name",
                    "popularity"
                    

    )
    df_artists = df_artists.withColumn("id", row_number().over(Window.partitionBy("name").orderBy("name")))
    df_artists = df_artists.selectExpr("id",
                    "trim(genres)",
                    "trim(regexp_replace(name, '[^a-zA-Z0-9]',' ') ) as name",
                    "trim(regexp_replace(popularity, '[^a-zA-Z0-9]',' ') ) as popularity",
                    "followers"
    )
    df_artists = df_artists.withColumn("rowid", monotonically_increasing_id())

    return df_artists

def playlists(df_playlists:DataFrame) -> DataFrame:
    df_playlists = df_playlists.select("Playlist","Genre")
    df_playlists = df_playlists.withColumn("rowid", monotonically_increasing_id())

    return df_playlists


def tracks(df_tracks: DataFrame ) -> DataFrame:
    df_tracks = df_tracks.selectExpr("id"
                    ,"name"
                    ,"popularity"
                    ,"duration_ms"
                    ,"explicit"
                    ,"regexp_replace('artists','[^a-zA-Z0-9]',' ') as artists "
                    ,"regexp_replace('id_artists','[^a-zA-Z0-9]',' ') as id_artists"

    )
    return df_tracks

    
    

    

PySpark Version :3.5.0
PySpark Version :3.5.0


In [11]:
txt01 = spark.read.option("header", "true").option('inferSchema', 'true').csv('/Users/eduardoalberto/LoadFile/data.csv')    
txt02 = spark.read.option("header", "true").option('inferSchema', 'true').csv('/Users/eduardoalberto/LoadFile/archive/genres_v2.csv')
txt03 = spark.read.option("header", "true").option('inferSchema', 'true').csv('/Users/eduardoalberto/LoadFile/archive/artists.csv')
txt04 = spark.read.option("header", "true").option('inferSchema', 'true').csv('/Users/eduardoalberto/LoadFile/archive/playlists.csv')
txt05 = spark.read.option("header", "true").option('inferSchema', 'true').csv('/Users/eduardoalberto/LoadFile/archive/tracks.csv')


df = music(txt01)
df.show()

# df = genre(txt02)
# df.show()

# df = artists(txt03)
# df.show()

# df = playlists(txt04)
# df.show()

# df = tracks(txt05)
# df.show()




                                                                                

+------------+--------------------+------------+-----------+------+--------+----------------+---+--------+----+--------+--------------------+----------+------------+-----------+-----+-------+----+-----+
|acousticness|             artists|danceability|duration_ms|energy|explicit|instrumentalness|key|liveness|mode|loudness|                name|popularity|release_date|speechiness|tempo|valence|year|rowid|
+------------+--------------------+------------+-----------+------+--------+----------------+---+--------+----+--------+--------------------+----------+------------+-----------+-----+-------+----+-----+
|        1.00|   Carl Woitschach  |        0.71|     158648|  0.20|       0|            0.56| 10|    0.15|   1|    NULL|Singende Bataillo...|         0|  1928-01-01|       0.05| NULL|   0.78|1928|    0|
|        0.99|  Robert Schumann...|        0.38|     282133|  0.01|       0|            0.90|  8|    0.08|   1|    NULL|Fantasiestücke, O...|         0|  1928-01-01|       0.05| NULL|   0.

### Criaçao classe de Carga

In [29]:
from pyspark.sql import SparkSession,Row,DataFrame
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window


# Create SparkSession
spark=SparkSession.builder.master("local[1]")\
        .appName("Music")\
        .config("spark.driver.extraClassPath","/Users/eduardoalberto/opt/spark/jars/mysql-connector-j-8.2.0.jar" ) \
        .getOrCreate()
sc = spark.sparkContext
spark.sparkContext.setLogLevel("OFF") 
print('PySpark Version :'+spark.version)
print('PySpark Version :'+spark.sparkContext.version)
spark.sparkContext.setLogLevel("OFF")

class RunJob():
    def __init__(self,music,genres,artists,playlists,tracks):
        self.__txt01  = music
        self.__txt02  = genres
        self.__txt03  = artists
        self.__txt04  = playlists
        self.__txt05  = tracks

    def music(self) -> DataFrame:
        df_music = self.__txt01.selectExpr("cast(acousticness as decimal(3,2)) as acousticness",
                                        "regexp_replace(artists,'[^a-zA-Z0-9]',' ') as artists",
                                        "cast(danceability as decimal(3,2)) as danceability ",
                                        "duration_ms",
                                        "cast(energy as decimal(3,2)) as energy",
                                        "explicit",
                                        "cast(instrumentalness as decimal(3,2) ) as instrumentalness",
                                        "key",
                                        "cast (liveness as decimal(3,2) ) as liveness",
                                        "mode",
                                        "cast(loudness as decimal(3,2)) as loudness",
                                        "name",
                                        "popularity",
                                        """ case
                                                when release_date = '1928' then '1928-01-01' 
                                                else release_date
                                                end as release_date
                                        """,
                                        "cast(speechiness as decimal(5,2)) as speechiness",
                                        "cast(tempo as decimal(3,2)) as tempo",
                                        "cast(valence as decimal(3,2)) as valence",
                                        "year")
        
        df_music = df_music.withColumn("rowid", monotonically_increasing_id())
        
        return df_music


    def genre(self) -> DataFrame:
        df_genre = self.__txt02.selectExpr("cast(danceability as decimal(3,2)) as danceability ",
                        "cast(energy as decimal(3,2)) as energy ",
                        "key ",
                        "cast(loudness as decimal(4,2)) as loudness ",
                        "mode",
                        "cast(speechiness as decimal(3,2)) as speechiness ",
                        "cast(acousticness as decimal(3,2)) as acousticness ",
                        "cast(instrumentalness as decimal(3,2)) as instrumentalness ",
                        "cast(liveness as decimal(3,2)) as liveness ",
                        "cast(valence as decimal(3,2)) as valence ",
                        "cast(tempo as decimal(3,2)) as tempo ",
                        "type",
                        "uri",
                        "track_href",
                        "analysis_url",
                        "duration_ms",
                        "time_signature",
                        "genre",
                        "song_name")
        df_genre = df_genre.withColumn("rowid", monotonically_increasing_id())

        return df_genre

    def  artists(self) -> DataFrame:
        
        df_artists = self.__txt03.selectExpr("followers",
                        "regexp_replace(genres,'[^a-zA-Z0-9]',' ') as genres",
                        "name",
                        "popularity"
                        

        )
        df_artists = df_artists.withColumn("id", row_number().over(Window.partitionBy("name").orderBy("name")))
        df_artists = df_artists.selectExpr("id",
                        "trim(genres)",
                        "trim(regexp_replace(name, '[^a-zA-Z0-9]',' ') ) as name",
                        "trim(regexp_replace(popularity, '[^a-zA-Z0-9]',' ') ) as popularity",
                        "followers"
        )
        df_artists = df_artists.withColumn("rowid", monotonically_increasing_id())

        return df_artists

    def playlists(self) -> DataFrame:
        df_playlists = self.__txt04.select("Playlist","Genre")
        df_playlists = df_playlists.withColumn("rowid", monotonically_increasing_id())

        return df_playlists


    def tracks(self) -> DataFrame:
        df_tracks = self.__txt05.selectExpr("id"
                        ,"name"
                        ,"popularity"
                        ,"duration_ms"
                        ,"explicit"
                        ,"regexp_replace('artists','[^a-zA-Z0-9]',' ') as artists "
                        ,"regexp_replace('id_artists','[^a-zA-Z0-9]',' ') as id_artists"

        )
        return df_tracks
    
music = spark.read.option("header", "true").option('inferSchema', 'true').csv('/Users/eduardoalberto/LoadFile/data.csv')    
genre = spark.read.option("header", "true").option('inferSchema', 'true').csv('/Users/eduardoalberto/LoadFile/archive/genres_v2.csv')
artists = spark.read.option("header", "true").option('inferSchema', 'true').csv('/Users/eduardoalberto/LoadFile/archive/artists.csv')
playlists = spark.read.option("header", "true").option('inferSchema', 'true').csv('/Users/eduardoalberto/LoadFile/archive/playlists.csv')
tracks = spark.read.option("header", "true").option('inferSchema', 'true').csv('/Users/eduardoalberto/LoadFile/archive/tracks.csv')

x = RunJob(music,genre,artists,playlists,tracks)
df = x.tracks()
df.show()  
    

PySpark Version :3.5.0
PySpark Version :3.5.0


[Stage 219:>                                                        (0 + 1) / 1]

+--------------------+--------------------+----------+-----------+--------+-------+----------+
|                  id|                name|popularity|duration_ms|explicit|artists|id_artists|
+--------------------+--------------------+----------+-----------+--------+-------+----------+
|35iwgR4jXetI318WE...|               Carve|         6|     126903|       0|artists|id artists|
|021ht4sdgPcrDgSk7...|Capítulo 2.16 - B...|         0|      98200|       0|artists|id artists|
|07A5yehtSnoedViJA...|Vivo para Querert...|         0|     181640|       0|artists|id artists|
|08FmqUhxtyLTn6pAh...|El Prisionero - R...|         0|     176907|       0|artists|id artists|
|08y9GfoqCWfOGsKdw...| Lady of the Evening|         0|     163080|       0|artists|id artists|
|0BRXJHRNGQ3W4v9fr...|           Ave Maria|         0|     178933|       0|artists|id artists|
|0Dd9ImXtAtGwsmsAD...|      La Butte Rouge|         0|     134467|       0|artists|id artists|
|0IA0Hju8CAgYfV1hw...|             La Java|       

                                                                                