In [98]:
pip install pyspark



## **Dependencies**

In [99]:
from google.colab import drive
from pyspark.sql import SparkSession
from pyspark.sql.functions import lower, regexp_replace ,udf ,split
from pyspark.sql.types import StringType
from pyspark.ml.feature import Tokenizer, StopWordsRemover
import pandas as pd
from pyspark.sql.functions import col, count, when, isnan, sum
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
nltk.download('punkt')
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## **Importing Dataset**


In [100]:
# Read the raw song/lyrics CSV into pandas.
# NOTE(review): assumes the file has been placed at /content/songdata.csv beforehand — confirm.
df1 = pd.read_csv('/content/songdata.csv')

## **Data Exploration/Preparation**

In [102]:
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("SongDataApp")
    .getOrCreate()
)

In [103]:
df = spark.createDataFrame(df1)

In [104]:
# Ten rows from the end of the artist alphabet: sort by artist descending, keep the first 10.
df_tail = df.orderBy('artist', ascending=False).limit(10)
df_tail.show()

+------+-------------------+--------------------+--------------------+
|artist|               song|                link|                text|
+------+-------------------+--------------------+--------------------+
|  Zwan|   Baby Let's Rock!|/z/zwan/baby+lets...|Baby Let's Rock! ...|
|  Zwan|       Come With Me|/z/zwan/come+with...|all you need  \ni...|
|  Zwan|        To Love You|/z/zwan/to+love+y...|In overdrive  \nI...|
|  Zwan|             Desire|/z/zwan/desire_20...|northern star  \n...|
|  Zwan|             El Sol|/z/zwan/el+sol_20...|empty-armed  \nan...|
|  Zwan|          Heartsong|/z/zwan/heartsong...|come in  \nmake y...|
|  Zwan|     Endless Summer|/z/zwan/endless+s...|there used to be ...|
|  Zwan|           Honestly|/z/zwan/honestly_...|I believe  \nI be...|
|  Zwan|              Lyric|/z/zwan/lyric_201...|here comes my fai...|
|  Zwan|Number Of The Beast|/z/zwan/number+of...|Left alone, my mi...|
+------+-------------------+--------------------+--------------------+



In [105]:
# Keep at most 5000 rows from here on.
# NOTE(review): presumably to keep the later pandas / pairwise-similarity steps small — confirm.
df=df.limit(5000)

In [106]:
df.show()

+------+--------------------+--------------------+--------------------+
|artist|                song|                link|                text|
+------+--------------------+--------------------+--------------------+
|  ABBA|Ahe's My Kind Of ...|/a/abba/ahes+my+k...|Look at her face,...|
|  ABBA|    Andante, Andante|/a/abba/andante+a...|Take it easy with...|
|  ABBA|      As Good As New|/a/abba/as+good+a...|I'll never know w...|
|  ABBA|                Bang|/a/abba/bang_2059...|Making somebody h...|
|  ABBA|    Bang-A-Boomerang|/a/abba/bang+a+bo...|Making somebody h...|
|  ABBA|  Burning My Bridges|/a/abba/burning+m...|Well, you hoot an...|
|  ABBA|           Cassandra|/a/abba/cassandra...|Down in the stree...|
|  ABBA|          Chiquitita|/a/abba/chiquitit...|Chiquitita, tell ...|
|  ABBA|         Crazy World|/a/abba/crazy+wor...|I was out with th...|
|  ABBA|     Crying Over You|/a/abba/crying+ov...|I'm waitin' for y...|
|  ABBA|               Dance|/a/abba/dance_100...|Oh, my love it

In [None]:
# Print DataFrame schema
df.printSchema()

In [110]:
# Drop 'link' column
df = df.drop('link')

In [111]:
# Count NULLs per column: isNull() is cast to 0/1 and added up.
# `sum` here is the Spark aggregate imported from pyspark.sql.functions, not Python's builtin.
null_counts = df.select(
    *(sum(col(name).isNull().cast("int")).alias(name) for name in df.columns)
)
null_counts.show()

+------+----+----+
|artist|song|text|
+------+----+----+
|     0|   0|   0|
+------+----+----+



In [113]:
# Lowercase the lyrics and replace every character that is neither a word
# character nor whitespace (i.e. punctuation) with a space.
# Newlines match \s, so they are NOT removed here; a later cell strips them.
df = df.withColumn('text', regexp_replace(lower(col('text')), r'[^\w\s]', ' '))
df.show()

+------+--------------------+--------------------+
|artist|                song|                text|
+------+--------------------+--------------------+
|  ABBA|Ahe's My Kind Of ...|look at her face ...|
|  ABBA|    Andante, Andante|take it easy with...|
|  ABBA|      As Good As New|i ll never know w...|
|  ABBA|                Bang|making somebody h...|
|  ABBA|    Bang-A-Boomerang|making somebody h...|
|  ABBA|  Burning My Bridges|well  you hoot an...|
|  ABBA|           Cassandra|down in the stree...|
|  ABBA|          Chiquitita|chiquitita  tell ...|
|  ABBA|         Crazy World|i was out with th...|
|  ABBA|     Crying Over You|i m waitin  for y...|
|  ABBA|               Dance|oh  my love it ma...|
|  ABBA|       Dancing Queen|you can dance  yo...|
|  ABBA|         Disillusion|changing  moving ...|
|  ABBA|Does Your Mother ...|you re so hot  te...|
|  ABBA|         Dream World|agnetha we re not...|
|  ABBA|      Dum Dum Diddle|i can hear how yo...|
|  ABBA|               Eagle|th

In [114]:
df.select('text').head(1)

[Row(text='look at her face  it s a wonderful face  \nand it means something special to me  \nlook at the way that she smiles when she sees me  \nhow lucky can one fellow be   \n  \nshe s just my kind of girl  she makes me feel fine  \nwho could ever believe that she could be mine   \nshe s just my kind of girl  without her i m blue  \nand if she ever leaves me what could i do  what could i do   \n  \nand when we go for a walk in the park  \nand she holds me and squeezes my hand  \nwe ll go on walking for hours and talking  \nabout all the things that we plan  \n  \nshe s just my kind of girl  she makes me feel fine  \nwho could ever believe that she could be mine   \nshe s just my kind of girl  without her i m blue  \nand if she ever leaves me what could i do  what could i do \n\n')]

In [115]:
# Replace each newline character ("\n") remaining in the lyrics with a space
df = df.withColumn('text', regexp_replace(col('text'), r'\n', ' '))
df.select('text').head(1)


[Row(text='look at her face  it s a wonderful face   and it means something special to me   look at the way that she smiles when she sees me   how lucky can one fellow be       she s just my kind of girl  she makes me feel fine   who could ever believe that she could be mine    she s just my kind of girl  without her i m blue   and if she ever leaves me what could i do  what could i do       and when we go for a walk in the park   and she holds me and squeezes my hand   we ll go on walking for hours and talking   about all the things that we plan      she s just my kind of girl  she makes me feel fine   who could ever believe that she could be mine    she s just my kind of girl  without her i m blue   and if she ever leaves me what could i do  what could i do   ')]

## Feature Engineering


* Create TF-IDF features
* Compute the pairwise cosine-similarity matrix



In [116]:
# Initialize the Porter stemmer from NLTK
# PorterStemmer is a stemming algorithm that reduces words to their base or root form


In [117]:
stemmer = PorterStemmer()

In [118]:
def tokenize_text(text):
    """Tokenize a lyrics string with NLTK and Porter-stem every token.

    Parameters
    ----------
    text : str or None
        Lyrics to process. ``None`` is what a Python UDF receives for a SQL NULL.

    Returns
    -------
    str or None
        The stemmed tokens joined by single spaces, or ``None`` when ``text``
        is ``None`` so that NULL rows stay NULL.
    """
    # word_tokenize raises on None, which would fail the whole Spark job
    # as soon as a single row has a NULL 'text'.
    if text is None:
        return None
    return " ".join(stemmer.stem(token) for token in word_tokenize(text))

In [119]:
# Wrap tokenize_text as a Spark UDF whose declared return type is string
stem_udf = udf(tokenize_text, StringType())

In [120]:
# Overwrite 'text' with its tokenized-and-stemmed form produced by stem_udf
df = df.withColumn('text', stem_udf(col('text')))


In [121]:
# Show the DataFrame with tokenized and stemmed text
df.select('text').show()

+--------------------+
|                text|
+--------------------+
|look at her face ...|
|take it easi with...|
|i ll never know w...|
|make somebodi hap...|
|make somebodi hap...|
|well you hoot and...|
|down in the stree...|
|chiquitita tell m...|
|i wa out with the...|
|i m waitin for yo...|
|oh my love it mak...|
|you can danc you ...|
|chang move in a c...|
|you re so hot tea...|
|agnetha we re not...|
|i can hear how yo...|
|they came fli fro...|
|everi good man ne...|
|can you hear the ...|
|pued escuchar fer...|
+--------------------+
only showing top 20 rows



In [123]:
df.show() # Final cleaned dataframe after Pre-processing

+------+--------------------+--------------------+
|artist|                song|                text|
+------+--------------------+--------------------+
|  ABBA|Ahe's My Kind Of ...|look at her face ...|
|  ABBA|    Andante, Andante|take it easi with...|
|  ABBA|      As Good As New|i ll never know w...|
|  ABBA|                Bang|make somebodi hap...|
|  ABBA|    Bang-A-Boomerang|make somebodi hap...|
|  ABBA|  Burning My Bridges|well you hoot and...|
|  ABBA|           Cassandra|down in the stree...|
|  ABBA|          Chiquitita|chiquitita tell m...|
|  ABBA|         Crazy World|i wa out with the...|
|  ABBA|     Crying Over You|i m waitin for yo...|
|  ABBA|               Dance|oh my love it mak...|
|  ABBA|       Dancing Queen|you can danc you ...|
|  ABBA|         Disillusion|chang move in a c...|
|  ABBA|Does Your Mother ...|you re so hot tea...|
|  ABBA|         Dream World|agnetha we re not...|
|  ABBA|      Dum Dum Diddle|i can hear how yo...|
|  ABBA|               Eagle|th

In [126]:
# Show the DataFrame with the array column
# NOTE(review): no cell in this notebook converts 'text' to an array, and the
# execution counts jump from 123 to 126 — the cell that did it appears to have been
# deleted. TODO: restore it so Restart & Run All reproduces this output.

df.select("text").show()

+--------------------+
|                text|
+--------------------+
|[look at her face...|
|[take it easi wit...|
|[i ll never know ...|
|[make somebodi ha...|
|[make somebodi ha...|
|[well you hoot an...|
|[down in the stre...|
|[chiquitita tell ...|
|[i wa out with th...|
|[i m waitin for y...|
|[oh my love it ma...|
|[you can danc you...|
|[chang move in a ...|
|[you re so hot te...|
|[agnetha we re no...|
|[i can hear how y...|
|[they came fli fr...|
|[everi good man n...|
|[can you hear the...|
|[pued escuchar fe...|
+--------------------+
only showing top 20 rows



In [127]:
# Collect the Spark DataFrame into a local pandas DataFrame for the scikit-learn steps below
pandas_df=df.toPandas() # Converting to pandas Dataframe

In [128]:
pandas_df

Unnamed: 0,artist,song,text
0,ABBA,Ahe's My Kind Of Girl,[look at her face it s a wonder face and it me...
1,ABBA,"Andante, Andante",[take it easi with me pleas touch me gentli li...
2,ABBA,As Good As New,[i ll never know whi i had to go whi i had to ...
3,ABBA,Bang,[make somebodi happi is a question of give and...
4,ABBA,Bang-A-Boomerang,[make somebodi happi is a question of give and...
...,...,...,...
4995,Elvis Costello,Love For Tender,[you won t take my love for tender you can put...
4996,Elvis Costello,Love Went Mad,[i ve look at it everi way i can from under an...
4997,Elvis Costello,Lover's Walk,[i won t walk with my head bow be on beyond ca...
4998,Elvis Costello,Luxembourg,[dress up like a dog s dinner butter wouldn t ...


In [130]:
# Word-level TF-IDF features, dropping scikit-learn's built-in English stop-word list.
# NOTE(review): the lyrics were stemmed earlier, so stemmed stop words (e.g. "wa",
# visible in the previews above) may not match that list — confirm this is intended.
tfidvector = TfidfVectorizer(stop_words='english', analyzer='word')
matrix = tfidvector.fit_transform(pandas_df['text'])  # learn the vocabulary and build the TF-IDF matrix


In [131]:
# Compute the pairwise cosine similarity between the TF-IDF rows (one row per song)
similarity = cosine_similarity(matrix)
similarity

array([[1.        , 0.03571458, 0.01274887, ..., 0.06065868, 0.05687335,
        0.13438459],
       [0.03571458, 1.        , 0.00759337, ..., 0.00483217, 0.01919634,
        0.0034971 ],
       [0.01274887, 0.00759337, 1.        , ..., 0.00726643, 0.00871825,
        0.03042829],
       ...,
       [0.06065868, 0.00483217, 0.00726643, ..., 1.        , 0.0582164 ,
        0.01661463],
       [0.05687335, 0.01919634, 0.00871825, ..., 0.0582164 , 1.        ,
        0.08982805],
       [0.13438459, 0.0034971 , 0.03042829, ..., 0.01661463, 0.08982805,
        1.        ]])

## Recommendation Function

In [132]:
def recommendation(song_df, top_n=20, data=None, sim_matrix=None):
    """Return the titles of the songs most similar to a given song.

    Parameters
    ----------
    song_df : str
        Title to look up in the ``song`` column. If the title occurs more
        than once, the first occurrence is used.
    top_n : int, optional
        Maximum number of titles to return. Defaults to 20, the size the
        function always returned before.
    data : pandas.DataFrame, optional
        Frame with a ``song`` column. Defaults to the notebook-level
        ``pandas_df``.
    sim_matrix : 2-D array-like, optional
        Square similarity matrix whose row/column order matches the row
        order of ``data``. Defaults to the notebook-level ``similarity``.

    Returns
    -------
    list of str
        Up to ``top_n`` titles, most similar first. The queried row itself is
        never included.

    Raises
    ------
    IndexError
        If ``song_df`` is not present in the ``song`` column (same exception
        type as before, now with an explanatory message).
    ValueError
        If ``top_n`` is negative.
    """
    if data is None:
        data = pandas_df
    if sim_matrix is None:
        sim_matrix = similarity
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    # Work with positions, not index labels: the similarity matrix is
    # positional, so a non-default DataFrame index must not leak into it.
    titles = data['song'].tolist()
    try:
        idx = titles.index(song_df)
    except ValueError:
        raise IndexError(f"song {song_df!r} not found in the 'song' column") from None

    # Most similar first; sorted() is stable, so ties keep their row order.
    ranked = sorted(enumerate(sim_matrix[idx]), key=lambda pair: pair[1], reverse=True)

    # Drop the queried row explicitly instead of assuming it sorts first:
    # a song with identical lyrics also scores 1.0 and may precede it.
    return [titles[pos] for pos, _ in ranked if pos != idx][:top_n]

In [133]:
## Demo inputs
# Song titles to try with recommendation(). The block below is a bare string
# literal, not a comment, so the notebook echoes it as this cell's output.
# NOTE(review): presumably all of these titles occur in the 5000-row subset — only
# 'Love You Till Tuesday' is exercised in the next cell; verify the rest.
''' 'Endless Love',
 'Boom Boom Boom',
 'Easy Ride',
 "Boom, Boom, Ain't It Great To Be Crazy?",
 'Bang-Bang',
 'Boom Boom',
 'History Is Made By Stupid People',
 'Bang A Drum',
 'How To Be Dumb',
 'Love You Till Tuesday',
 'Pop Style',
 'The Ants Go Marching',
 "I'll Kiss You",
 'X Marks The Spot' '''


' \'Endless Love\',\n \'Boom Boom Boom\',\n \'Easy Ride\',\n "Boom, Boom, Ain\'t It Great To Be Crazy?",\n \'Bang-Bang\',\n \'Boom Boom\',\n \'History Is Made By Stupid People\',\n \'Bang A Drum\',\n \'How To Be Dumb\',\n \'Love You Till Tuesday\',\n \'Pop Style\',\n \'The Ants Go Marching\',\n "I\'ll Kiss You",\n \'X Marks The Spot\' '

In [134]:
recommendation('Love You Till Tuesday')  # calling Function

['Be With Me',
 'No Fortune',
 'Where We Land',
 'Learn To Croon',
 'Games People Play',
 'When The Cows Come Home',
 'A Casual Look',
 'Rolling Down Da Street',
 'Best I Ever Had',
 'The Prime Of Your Love',
 'I Just Got My Pay',
 'Last Time I Saw Him',
 'I Love You',
 'See All Her Faces',
 'Perhaps, Perhaps',
 'Love To Love You Baby',
 'Give Up',
 "I Can't Wait",
 'I Love Her I Love Her',
 'Fool Til The End']