In [1]:
import pandas as pd
import nltk
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle
import csv
from sklearn.model_selection import train_test_split   # Import the train_test_split function

In [2]:
# Open the first CSV file for reading
with open('split_csv/output_file1.csv', 'r', newline='') as file1:
    reader1 = csv.reader(file1)
    data1 = list(reader1)

# Open the second CSV file for reading
with open('split_csv/output_file2.csv', 'r', newline='') as file2:
    reader2 = csv.reader(file2)
    data2 = list(reader2)

# Combine the data from both files
combined_data = data1 + data2

# Create a new CSV file to write the combined data
with open('merged_data.csv', 'w', newline='') as output_file:
    writer = csv.writer(output_file)
    writer.writerows(combined_data)
df = pd.read_csv('merged_data.csv')
df.head(5)

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [3]:
df.isnull().sum()

artist    0
song      0
link      0
text      0
dtype: int64

In [4]:
df = df.sample(5000).drop('link', axis=1).reset_index(drop=True)
df.head(5)

Unnamed: 0,artist,song,text
0,Morrissey,Hold On To Your Friends,A bond of trust \r\nHas been abused \r\nSome...
1,Alabama,I Showed Her,She told me I should straighten up my ways or ...
2,Nitty Gritty Dirt Band,Cupid's Got A Gun,So many lonely people looking for a way \r\nT...
3,"Harry Connick, Jr.",She Belongs To Me,I met her... But I never thought \r\nShe'd be...
4,Jimmy Buffett,Great Heart,This world is full of strange behavior \r\nEv...


In [5]:
df['text'] = df['text'].str.lower().replace(r'^a-ZA-Z0-9','').replace(r'\n','',regex=True)

In [6]:
df.head(5)

Unnamed: 0,artist,song,text
0,Morrissey,Hold On To Your Friends,a bond of trust \rhas been abused \rsomethin...
1,Alabama,I Showed Her,she told me i should straighten up my ways or ...
2,Nitty Gritty Dirt Band,Cupid's Got A Gun,so many lonely people looking for a way \rto ...
3,"Harry Connick, Jr.",She Belongs To Me,i met her... but i never thought \rshe'd belo...
4,Jimmy Buffett,Great Heart,this world is full of strange behavior \rever...


In [7]:
stemmer = PorterStemmer()

In [8]:
def token(x):
    token = nltk.word_tokenize(x)
    y = [stemmer.stem(i) for i in token]
    return " ".join(y)

In [9]:
df['text'] = df['text'].apply(lambda x: token(x))

In [10]:
tfid = TfidfVectorizer(analyzer='word',stop_words='english')

In [11]:
matrix = tfid.fit_transform(df['text'])

In [12]:
similarity = cosine_similarity(matrix)

In [13]:
df[df['song']=='Hold On To Your Friends'].index[0]

0

In [14]:
def recommender(song):
    idx = df[df['song']==song].index[0]
    distance = sorted(list(enumerate(similarity[idx])), reverse=True, key=lambda x:x[1])
    song= []
    for song_id in distance[1:5]:
        song.append(df.iloc[song_id[0]].song)
    return song

In [15]:
recommender("She Belongs To Me")

['Belong To The World', 'She Wants You Back', 'Mama', 'Cry No More']

In [None]:
pickle.dump(similarity, open('similarity.pkl', 'wb'))
pickle.dump(df, open('df.pkl', 'wb'))