In [None]:
from google.colab import drive
import os

# Shared project folder on Google Drive that the notebook switches into below.
gdrive_path = '/content/gdrive/MyDrive/Bertopic/shared_work/'

# (Re)mount Google Drive; its contents then appear under 'MyDrive'.
drive.mount('/content/gdrive', force_remount=True)

# Make the project folder the working directory so its files can be accessed.
os.chdir(gdrive_path)

# Print the folder contents in sorted order so missing files can be spotted by eye.
print(sorted(os.listdir(os.curdir)))

Mounted at /content/gdrive


FileNotFoundError: [Errno 2] No such file or directory: '/content/gdrive/MyDrive/Bertopic/shared_work/'

In [None]:
# Load the songs dataset (tab-separated file) and list the columns it provides.
import pandas as pd

dsongs = pd.read_csv(
    filepath_or_buffer='/content/gdrive/MyDrive/Bertopic/Wasabi/wasabi_songs.csv',
    sep='\t',
    low_memory=False,
)
dsongs.columns.tolist()

In [None]:
# Columns kept for the songs dataset; this list can be extended or trimmed later.
column_list_for_english_songs = ['artist', 'genre', 'language', 'language_detect', 'title', 'explicit_content_lyrics_predicted']

# Keep only rows whose 'language_detect' value is 'english', restricted to the columns above.
dsongs_English = dsongs.loc[
    dsongs['language_detect'].eq('english'),
    column_list_for_english_songs,
]
print(dsongs_English.shape[0])

# Write the reduced songs table to the shared project folder.
dsongs_English.to_csv(
    path_or_buf='/content/gdrive/MyDrive/Bertopic/shared_work/wasabi_english_songs.csv',
)

In [None]:
# Load the artists dataset and list the columns it provides.
import pandas as pd

dartists = pd.read_csv(
    filepath_or_buffer='/content/gdrive/MyDrive/Bertopic/Wasabi/wasabi_artists.csv',
    low_memory=False,
)
dartists.columns.tolist()

In [None]:
# Keep only artists with a recorded gender (rows where 'gender' is missing are dropped),
# then show how many artists there are per gender value.
dartists_cleaned = dartists[dartists['gender'].notna()]
dartists_cleaned.value_counts(subset=['gender'])

In [None]:
# Columns kept for the artists dataset; this list can be extended or trimmed later.
column_list_for_dartists = ['name', 'gender']

# Keep only rows whose 'members' value is the literal string '[]'
# (presumably solo artists, i.e. artists without listed members), restricted to the columns above.
dartists_cleaned_solo = dartists_cleaned.loc[
    dartists_cleaned['members'].eq('[]'),
    column_list_for_dartists,
]

In [None]:
# Gender distribution of the artists table after the gender and 'members' filters.
dartists_cleaned_solo.value_counts(subset=['gender'])

In [None]:
# The artists table does not fully specify the language an artist sings in, so the
# filter goes from songs to artists: collect the artist names that appear in the
# English songs table and keep only the artists whose name is among them.
idxs = dsongs_English['artist'].tolist()

dartists_cleaned_solo = dartists_cleaned_solo[dartists_cleaned_solo['name'].isin(idxs)]

In [None]:
# Gender distribution after keeping only artists that appear in the English songs table.
dartists_cleaned_solo.value_counts(subset=['gender'])

In [None]:
# Save the filtered artists dataset to the shared project folder.
dartists_cleaned_solo.to_csv(
    path_or_buf='/content/gdrive/MyDrive/Bertopic/shared_work/wasabi_artist_new_dartists.csv',
)

In [None]:
# Split the filtered artists by gender value.
df_male = dartists_cleaned_solo[dartists_cleaned_solo['gender'] == 'Male']
df_female = dartists_cleaned_solo[dartists_cleaned_solo['gender'] == 'Female']

# Determine the smaller size among the two groups
min_size = min(len(df_male), len(df_female))

# Downsample both groups to the same size. A fixed random_state makes the draw
# reproducible: without it every run selects different artists, so the saved
# balanced file (and every file derived from it) changes from run to run.
df_male_balanced = df_male.sample(n=min_size, random_state=42)
df_female_balanced = df_female.sample(n=min_size, random_state=42)

# Combine the balanced datasets and save them to the shared project folder.
df_balanced = pd.concat([df_male_balanced, df_female_balanced])
df_balanced.to_csv('/content/gdrive/MyDrive/Bertopic/shared_work/wasabi_new_dartists_balanced.csv')

In [None]:
# Gender distribution of the balanced artists table.
df_balanced.value_counts(subset=['gender'])

In [None]:
# Number of rows in the English songs table.
print(dsongs_English.shape[0])

In [None]:
# Keep only the English songs whose artist name appears in the balanced artists
# table (the one saved as wasabi_new_dartists_balanced.csv).
idxs2 = df_balanced['name'].tolist()

dsongs2_English = dsongs_English[dsongs_English['artist'].isin(idxs2)]

In [None]:
# Number of English songs left after restricting to the balanced artists.
print(dsongs2_English.shape[0])

In [None]:
# Save the English songs of the balanced artists to the shared project folder.
dsongs2_English.to_csv(
    path_or_buf='/content/gdrive/MyDrive/Bertopic/shared_work/wasabi_new_songs_english_balanced_artists.csv',
)

In [None]:
import requests

class MusixMatch:
    """Small client for the Musixmatch web API (version 1.1 endpoints).

    Only the two calls used in this notebook are wrapped: looking up a track id
    by title and fetching the lyrics body for a track id.
    """

    def __init__(self, api_key, timeout=10):
        """Store credentials and connection settings.

        Args:
            api_key: Musixmatch API key. Surrounding whitespace is removed,
                because a stray tab or newline picked up while pasting would
                otherwise be sent as part of the key.
            timeout: Seconds to wait for each HTTP request before giving up.
        """
        self.api_key = api_key.strip() if isinstance(api_key, str) else api_key
        # HTTPS, so the API key in the query string is not sent in clear text.
        self.base_url = "https://api.musixmatch.com/ws/1.1/"
        self.timeout = timeout

    def _make_request(self, method, params):
        """Call an API method and return the decoded JSON response.

        Args:
            method: API method name appended to the base URL (e.g. 'track.search').
            params: Query parameters; the dict passed in is not modified.

        Raises:
            Whatever `requests` raises for transport errors, timeouts and
            non-2xx HTTP responses (via `raise_for_status`).
        """
        # Work on a copy so the caller's dict does not silently gain an 'apikey' entry.
        query = dict(params)
        query['apikey'] = self.api_key
        response = requests.get(self.base_url + method, params=query, timeout=self.timeout)
        response.raise_for_status()
        return response.json()

    @staticmethod
    def _message_body(data):
        """Return the response body if it is a dict, otherwise None."""
        body = data['message']['body']
        return body if isinstance(body, dict) else None

    def search_track(self, track_name, artist_name=None):
        """Return the track id of the best-rated match, or None if nothing matches.

        Args:
            track_name: Song title to search for.
            artist_name: Optional artist name to narrow the search; titles alone
                are often ambiguous.

        Raises:
            RuntimeError: If the response carries no usable body. The API is
                expected to answer application-level failures (invalid key,
                exhausted quota, ...) with a non-dict body; indexing it used
                to fail with an unrelated TypeError.
        """
        method = 'track.search'
        params = {'q_track': track_name, 'page_size': 1, 'page': 1, 's_track_rating': 'desc'}
        if artist_name:
            params['q_artist'] = artist_name
        data = self._make_request(method, params)
        body = self._message_body(data)
        if body is None:
            status = data['message'].get('header', {}).get('status_code')
            raise RuntimeError(f"Musixmatch {method} failed (status_code={status})")
        track_list = body.get('track_list') or []
        return track_list[0]['track']['track_id'] if track_list else None

    def get_lyrics(self, track_id):
        """Return the lyrics text for `track_id`, or None if the response has none.

        An empty or non-dict body, or a body without a 'lyrics' entry, yields None.
        """
        method = 'track.lyrics.get'
        params = {'track_id': track_id}
        data = self._make_request(method, params)
        body = self._message_body(data)
        if not body:
            return None
        lyrics = body.get('lyrics')
        return lyrics.get('lyrics_body') if isinstance(lyrics, dict) else None

# Credentials must not live in the notebook: read the Musixmatch API key from the
# MUSIXMATCH_API_KEY environment variable, or prompt for it if the variable is
# not set. The key is stripped because stray whitespace (the previous literal
# ended in two tab characters) would be sent as part of the key.
# NOTE(review): the key that used to be hard-coded here has been exposed and
# should be rotated.
import os
from getpass import getpass

api_key = (os.environ.get('MUSIXMATCH_API_KEY') or getpass('Musixmatch API key: ')).strip()
musixmatch = MusixMatch(api_key)

# Smoke test: search for a track by title and print its lyrics.
track_name = "Shape of You"
track_id = musixmatch.search_track(track_name)
if track_id:
    lyrics = musixmatch.get_lyrics(track_id)
    print("Lyrics:\n", lyrics)
else:
    print("Track not found.")

# get the lyrics

In [None]:
import pandas as pd

def append_lyrics_to_csv(csv_file_path, api_key, start_row=0, save_every=1):
    """Fill the 'lyrics_addbyus' column of a songs CSV with Musixmatch lyrics.

    The CSV is read, every row from `start_row` on that has no value in
    'lyrics_addbyus' is looked up by its 'title', and the file is rewritten in
    place so that an interrupted run can be resumed later.

    Args:
        csv_file_path: Path of the CSV file to read and overwrite.
        api_key: Musixmatch API key.
        start_row: Index label from which to start; earlier rows are skipped.
        save_every: Write the file after this many processed rows (default 1,
            i.e. after every row). Pending rows are also written when the loop
            ends or is interrupted by an exception.
    """
    import os  # local import: only needed for the atomic file replacement below

    musixmatch = MusixMatch(api_key)
    df = pd.read_csv(csv_file_path)

    # Add a 'lyrics_addbyus' column if it doesn't exist
    if 'lyrics_addbyus' not in df.columns:
        df['lyrics_addbyus'] = pd.NA
    # A column that is still completely empty is read back as float64; cast it
    # to object so that lyrics strings can be stored without dtype errors.
    df['lyrics_addbyus'] = df['lyrics_addbyus'].astype(object)

    save_every = max(1, int(save_every))

    def _save():
        # Write to a temporary file first and then swap it in, so an
        # interruption during the write cannot leave a truncated CSV behind.
        tmp_path = f"{csv_file_path}.tmp"
        df.to_csv(tmp_path, index=False)
        os.replace(tmp_path, csv_file_path)

    pending = 0
    try:
        for i, row in df.iterrows():
            if i < start_row:
                continue  # Skip rows until the starting row is reached

            # Rows that already have a value are left untouched.
            if not pd.isna(df.at[i, 'lyrics_addbyus']):
                continue

            print(i)
            track_name = row['title']  # Replace 'title' with your actual column name if different
            # A missing title cannot be searched for; treat it as "not found"
            # instead of sending a NaN query to the API.
            track_id = None if pd.isna(track_name) else musixmatch.search_track(track_name)
            if track_id:
                df.at[i, 'lyrics_addbyus'] = musixmatch.get_lyrics(track_id)
            else:
                df.at[i, 'lyrics_addbyus'] = "Lyrics Not Found"

            pending += 1
            if pending >= save_every:
                _save()
                pending = 0
    finally:
        # Persist whatever has been fetched so far, even if a request failed.
        if pending:
            _save()

# Example usage
import os
from getpass import getpass

# Songs file written earlier in this notebook. Google Drive is mounted at
# /content/gdrive, so the previous '/content/drive/...' path does not exist in
# this runtime.
# NOTE(review): confirm this is the file that should receive the lyrics.
csv_file_path = '/content/gdrive/MyDrive/Bertopic/shared_work/wasabi_new_songs_english_balanced_artists.csv'
# Credentials must not live in the notebook: read the Musixmatch API key from the
# MUSIXMATCH_API_KEY environment variable, or prompt for it if the variable is not set.
api_key = (os.environ.get('MUSIXMATCH_API_KEY') or getpass('Musixmatch API key: ')).strip()
start_row = 8276  # Row from which to start fetching lyrics; update it each time the code is re-run
append_lyrics_to_csv(csv_file_path, api_key, start_row)