# Convert Additional Files

The Million Song Dataset includes various other files that are not part of the standard dataset. These files come in a range of formats, such as .txt, .csv, .db and .h5, and are all lumped together even though they have different schemas. This notebook converts them to CSV and loads each one into our Intermediate Data Lake in its own subdirectory.

## Load Libraries & Set Variables

In [2]:
import boto3
import pandas as pd
import sqlite3
import os

# Bucket and key prefix that the additional files are read from.
source_bucket = 'millionsongdataset-raw'
source_prefix = 'AdditionalFiles'
# Bucket that the converted CSV files are written to, one subdirectory per file.
destination_bucket = 'millionsongdataset-intermediate'

# Low-level S3 client, used by the cells below to download the SQLite (.db) files.
s3 = boto3.client('s3')

## Artist Location Data

In [15]:
# Column names to apply to artist_location.txt, which is read without a header row.
artist_location_columns = ['artist_id', 'artist_latitude', 'artist_longitude', 'artist_name', 'artist_location']

# Fields are separated by the literal token "<SEP>". A separator longer than one
# character is treated as a regular expression and requires the python engine.
# keep_default_na=False stops pandas from converting text such as "NA", "None",
# "null" or "n/a" into missing values, which would be written out as empty fields
# and silently lose the original artist or location text.
artist_location = pd.read_csv(
    f's3://{source_bucket}/{source_prefix}/artist_location.txt',
    names = artist_location_columns,
    sep = "<SEP>",
    header = None,
    engine = 'python',
    keep_default_na = False,
)

# Write the converted file to its own subdirectory, without the DataFrame index.
artist_location.to_csv(f's3://{destination_bucket}/artist_location/artist_location.csv', index = False)

## Tracks Per Year

In [18]:
# Read tracks_per_year.txt, which is parsed without a header row and whose fields
# are separated by the literal token "<SEP>" (multi-character separators require
# the python engine).
# keep_default_na=False keeps artist/track names such as "NA", "None" or "null"
# as text instead of turning them into missing values that would be written out
# as empty fields.
tracks_per_year = pd.read_csv(
    f's3://{source_bucket}/{source_prefix}/tracks_per_year.txt',
    names = ['year', 'track_id', 'artist_name', 'track_name'],
    sep = "<SEP>",
    header = None,
    engine = 'python',
    keep_default_na = False,
)

# Write the converted file to its own subdirectory, without the DataFrame index.
tracks_per_year.to_csv(f's3://{destination_bucket}/tracks_per_year/tracks_per_year.csv', index = False)

## Unique Artists

In [21]:
# Read unique_artists.txt, which is parsed without a header row and whose fields
# are separated by the literal token "<SEP>" (multi-character separators require
# the python engine).
# keep_default_na=False keeps artist names such as "NA", "None" or "null" as text
# instead of turning them into missing values that would be written out as empty
# fields.
unique_artists = pd.read_csv(
    f's3://{source_bucket}/{source_prefix}/unique_artists.txt',
    names = ['artist_id', 'artist_mbid', 'track_id', 'artist_name'],
    sep = "<SEP>",
    header = None,
    engine = 'python',
    keep_default_na = False,
)

# Write the converted file to its own subdirectory, without the DataFrame index.
unique_artists.to_csv(f's3://{destination_bucket}/unique_artists/unique_artists.csv', index = False)

## Unique MB Tags

In [24]:
# Read unique_mbtags.txt into a single 'tag' column (no header row in the file).
# The "<SEP>" separator is kept for consistency with the other text files; a
# multi-character separator requires the python engine.
# keep_default_na=False matters here because tags are free-form text: values such
# as "nan", "null", "n/a" or "NA" would otherwise be parsed as missing and written
# out as empty rows.
unique_mb_tags = pd.read_csv(
    f's3://{source_bucket}/{source_prefix}/unique_mbtags.txt',
    names = ['tag'],
    sep = "<SEP>",
    header = None,
    engine = 'python',
    keep_default_na = False,
)

# Write the converted file to its own subdirectory, without the DataFrame index.
unique_mb_tags.to_csv(f's3://{destination_bucket}/unique_mbtags/unique_mbtags.csv', index = False)

## Unique Terms

In [26]:
# Read unique_terms.txt into a single 'term' column (no header row in the file).
# The "<SEP>" separator is kept for consistency with the other text files; a
# multi-character separator requires the python engine.
# keep_default_na=False matters here because terms are free-form text: values such
# as "nan", "null", "n/a" or "NA" would otherwise be parsed as missing and written
# out as empty rows.
unique_terms = pd.read_csv(
    f's3://{source_bucket}/{source_prefix}/unique_terms.txt',
    names = ['term'],
    sep = "<SEP>",
    header = None,
    engine = 'python',
    keep_default_na = False,
)

# Write the converted file to its own subdirectory, without the DataFrame index.
unique_terms.to_csv(f's3://{destination_bucket}/unique_terms/unique_terms.csv', index = False)

## Unique Tracks

In [28]:
# Read unique_tracks.txt, which is parsed without a header row and whose fields
# are separated by the literal token "<SEP>" (multi-character separators require
# the python engine).
# keep_default_na=False keeps artist names and song titles such as "NA", "None"
# or "null" as text instead of turning them into missing values that would be
# written out as empty fields.
unique_tracks = pd.read_csv(
    f's3://{source_bucket}/{source_prefix}/unique_tracks.txt',
    names = ['track_id', 'song_id', 'artist_name', 'song_title'],
    sep = "<SEP>",
    header = None,
    engine = 'python',
    keep_default_na = False,
)

# Write the converted file to its own subdirectory, without the DataFrame index.
unique_tracks.to_csv(f's3://{destination_bucket}/unique_tracks/unique_tracks.csv', index = False)

## Artist Similarity

In [6]:
# Export the `similarity` table of artist_similarity.db as CSV.
# sqlite3 opens local files only, so the database is first downloaded into the
# working directory.
s3.download_file(source_bucket, f'{source_prefix}/artist_similarity.db', 'artist_similarity.db')

try:
    conn = sqlite3.connect('artist_similarity.db')
    try:
        artist_similarity = pd.read_sql_query("SELECT * FROM similarity", conn)
    finally:
        # Release the connection even if the query fails.
        conn.close()

    artist_similarity.to_csv(f's3://{destination_bucket}/artist_similarity/artist_similarity.csv', index = False)
finally:
    # Remove the downloaded copy whether or not the export succeeded, so a failed
    # run does not leave the database file behind.
    os.remove('artist_similarity.db')

## Artist Term

In [14]:
# Export the `artist_mbtag` and `artist_term` tables of artist_term.db as two
# separate CSV files.
# sqlite3 opens local files only, so the database is first downloaded into the
# working directory.
s3.download_file(source_bucket, f'{source_prefix}/artist_term.db', 'artist_term.db')

try:
    conn = sqlite3.connect('artist_term.db')
    try:
        # Both tables are read over the same connection before it is closed.
        artist_mbtag = pd.read_sql_query("SELECT * FROM artist_mbtag", conn)
        artist_term = pd.read_sql_query("SELECT * FROM artist_term", conn)
    finally:
        # Release the connection even if a query fails.
        conn.close()

    artist_mbtag.to_csv(f's3://{destination_bucket}/artist_mbtag/artist_mbtag.csv', index = False)
    artist_term.to_csv(f's3://{destination_bucket}/artist_term/artist_term.csv', index = False)
finally:
    # Remove the downloaded copy whether or not the export succeeded, so a failed
    # run does not leave the database file behind.
    os.remove('artist_term.db')


## Track MetaData

In [15]:
# Export the `songs` table of track_metadata.db as CSV.
# sqlite3 opens local files only, so the database is first downloaded into the
# working directory.
s3.download_file(source_bucket, f'{source_prefix}/track_metadata.db', 'track_metadata.db')

try:
    conn = sqlite3.connect('track_metadata.db')
    try:
        track_metadata = pd.read_sql_query("SELECT * FROM songs", conn)
    finally:
        # Release the connection even if the query fails.
        conn.close()

    track_metadata.to_csv(f's3://{destination_bucket}/track_metadata/track_metadata.csv', index = False)
finally:
    # Remove the downloaded copy whether or not the export succeeded, so a failed
    # run does not leave the database file behind.
    os.remove('track_metadata.db')

## Lyrics

In [3]:
# Export the `lyrics` table of mxm_dataset.db as CSV.
# NOTE: unlike the other files in this notebook, this database is read from its
# own bucket (root of the bucket, no prefix), not from source_bucket/source_prefix.
lyrics_source_bucket = 'millionsongdataset-associateddatasets'

# sqlite3 opens local files only, so the database is first downloaded into the
# working directory.
s3.download_file(lyrics_source_bucket, 'mxm_dataset.db', 'mxm_dataset.db')

try:
    conn = sqlite3.connect('mxm_dataset.db')
    try:
        lyrics = pd.read_sql_query("SELECT * FROM lyrics", conn)
    finally:
        # Release the connection even if the query fails.
        conn.close()

    lyrics.to_csv(f's3://{destination_bucket}/lyrics/lyrics.csv', index = False)
finally:
    # Remove the downloaded copy whether or not the export succeeded, so a failed
    # run does not leave the database file behind.
    os.remove('mxm_dataset.db')