# Load data

In [1]:
import os
import sqlite3
import string

import matplotlib.pyplot as plt
import pandas as pd

def has_swear_word(text, word_list=None):
    """Report whether ``text`` contains a swear word as a whole token.

    The text is split on whitespace and every token is compared
    case-insensitively against the word list. A token is tried both as-is and
    with leading/trailing ASCII punctuation removed, so "damn," and "DAMN!"
    match the entry "damn" while "condamnation" does not.

    Parameters
    ----------
    text : str or None
        Text to inspect. ``None`` and any other non-string value (for example
        NaN) is treated as containing no swear word.
    word_list : iterable of str, optional
        Words to look for. Defaults to the module-level ``swear_words`` list.

    Returns
    -------
    bool
        True if at least one token matches an entry of the word list.
    """
    # Missing values cannot contain a word; this also avoids calling
    # .split() on a float NaN.
    if not isinstance(text, str):
        return False

    if word_list is None:
        word_list = swear_words

    # Lower-case the list once instead of once per token. Empty entries (blank
    # lines in the source list) are skipped so that a token made only of
    # punctuation cannot match them after stripping.
    blocked = {entry.lower() for entry in word_list if entry}

    for token in text.split():
        lowered = token.lower()
        # Try the raw token first so list entries that themselves contain
        # punctuation still match, then retry without surrounding punctuation.
        if lowered in blocked or lowered.strip(string.punctuation) in blocked:
            return True
    return False

# Read the swear word list: one entry per line, surrounding whitespace removed
with open("swear_word_list.txt", 'r') as word_file:
    swear_words = []
    for raw_line in word_file:
        swear_words.append(raw_line.strip())

# Open the SQLite database and load each table into its own DataFrame
con = sqlite3.connect("data/database.db")

artist_df = pd.read_sql("SELECT * from Artist", con=con)
album_df = pd.read_sql("SELECT * FROM Album", con=con)
track_df = pd.read_sql("SELECT * FROM Track", con=con)

# Drop null or empty values

In [2]:
# Empty strings count as missing: turn them into NaN so the dropna() below
# discards them together with the real nulls.
track_df.replace(to_replace="", value=float("NaN"), inplace=True)
# Discard every row that still has a missing value in any column.
track_df.dropna(axis=0, how="any", inplace=True)

# Handle escaped characters

In [3]:
# Replace backslash-escaped apostrophes (\') with plain apostrophes.
# The pattern is a raw string matching a literal backslash followed by an
# apostrophe; the plain literal "\'" is just "'" in Python, so using it as the
# pattern replaces nothing.
# The result is assigned back to the column because replace(inplace=True) on a
# column selection is not guaranteed to update the frame.
track_df["lyrics"] = track_df["lyrics"].replace(r"\\'", "'", regex=True)
track_df["lyrics"]

0        Music make you lose control, music make you lo...
1        Baby, can't you see I'm calling?\nA guy like y...
2        Don't be so quick to walk away\nDance with me\...
3        Yo, man\nYo\nOpen up, man\nYo, what you want, ...
4        Peace up, A-Town down\nYeah, yeah! Okay, okay!...
                               ...                        
12823    7/1\nAmerican Authors - What We Live For - 12/...
12824    Segundo toque\nE eu não vou, Amor\nFaço gala\n...
12825    You know my soul\nI pay no mind\nNo mind at al...
12826    Oh, see the storm is threatening\nMy very life...
12827    Drifting out\nWith the breeze\nHoping to\nJust...
Name: lyrics, Length: 12828, dtype: object

# Remove tracks whose lyrics are duplicated

In [4]:
# keep=False drops every row whose lyrics occur more than once, i.e. no copy
# of a duplicated lyric is kept.
track_df.drop_duplicates(subset=["lyrics"], keep=False, inplace=True)
track_df["lyrics"]

0        Music make you lose control, music make you lo...
1        Baby, can't you see I'm calling?\nA guy like y...
2        Don't be so quick to walk away\nDance with me\...
3        Yo, man\nYo\nOpen up, man\nYo, what you want, ...
4        Peace up, A-Town down\nYeah, yeah! Okay, okay!...
                               ...                        
12823    7/1\nAmerican Authors - What We Live For - 12/...
12824    Segundo toque\nE eu não vou, Amor\nFaço gala\n...
12825    You know my soul\nI pay no mind\nNo mind at al...
12826    Oh, see the storm is threatening\nMy very life...
12827    Drifting out\nWith the breeze\nHoping to\nJust...
Name: lyrics, Length: 12828, dtype: object

# Tag explicit tracks, artists and albums

In [5]:
# Artists and albums are explicit when their name contains a swear word.
artist_df['explicit'] = artist_df['name'].apply(has_swear_word)
album_df['explicit'] = album_df['name'].apply(has_swear_word)

# A track is explicit when either its name or its lyrics contain a swear word.
name_is_explicit = track_df['name'].apply(has_swear_word)
lyrics_are_explicit = track_df['lyrics'].apply(has_swear_word)
track_df['explicit'] = name_is_explicit | lyrics_are_explicit

# Remove artist features from track name

In [6]:
# Strip " (feat. ...)" credits from the track names.
# [^)]* stops at the first closing parenthesis, so a later group such as
# " (Remix)" is kept; a greedy .* would remove everything up to the last ")".
# The result is assigned back to the column because replace(inplace=True) on a
# column selection is not guaranteed to update the frame.
track_df["name"] = track_df["name"].replace(r" \(feat\.[^)]*\)", "", regex=True)
track_df["name"]

0          Lose Control
1                 Toxic
2        Rock Your Body
3          It Wasn't Me
4                 Yeah!
              ...      
12823    We Were Summer
12824           Sirumba
12825              Pish
12826     Gimme Shelter
12827    Across the Sea
Name: name, Length: 12828, dtype: object

# Save changes

In [12]:
# NOTE(review): none of the names imported here are referenced in the visible cells.
from sqlalchemy import select, delete, Table, MetaData

# Write the cleaned frames back to the database.
# if_exists='replace' drops and recreates each table, so the DELETEs below are
# kept only to mirror the original flow.
con.execute("DELETE from track")
con.execute("DELETE from artist")
con.execute("DELETE from album")

track_df.to_sql("track", con=con, if_exists='replace', index=False)
artist_df.to_sql("artist", con=con, if_exists='replace', index=False)
album_df.to_sql("album", con=con, if_exists='replace', index=False)

con.execute("PRAGMA foreign_keys = OFF;")

# Orphan clean-up. The order matters: link rows are removed before the rows
# they keep alive are pruned, and album_artist_through is cleaned only AFTER
# albums have been pruned, so links to albums deleted in this run do not
# survive and do not keep otherwise unreferenced artists in the table.

# Remove TrackArtist entries for non-existing tracks
con.execute(
    """
    DELETE FROM track_artist_through
    WHERE track_id NOT IN (
        SELECT id
        FROM track
    );
    """
)

# Remove AlbumTrack entries for non-existing tracks
con.execute(
    """
    DELETE FROM track_album_through
    WHERE track_id NOT IN (
        SELECT id
        FROM track
    );
    """
)

# Remove Album entries for non-existing album tracks
con.execute(
    """
    DELETE FROM album
    WHERE id NOT IN (
        SELECT DISTINCT album_id
        FROM track_album_through
    );
    """
)

# Remove AlbumArtist entries for non-existing albums
con.execute(
    """
    DELETE FROM album_artist_through
    WHERE album_id NOT IN (
        SELECT id
        FROM album
    );
    """
)

# Remove Artist entries for non-existing (track or album) artists
con.execute(
    """
    DELETE FROM artist
    WHERE id NOT IN (
        SELECT DISTINCT artist_id
        FROM track_artist_through 
        UNION
        SELECT DISTINCT artist_id
        FROM album_artist_through
    );
    """
)

# sqlite3 does not commit pending changes on close(); without this commit the
# DELETE statements issued after the last to_sql() call would be discarded.
con.commit()

# Close connection
con.close()