In [1]:
import numpy as np
import pandas as pd

import os
import sys

In [2]:
def reduce_mem_usage(df, verbose=True):
    """ Downcast numeric columns of a dataframe to the smallest dtype that
        still holds their observed min/max, to reduce memory usage.
        Credit to: https://www.kaggle.com/gemartin/load-data-reduce-memory-usage

        Only columns backed by a NumPy signed-integer or floating-point dtype
        are converted. Every other column (object/string, bool, unsigned
        integer, datetime, categorical, pandas extension dtypes) is left
        untouched: comparing those against float limits either raises or
        turns them into floats.

        Notes
        -----
        * `df` is modified in place and the same object is returned.
        * float16 keeps only about 3 significant decimal digits, so float
          columns that fit its range are stored with reduced precision.

        Parameters
        ----------
        df : Pandas DataFrame
        verbose: (True) by default, prints out before and after memory usage
        Returns
        -------
        df : Reduced Memory Pandas DataFrame
    """

    if verbose:
        start_mem = df.memory_usage().sum() / 1024**2
        print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Candidate dtypes, narrowest first; the first one that fits wins.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        # Extension dtypes are not np.dtype instances; among NumPy dtypes only
        # signed ints ('i') and floats ('f') are safe to range-check below.
        if not isinstance(col_type, np.dtype) or col_type.kind not in ('i', 'f'):
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'i':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the limit still fits.
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            # No match (e.g. empty column, min/max is NaN): leave as is.
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, infinite, or all-NaN column.
                df[col] = df[col].astype(np.float64)

    if verbose:
        end_mem = df.memory_usage().sum() / 1024**2
        print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
        print('Decreased by {:.1f}%'.format(
            100 * (start_mem - end_mem) / start_mem))

    return df


def load_dataset(fpaths=None):
    """Loads dataset, and combines them into a single dataframe.

    Parameters
    ----------
    fpaths : sequence of str, optional
        CSV files to read (latin1-encoded) and stack, in order. Defaults to
        the three yearly Spotify files under ./data/.

    Returns
    -------
    df : Pandas DataFrame
        All files concatenated with a fresh 0..n-1 index, after being passed
        through reduce_mem_usage.

    Raises
    ------
    Exception
        Whatever reading, concatenating or downcasting raised; a message is
        printed first and the original exception is re-raised unchanged.
    """
    if fpaths is None:
        fpaths = ['./data/spotify_2018.csv',
                  './data/spotify_2019.csv',
                  './data/spotify_2020.csv']

    try:
        frames = [pd.read_csv(fpath, encoding='latin1') for fpath in fpaths]
        df = pd.concat(frames, ignore_index=True)
        df = reduce_mem_usage(df)  # Reduces Memory Usage

    except Exception as e:
        # f-string instead of '{e}'.format(e): a positional argument cannot
        # fill a named field, so the old call raised KeyError and hid the
        # real error.
        print(f'Error Occurred while reading the Spotify dataset: {e}')
        # Bare raise keeps the original traceback.
        raise

    # Check total sum of rows match
    assert df.shape[0] == sum(frame.shape[0] for frame in frames)

    file_names = [fpath.split('/')[-1] for fpath in fpaths]
    indent = ' ' * 8
    per_file = [
        f'{indent}DF{position} {name}: {frame.shape}'
        for position, (name, frame) in enumerate(zip(file_names, frames), start=1)
    ]
    # Leading '' and trailing indent reproduce the blank first line and the
    # indented last line of the original triple-quoted report.
    report = '\n'.join([
        '',
        f'{indent}-------------------- SHAPE ---------------------',
        *per_file,
        f'{indent}MERGED DF df: {df.shape}',
        f'{indent}------------------------------------------------',
        indent,
    ])
    print(report)

    return df


def wrangle(df, speechiness_threshold=0.66):
    """Return a cleaned copy of the tracks dataframe; the input is not modified.

    Steps, in order:
      1. Drop fully duplicated rows, keeping the first occurrence.
      2. Blank out (set to NaN) every 'speechiness' value that is not
         strictly below `speechiness_threshold`. Highly "speechy" tracks are
         generally not music but other types of recordings.
      3. Drop every row that has a missing value in any column. This removes
         the rows blanked in step 2 as well as rows that already had NaNs.

    The original index labels of the surviving rows are kept (no reset).

    Parameters
    ----------
    df : Pandas DataFrame
        Must contain a 'speechiness' column.
    speechiness_threshold : float, default 0.66
        Rows with speechiness at or above this value are removed.

    Returns
    -------
    Pandas DataFrame
        A new, cleaned dataframe.
    """
    # Drop Duplicates (returns a new frame, so the caller's df is untouched)
    deduped = df.drop_duplicates(keep='first')

    # where() keeps values that satisfy the condition and writes NaN elsewhere.
    speechiness = deduped['speechiness'].where(
        deduped['speechiness'] < speechiness_threshold)

    # assign() builds a new frame, avoiding item assignment on a derived frame
    # (the pattern behind SettingWithCopyWarning), then drop missing values.
    return deduped.assign(speechiness=speechiness).dropna()


# Read and merge the yearly CSVs (load_dataset also prints the memory and shape
# report), then preview the first rows of the merged frame.
df = load_dataset()
df.head()

Memory usage of dataframe is 56.38 MB
Memory usage after optimization is: 22.71 MB
Decreased by 59.7%

        -------------------- SHAPE ---------------------
        DF1 spotify_2018.csv: (158885, 18)
        DF2 spotify_2019.csv: (155645, 18)
        DF3 spotify_2020.csv: (96035, 18)
        MERGED DF df: (410565, 18)
        ------------------------------------------------
        


Unnamed: 0,artist_name,track_name,track_id,popularity,genre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,Kina Grannis,Can't Help Falling In Love,6lfxq3CG4xtTiEg7opyCyx,72,acoustic,0.266113,0.059601,0,-18.515625,1,0.036285,0.904785,7.1e-05,0.131958,0.142944,181.75,201933,3
1,Ben Rector,Love Like This,06JmNnH3iXKENNRKifqu0v,64,acoustic,0.637207,0.129028,2,-11.890625,1,0.040497,0.902832,0.0,0.106995,0.36792,139.75,214240,4
2,Jason Mraz,Have It All,7BXW1QCg56yzEBV8pW8pah,66,acoustic,0.643066,0.638184,1,-4.945312,0,0.04071,0.203979,0.0,0.070679,0.600098,82.0,226107,4
3,Ben Rector,Old Friends,4MZQ3lHA1TYO6yyedtmBYg,61,acoustic,0.468994,0.403076,8,-10.046875,1,0.048309,0.131958,4.7e-05,0.116028,0.208008,147.375,224744,4
4,Ben Rector,I Will Always Be Yours,4m1lB7qJ78VPYsQy7RoBcU,60,acoustic,0.445068,0.770996,0,-4.605469,1,0.05011,0.133057,0.0,0.272949,0.447998,147.875,226827,4


In [3]:
# Wrangle Data
# wrangle() returns a cleaned frame and leaves `df` as loaded, so this cell
# can be re-run safely.
wrangled_df = wrangle(df)

# Report how many rows/columns remain after cleaning, then preview the result.
print(f'After Wrangling, Shape: {wrangled_df.shape}')
wrangled_df.head()

After Wrangling, Shape: (405526, 18)


Unnamed: 0,artist_name,track_name,track_id,popularity,genre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,Kina Grannis,Can't Help Falling In Love,6lfxq3CG4xtTiEg7opyCyx,72,acoustic,0.266113,0.059601,0,-18.515625,1,0.036285,0.904785,7.1e-05,0.131958,0.142944,181.75,201933,3
1,Ben Rector,Love Like This,06JmNnH3iXKENNRKifqu0v,64,acoustic,0.637207,0.129028,2,-11.890625,1,0.040497,0.902832,0.0,0.106995,0.36792,139.75,214240,4
2,Jason Mraz,Have It All,7BXW1QCg56yzEBV8pW8pah,66,acoustic,0.643066,0.638184,1,-4.945312,0,0.04071,0.203979,0.0,0.070679,0.600098,82.0,226107,4
3,Ben Rector,Old Friends,4MZQ3lHA1TYO6yyedtmBYg,61,acoustic,0.468994,0.403076,8,-10.046875,1,0.048309,0.131958,4.7e-05,0.116028,0.208008,147.375,224744,4
4,Ben Rector,I Will Always Be Yours,4m1lB7qJ78VPYsQy7RoBcU,60,acoustic,0.445068,0.770996,0,-4.605469,1,0.05011,0.133057,0.0,0.272949,0.447998,147.875,226827,4
