# Spotify & YouTube Data Cleaning



In [197]:
# standard libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

%load_ext lab_black

The lab_black extension is already loaded. To reload it, use:
  %reload_ext lab_black


In [198]:
# List the files in the working directory to confirm the input CSV is present.
!ls

Spotify_Youtuben.csv
Spotify_youtube_preprocessing.ipynb
cleaned_dataset.csv


In [199]:
# Read the raw Spotify/YouTube CSV into `data` and preview its first rows.
csv_path = "Spotify_Youtuben.csv"
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,Artist,Url_spotify,Track,Album,Album_type,Uri,Danceability,Energy,Key,Loudness,...,Url_youtube,Title,Channel,Views,Likes,Comments,Description,Licensed,official_video,Stream
0,Gorillaz,https://open.spotify.com/artist/3AA28KZvwAUcZu...,Feel Good Inc.,Demon Days,album,spotify:track:0d28khcov6AiegSCpG5TuT,0.818,0.705,6.0,-6.679,...,https://www.youtube.com/watch?v=HyHNuVaZJ-k,Gorillaz - Feel Good Inc. (Official Video),Gorillaz,693555221.0,6220896.0,169907.0,Official HD Video for Gorillaz' fantastic trac...,True,True,1040235000.0
1,Gorillaz,https://open.spotify.com/artist/3AA28KZvwAUcZu...,Rhinestone Eyes,Plastic Beach,album,spotify:track:1foMv2HQwfQ2vntFf9HFeG,0.676,0.703,8.0,-5.815,...,https://www.youtube.com/watch?v=yYDmaexVHic,Gorillaz - Rhinestone Eyes [Storyboard Film] (...,Gorillaz,72011645.0,1079128.0,31003.0,The official video for Gorillaz - Rhinestone E...,True,True,310083700.0
2,Gorillaz,https://open.spotify.com/artist/3AA28KZvwAUcZu...,New Gold (feat. Tame Impala and Bootie Brown),New Gold (feat. Tame Impala and Bootie Brown),single,spotify:track:64dLd6rVqDLtkXFYrEUHIU,0.695,0.923,1.0,-3.93,...,https://www.youtube.com/watch?v=qJa-VFwPpYA,Gorillaz - New Gold ft. Tame Impala & Bootie B...,Gorillaz,8435055.0,282142.0,7399.0,Gorillaz - New Gold ft. Tame Impala & Bootie B...,True,True,63063470.0
3,Gorillaz,https://open.spotify.com/artist/3AA28KZvwAUcZu...,On Melancholy Hill,Plastic Beach,album,spotify:track:0q6LuUqGLUiCPP1cbdwFs3,0.689,0.739,2.0,-5.81,...,https://www.youtube.com/watch?v=04mfKJWDSzI,Gorillaz - On Melancholy Hill (Official Video),Gorillaz,211754952.0,1788577.0,55229.0,Follow Gorillaz online:\nhttp://gorillaz.com \...,True,True,434663600.0
4,Gorillaz,https://open.spotify.com/artist/3AA28KZvwAUcZu...,Clint Eastwood,Gorillaz,album,spotify:track:7yMiX7n9SBvadzox8T5jzT,0.663,0.694,10.0,-8.627,...,https://www.youtube.com/watch?v=1V_xRb0x9aw,Gorillaz - Clint Eastwood (Official Video),Gorillaz,618480958.0,6197318.0,155930.0,The official music video for Gorillaz - Clint ...,True,True,617259700.0


In [200]:
# Work on an independent (deep) copy so the raw `data` frame stays untouched.
df = data.copy(deep=True)

In [201]:
# Summarise the frame: number of rows, column names, non-null counts and dtypes.

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20723 entries, 0 to 20722
Data columns (total 27 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Artist            20723 non-null  object 
 1   Url_spotify       20723 non-null  object 
 2   Track             20723 non-null  object 
 3   Album             20723 non-null  object 
 4   Album_type        20723 non-null  object 
 5   Uri               20723 non-null  object 
 6   Danceability      20721 non-null  float64
 7   Energy            20721 non-null  float64
 8   Key               20721 non-null  float64
 9   Loudness          20721 non-null  float64
 10  Speechiness       20721 non-null  float64
 11  Acousticness      20721 non-null  float64
 12  Instrumentalness  20721 non-null  float64
 13  Liveness          20721 non-null  float64
 14  Valence           20721 non-null  float64
 15  Tempo             20721 non-null  float64
 16  Duration_ms       20721 non-null  float6

Task 1: Remove columns that are not needed in our analysis.
Remove Url_spotify, Uri, Key, Url_youtube, Description

In [202]:
# Columns that are not needed for the analysis.
unused_columns = ["Url_spotify", "Uri", "Key", "Url_youtube", "Description"]
df = df.drop(columns=unused_columns)

In [203]:
# Display the column labels currently present in `df`.
df.columns

Index(['Artist', 'Track', 'Album', 'Album_type', 'Danceability', 'Energy',
       'Loudness', 'Speechiness', 'Acousticness', 'Instrumentalness',
       'Liveness', 'Valence', 'Tempo', 'Duration_ms', 'Title', 'Channel',
       'Views', 'Likes', 'Comments', 'Licensed', 'official_video', 'Stream'],
      dtype='object')

Task 2: Check for the null values

In [204]:
# Count the missing values in each column.
df.isna().sum()

Artist                0
Track                 0
Album                 0
Album_type            0
Danceability          2
Energy                2
Loudness              2
Speechiness           2
Acousticness          2
Instrumentalness      2
Liveness              2
Valence               2
Tempo                 2
Duration_ms           2
Title               470
Channel             470
Views               470
Likes               541
Comments            569
Licensed            470
official_video      470
Stream              577
dtype: int64

Task 3: Handle the null values: replace missing numeric values with 0 and all other missing values with "NA"

In [205]:
# Split the column labels by dtype so each group gets its own placeholder.
obj = df.select_dtypes(include="object").columns.tolist()
int_col = df.select_dtypes(include=["int", "float64"]).columns.tolist()

# Object columns receive the literal string "NA"; numeric columns receive 0.
# The two groups are disjoint, so the order of the fills does not matter.
df[obj] = df[obj].fillna("NA")
df[int_col] = df[int_col].fillna(0)


Task 4: Check for duplicates and remove them, keeping the first occurrence

In [206]:
# Count rows that exactly repeat an earlier row (all columns equal).
df.duplicated().sum()

5

In [207]:
# Remove repeated rows, retaining the first occurrence of each.
df = df.drop_duplicates(keep="first")

In [208]:
# Re-count duplicated rows to verify that none remain.
df.duplicated().sum()

0

Task 5: Convert the duration from milliseconds to minutes for better readability

In [209]:
# Spot-check the conversion on the row labelled 1: milliseconds -> minutes, 2 d.p.
(df.loc[1, "Duration_ms"] / 60000).round(2)

3.34

In [210]:
# Convert the track duration from milliseconds to minutes.
# Plain Series arithmetic is vectorised, so no per-element `apply` is needed.
# NOTE: the column still carries the name "Duration_ms" after this cell, and
# re-running the cell would divide the values a second time.
MS_PER_MINUTE = 60_000
df["Duration_ms"] = df["Duration_ms"] / MS_PER_MINUTE

Task 6: Rename the modified column to Duration_min

In [211]:
# The values are in minutes now, so give the column a matching name.
df = df.rename({"Duration_ms": "Duration_min"}, axis="columns")

In [212]:
# Longest track duration, in minutes.
df["Duration_min"].max()

77.9343

In [213]:
# Shortest track duration, in minutes.
df["Duration_min"].min()

0.0

Task 7: Remove irrelevant rows whose 'Track' name starts with '?'

In [214]:
# Flag tracks whose name begins with a literal "?", collect their index
# labels, and drop those rows.
starts_with_qmark = df["Track"].str.startswith("?")
idx = df.index[starts_with_qmark].to_list()
df = df.drop(index=idx)

Task 8: Calculate the Energy-to-Liveness ratio for each track and store it in the column 'EnergyLiveness'

In [215]:
# Look at the Energy and Liveness values of the row labelled 0.
df.loc[0, ["Energy", "Liveness"]]

Energy      0.705
Liveness    0.613
Name: 0, dtype: object

In [216]:
# Energy-to-Liveness ratio per track.
# A Liveness of 0 (including the 0 written by the earlier null fill) would make
# the division silently produce inf, which distorts later statistics and is
# written to the CSV as-is. Zero denominators are therefore masked to NaN so
# the ratio is left missing for those tracks instead.
liveness_nonzero = df["Liveness"].replace(0, np.nan)
df["EnergyLiveness"] = df["Energy"].div(liveness_nonzero)

Task 9: Change the datatype of 'Views' to float for further use

In [217]:
# Cast the Views column to float for later use.
df = df.astype({"Views": "float"})

Task 10: Compare the Views and Stream columns to infer on which platform,
YouTube or Spotify, each track was played more.
Create a column named most_playedon with two possible values, Spotify and youtube.
If a track was played more on YouTube, most_playedon holds youtube for that track;
otherwise it holds Spotify.

In [218]:
# Label each track with the platform on which it has more plays.
# Ties (Views == Stream) are labelled "youtube".
youtube_leads = df["Views"].ge(df["Stream"])
df["most_playedon"] = np.where(youtube_leads, "youtube", "Spotify")

In [219]:
# Number of tracks per platform label.
df["most_playedon"].value_counts()

most_playedon
Spotify    15694
youtube     4900
Name: count, dtype: int64

In [221]:
# Write the cleaned frame to a UTF-8 CSV, omitting the index column.
df.to_csv("cleaned_dataset.csv", encoding="utf-8", index=False)