In [1]:
import pandas as pd

# Build d3 by inner-joining the two CSV datasets on (Artist, Track).
# Both files are read via relative paths, so they must be present in the
# current working directory when this cell runs.

# Step 1: Load the Datasets
d2 = pd.read_csv("1990sClassicHits.csv")
d1 = pd.read_csv("recognition_by_generation.csv")

# Step 2: Clean the Data
# Forward fill missing cells from the row above, then drop exact duplicate rows.
# NOTE(review): ffill() applies to every column, including the ones later used
# as merge keys, so a missing cell inherits whatever the previous row held.
# Confirm that row order in these files makes that a valid imputation.
d2 = d2.ffill().drop_duplicates()
d1 = d1.ffill().drop_duplicates()

# Step 3: Prepare for Merging
# Standardize column names so both frames share the 'Artist'/'Track' keys.
# Reassign instead of inplace=True: same result for d1, consistent with the
# other steps in this cell.
d1 = d1.rename(columns={'artist': 'Artist', 'song': 'Track'})

# Select relevant columns from d2, including Duration.
# Raises KeyError if any of these columns is missing from the file.
d2 = d2[['Track', 'Artist', 'Duration', 'Danceability', 'Energy', 'Tempo', 'Popularity']]

# Step 4: Merge the Datasets
# Merge on 'Artist' and 'Track' and keep only matching songs (inner join).
# Matching is exact string equality on both key columns.
d3 = pd.merge(d1, d2, on=['Artist', 'Track'], how='inner')

# Step 5: Final Clean-up
# Column selection above can make previously distinct d2 rows identical, which
# would surface as duplicated rows after the merge; drop them here.
d3 = d3.drop_duplicates()

# Show the result of the merged DataFrame (d3), including the Duration column
print(d3.head())
# DataFrame.info() prints its report itself and returns None, so it must not be
# wrapped in print() (that would emit a stray "None" line after the summary).
d3.info()

              Artist                    Track  mean_millennial_recognition  \
0                702       Where My Girls At?                     0.553481   
1       Adina Howard            Freak Like Me                     0.207478   
2  Alanis Morissette                   Ironic                     0.887612   
3      Alannah Myles             Black Velvet                     0.621799   
4              Alias  More Than Words Can Say                     0.075031   

   mean_gen_z_recognition Duration  Danceability  Energy    Tempo  Popularity  
0                0.244176     2:46         0.705   0.725   92.186          60  
1                0.139892     4:13         0.655   0.837   91.014          58  
2                0.587672     3:50         0.408   0.582  114.926          75  
3                0.395484     4:47         0.750   0.366   91.138          72  
4                0.107534     3:54         0.412   0.376   69.064          50  
<class 'pandas.core.frame.DataFrame'>
Index: 199 en