In [1]:
import pandas as pd

# Load the NLP-processed lyrics dataset directly from the project's GitHub repository.
# Alternative file (kept for reference, currently unused):
#   https://raw.githubusercontent.com/Alex-Mak-MCW/SpotifyDataScienceProject/main/Data/text_processed_data.csv
DATA_URL = "https://raw.githubusercontent.com/Alex-Mak-MCW/SpotifyDataScienceProject/main/Data/NLP_processed_text.csv"

df = pd.read_csv(DATA_URL, encoding='utf-8')

In [2]:
# Show the loaded frame as this cell's rich output (a quick sanity check of the load).
df

Unnamed: 0,track_name,artist,album,release_date,duration,popularity,explicit,lyrics,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,mood
0,Blinding Lights,The Weeknd,After Hours,2020-03-20,200040,90,0,Yeah\nI've been tryna call\nI've been on my ow...,0.514,0.730,...,-5.934,1.0,0.0598,0.00146,0.000095,0.0897,0.334,171.005,4.0,0
1,Shape of You,Ed Sheeran,÷ (Deluxe),2017-03-03,233712,86,0,A club isn't the best place to find a lover\nS...,0.825,0.652,...,-3.183,0.0,0.0802,0.58100,0.000000,0.0931,0.931,95.977,4.0,1
2,Someone You Loved,Lewis Capaldi,Divinely Uninspired To A Hellish Extent,2019-05-17,182160,89,0,"I'm going under, and this time, I fear there's...",0.501,0.405,...,-5.679,1.0,0.0319,0.75100,0.000000,0.1050,0.446,109.891,4.0,0
3,Sunflower - Spider-Man: Into the Spider-Verse,Post Malone,Hollywood's Bleeding,2019-09-06,157560,85,0,"Ayy, ayy, ayy, ayy Ooh\nOoh, ooh, ooh, ooh Ooh...",0.755,0.522,...,-4.368,1.0,0.0575,0.53300,0.000000,0.0685,0.925,89.960,4.0,1
4,Starboy,The Weeknd,Starboy,2016-11-25,230453,91,1,"Ayy\nI'm tryna put you in the worst mood, ah\n...",0.679,0.587,...,-7.015,1.0,0.2760,0.14100,0.000006,0.1370,0.486,186.003,4.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020,When Love Takes Over (feat. Kelly Rowland),David Guetta,One More Love,2010-11-22,191000,74,0,It's complicated... it always is\nThat's just ...,0.675,0.862,...,-4.614,1.0,0.0253,0.01650,0.000427,0.1690,0.498,129.967,4.0,0
2021,Happiest Year,Jaymes Young,Feel Something (Deluxe),2022-01-28,228442,74,0,I'm really on the ropes this time\nI've been f...,0.502,0.168,...,-14.061,1.0,0.0486,0.88500,0.000000,0.1110,0.355,112.881,3.0,0
2022,Smile (with The Weeknd),Juice WRLD,Legends Never Die,2020-07-10,196180,69,1,Juice WRLD - Smile with The Weeknd\nJoji & Dip...,0.694,0.685,...,-6.535,0.0,0.1330,0.22400,0.000000,0.1240,0.306,158.831,4.0,0
2023,Saved,Khalid,American Teen,2017-04-27,206533,64,0,"2, 3, 4\nThe hard part always seems to last fo...",0.739,0.448,...,-10.280,0.0,0.1380,0.18900,0.000000,0.1180,0.553,81.044,4.0,1


## Feature Engineering 1: CountVectorizer + TF-IDF

In [3]:
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Work on a copy so that fitting the text pipeline never touches `df` itself.
testing = df.copy()


def select_lyrics(frame):
    """Return the 'lyrics' column of ``frame`` as input for the text vectorizer.

    This is a named function rather than a lambda on purpose: scikit-learn
    documents that a FunctionTransformer built from a lambda cannot be
    pickled, which would prevent saving the fitted pipeline with
    pickle/joblib.
    """
    return frame['lyrics']


# Custom transformer to select text column
get_text_data = FunctionTransformer(select_lyrics, validate=False)

# Pipeline for text processing: select lyrics -> token counts -> TF-IDF weights.
# The single-branch FeatureUnion and the step names ('selector', 'features',
# 'tfidf', 'count') are kept as-is because later cells look the steps up by
# these names and by position in `transformer_list`.
text_pipeline = Pipeline([
    ('selector', get_text_data),
    ('features', FeatureUnion([
        ('tfidf', Pipeline([
            ('count', CountVectorizer()),
            ('tfidf', TfidfTransformer())
        ]))
    ]))
])

# Fit the vocabulary / IDF weights on the lyrics and transform them in one pass.
text_features = text_pipeline.fit_transform(testing)


In [4]:
# Recover the vocabulary terms used as feature names for the TF-IDF matrix.
# Note: despite its name, `tfidf_vectorizer` is the CountVectorizer step
# ('count') of the nested pipeline -- that is the step exposing the terms here.
features_step = text_pipeline.named_steps['features']
count_tfidf_branch = features_step.transformer_list[0][1]

tfidf_vectorizer = count_tfidf_branch.named_steps['count']
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()

tfidf_feature_names

array(['000', '001', '004', ..., '희미해진', '힘들', 'ﬁne'], dtype=object)

In [5]:
# Fetch the CountVectorizer step from the nested text pipeline and print its terms.
nested_text_pipeline = text_pipeline.named_steps['features'].transformer_list[0][1]
count_vectorizer = nested_text_pipeline.named_steps['count']

vocabulary_terms = count_vectorizer.get_feature_names_out()
print(vocabulary_terms)

['000' '001' '004' ... '희미해진' '힘들' 'ﬁne']


In [6]:
# Give `df` a fresh positional index (0..n-1) so its rows line up with the
# rows of the feature frame built below, which gets a default index.
df.reset_index(drop=True, inplace=True)

# One column per vocabulary term. `toarray()` converts the transformed
# features to a dense array, so this frame has n_rows x n_terms cells.
text_features_df = pd.DataFrame(
    text_features.toarray(),
    columns=tfidf_feature_names,
)

# Left join on the index. When a term has the same name as an existing `df`
# column, the original column keeps its name and the term column gets the
# '_text' suffix.
merged_df = df.join(text_features_df, how='left', lsuffix='', rsuffix='_text')

print(merged_df)

                                         track_name           artist  \
0                                   Blinding Lights       The Weeknd   
1                                      Shape of You       Ed Sheeran   
2                                 Someone You Loved    Lewis Capaldi   
3     Sunflower - Spider-Man: Into the Spider-Verse      Post Malone   
4                                           Starboy       The Weeknd   
...                                             ...              ...   
2020     When Love Takes Over (feat. Kelly Rowland)     David Guetta   
2021                                  Happiest Year     Jaymes Young   
2022                        Smile (with The Weeknd)       Juice WRLD   
2023                                          Saved           Khalid   
2024                                         A-Punk  Vampire Weekend   

                                        album release_date  duration  \
0                                 After Hours   2020-03-20    2