In [1]:
import pandas as pd
import random
from datetime import datetime, timedelta

In [2]:
# Read the source catalogue from Docs/Lastfm.csv; the file uses ';' as its field separator.
df = pd.read_csv("Docs/Lastfm.csv", sep=';')

In [3]:
# Discard rows missing any field used downstream, then keep a single row per
# track_id (drop_duplicates keeps the first occurrence by default).
df = (
    df
    .dropna(subset=["track_id", "song_name", "artist", "genre", "year"])
    .drop_duplicates(subset=["track_id"])
)

In [4]:
# Draw 300 tracks with a fixed seed and renumber them 0..299.
# Note: after reset_index the index no longer matches the row labels in `df`.
df_300 = (
    df
    .sample(n=300, random_state=42)
    .reset_index(drop=True)
)

In [5]:
# Seed Python's global RNG so the random draws made with the `random` module
# (times_played here, and any later draws in a top-to-bottom run) are
# reproducible, matching the fixed random_state used for the pandas samples.
random.seed(42)

# The 300 sampled tracks are each marked as played exactly once.
df_300 = df_300.assign(times_played=1)

# Pick 200 *other* tracks to carry a play count between 2 and 5.
# df_300 was re-indexed to 0..299 when it was sampled, so its index does not
# identify the sampled rows inside `df`; dropping by that index would remove
# the rows labelled 0..299 instead (or raise KeyError if a label is absent).
# Exclude the already-sampled tracks by track_id, which is unique in `df`
# after the de-duplication step.
already_sampled = df["track_id"].isin(df_300["track_id"])
df_remaining = df[~already_sampled].reset_index(drop=True)
df_repeat = df_remaining.sample(n=200, random_state=1).copy()
df_repeat["times_played"] = [random.randint(2, 5) for _ in range(len(df_repeat))]

# Stack both groups into a single listening table with a fresh 0..N-1 index.
df_listened = pd.concat([df_300, df_repeat]).reset_index(drop=True)

# Guard the invariant the exclusion above is meant to guarantee.
assert df_listened["track_id"].is_unique, "a track was selected in both groups"


In [6]:
def random_date(start, end):
    """Return `start` shifted forward by a random whole number of days.

    The offset is drawn uniformly from 0 to the number of whole days between
    `start` and `end`, both ends included, using the global `random` module.
    Only whole days are added, so the time-of-day of `start` is preserved.
    """
    span_in_days = (end - start).days
    day_offset = random.randint(0, span_in_days)
    return start + timedelta(days=day_offset)

In [7]:
# Inclusive window from which each row's listening date is drawn.
start_date = datetime(year=2025, month=10, day=21)
end_date = datetime(year=2025, month=10, day=28)

# One independently drawn date per row, stored as an ISO "YYYY-MM-DD" string.
df_listened["timestamp"] = [
    drawn.date().isoformat()
    for drawn in (random_date(start_date, end_date) for _ in df_listened.index)
]

In [8]:
# Five synthetic user ids: U0001 .. U0005.
user_ids = [f"U{number:04d}" for number in range(1, 6)]

# Assign each row to one of those users, drawn independently per row.
df_listened["user_id"] = [random.choice(user_ids) for _ in df_listened.index]

In [9]:
# Sanity check: print the row count, column dtypes and non-null counts of the assembled table.
df_listened.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   track_id      500 non-null    object 
 1   song_name     500 non-null    object 
 2   artist        500 non-null    object 
 3   genre         500 non-null    object 
 4   year          500 non-null    float64
 5   times_played  500 non-null    int64  
 6   timestamp     500 non-null    object 
 7   user_id       500 non-null    object 
dtypes: float64(1), int64(1), object(6)
memory usage: 31.4+ KB


In [10]:
# Preview 10 random rows; no random_state is passed, so the rows shown differ between runs.
df_listened.sample(10)

Unnamed: 0,track_id,song_name,artist,genre,year,times_played,timestamp,user_id
111,TRSIOWL128E0793EC0,Perversion 99,Rob Zombie,"metal, instrumental, heavy_metal, industrial, ...",1998.0,1,2025-10-21,U0001
272,TRRFCUL128F9304059,Freddie Freeloader,Miles Davis,jazz,1959.0,1,2025-10-26,U0003
122,TRWKKMZ128F1492D5B,Slide It In,Whitesnake,"hard_rock, 80s, heavy_metal",2013.0,1,2025-10-25,U0002
194,TRPYJPV128F92F1202,Tear Strips Off,Tipper,"electronic, ambient, idm",2003.0,1,2025-10-27,U0001
346,TRCCVSJ128F934B8DD,Love Me Like You,The Magic Numbers,"indie, indie_rock, indie_pop",2008.0,4,2025-10-25,U0003
195,TRILNPM128F92EBA22,Eye of the Storm,Bullet for My Valentine,"metal, heavy_metal, hardcore, thrash_metal, me...",2008.0,1,2025-10-25,U0004
492,TRXASKT128F92F2D0F,Five O'Clock World,The Vogues,"pop, soundtrack, 60s, oldies",1995.0,3,2025-10-27,U0003
58,TRVJHJC128F932DBED,Green Astronauts,A Boy Called Joni,trance,2016.0,1,2025-10-26,U0004
475,TRXIVCO128F931D863,Young Love,Chris Brown,"soul, rnb",2006.0,5,2025-10-26,U0004
43,TRQECWH128F42267FC,Fragen,Pole,electronic,1998.0,1,2025-10-22,U0002


In [11]:
# Rewrite the genre list from comma-separated to pipe-separated
# ("a, b" -> "a | b"). regex=False states explicitly that "," is a literal,
# rather than relying on the pandas default, which changed in pandas 2.0.
df_listened["genre"] = df_listened["genre"].str.replace(",", " |", regex=False)

# Preview 5 random rows to confirm the new separator.
df_listened.sample(5)

Unnamed: 0,track_id,song_name,artist,genre,year,times_played,timestamp,user_id
398,TRVWDBV12903CEACE0,City Noise,Scarling.,rock | alternative | female_vocalists | altern...,2006.0,5,2025-10-23,U0002
154,TRFXCXV128F92D17A3,Total Destruction,Bathory,black_metal | thrash_metal,1993.0,1,2025-10-28,U0001
436,TRLXHIH12903CD4E1F,Coward,Hans Zimmer,instrumental | soundtrack,2014.0,5,2025-10-28,U0002
214,TRZCNRB12903CF5D1A,Legion (Slaughterlord Cover),At the Gates,death_metal | melodic_death_metal,1995.0,1,2025-10-22,U0005
299,TRPDBAB128F4263859,Diggin' On James Brown,Tower of Power,soul | funk,1995.0,1,2025-10-21,U0001


In [12]:
# Write the final table to Docs/user_listened_data.csv; index=False leaves the row index out of the file.
df_listened.to_csv("Docs/user_listened_data.csv", index=False)