In [1]:
import pandas as pd
import random
from datetime import datetime, timedelta

In [7]:
# Load the source catalogue from Docs/Lastfm.csv; the file is semicolon-delimited.
df = pd.read_csv("Docs/Lastfm.csv", sep=';')

In [8]:
# Discard rows missing any required field, then keep a single row per track_id.
df = (
    df
    .dropna(subset=["track_id", "song_name", "artist", "genre", "year"])
    .drop_duplicates(subset=["track_id"])
)

In [9]:
# Draw 300 rows at random (fixed random_state, so the draw is repeatable) and
# renumber them 0..299; the original row labels of `df` are discarded here.
df_300 = df.sample(n=300, random_state=42).reset_index(drop=True)

In [11]:
# The 300 sampled tracks form the single-listen portion: times_played = 1.
df_300["times_played"] = 1

# Pick 200 *different* tracks to act as repeat listens (times_played in 2..5).
# The sampled tracks are excluded by track_id rather than by index: df_300 was
# re-indexed to 0..299, so df.drop(df_300.index) would remove whichever rows of
# `df` carry the labels 0..299 instead of the rows that were actually sampled
# (and would raise KeyError if one of those labels no longer exists in `df`).
df_remaining = df[~df["track_id"].isin(df_300["track_id"])].reset_index(drop=True)
df_repeat = df_remaining.sample(n=200, random_state=1).copy()

# Dedicated seeded generator so the play counts are reproducible on re-run.
play_count_rng = random.Random(1)
df_repeat["times_played"] = [play_count_rng.randint(2, 5) for _ in range(len(df_repeat))]

# Stack both portions into one listening-history frame with a fresh 0..n-1 index.
df_listened = pd.concat([df_300, df_repeat]).reset_index(drop=True)


In [12]:
def random_date(start, end):
    """Return ``start`` moved forward by a random whole number of days.

    The day offset is drawn with ``random.randint`` from 0 up to
    ``(end - start).days``, both bounds included, so either endpoint can be
    returned. The time-of-day component of ``start`` is carried over unchanged.
    """
    span_in_days = (end - start).days
    day_offset = random.randint(0, span_in_days)
    return start + timedelta(days=day_offset)

In [13]:
# Listening window; random_date can return either endpoint.
start_date = datetime(2025, 10, 21)
end_date = datetime(2025, 10, 28)

# random_date draws from the module-level `random` generator, which is
# otherwise unseeded; seed it so the generated dates are the same on every run.
random.seed(42)

# One ISO-8601 date string (YYYY-MM-DD) per row of df_listened.
df_listened["timestamp"] = [
    random_date(start_date, end_date).date().isoformat() for _ in range(len(df_listened))
]

In [14]:
# Five synthetic user ids: U0001 .. U0005 (zero-padded to four digits).
user_ids = [f"U{i:04d}" for i in range(1, 6)]

# Assign every row to one of the users. A dedicated seeded generator keeps the
# assignment reproducible on re-run and independent of other random draws.
user_rng = random.Random(2025)
df_listened["user_id"] = [user_rng.choice(user_ids) for _ in range(len(df_listened))]

In [16]:
# Summarise the assembled frame: columns, non-null counts and dtypes.
df_listened.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   track_id      500 non-null    object 
 1   song_name     500 non-null    object 
 2   artist        500 non-null    object 
 3   genre         500 non-null    object 
 4   year          500 non-null    float64
 5   times_played  500 non-null    int64  
 6   timestamp     500 non-null    object 
 7   user_id       500 non-null    object 
dtypes: float64(1), int64(1), object(6)
memory usage: 31.4+ KB


In [15]:
# Eyeball 10 random rows (no random_state given, so the rows differ per run).
df_listened.sample(10)

Unnamed: 0,track_id,song_name,artist,genre,year,times_played,timestamp,user_id
69,TREXKDE128F423C5B8,Autosuggestion,Joy Division,"80s, british, new_wave, post_punk",1988.0,1,2025-10-21,U0005
95,TROCCBK128F42A8872,Plunge,Fever Ray,"electronic, experimental, synthpop, idm, dark_...",2017.0,1,2025-10-27,U0005
418,TRDAZEW12903CA93B9,Under the Milky Way,The Church,"alternative, alternative_rock, 80s, soundtrack...",2012.0,2,2025-10-24,U0002
469,TRBPEMX128F42B643C,Mama's Pearl,The Jackson 5,"soul, funk",1971.0,2,2025-10-22,U0003
415,TRLPNFM128F9328881,Wrong Turn,Jack Johnson,"folk, singer_songwriter, soundtrack, acoustic,...",2006.0,3,2025-10-24,U0003
150,TRAAUMJ128F933BDB6,Haunting,Haste the Day,"hardcore, metalcore, screamo",2008.0,1,2025-10-25,U0003
456,TRHNPQY128F4260387,Rekkit,Death in Vegas,"electronic, trance, trip_hop, techno",1997.0,4,2025-10-21,U0003
318,TRGEZFW128F147FD21,We're So Far Away,MAE,"piano, emo",2005.0,2,2025-10-24,U0003
149,TRUEXDR128F934CEE1,Running Free,Iron Maiden,"metal, hard_rock, heavy_metal",2013.0,1,2025-10-21,U0005
342,TRJBHPW128F428B53E,Sexfaldur,Amiina,"ambient, instrumental, post_rock",2007.0,2,2025-10-28,U0002


In [18]:
# Turn the comma-separated genre tags into a pipe-separated list
# ("a, b" becomes "a | b"). regex=False pins literal matching: the default of
# `regex` differs between pandas versions, so being explicit keeps the result
# identical (and warning-free) regardless of the installed version.
df_listened["genre"] = df_listened["genre"].str.replace(",", " |", regex=False)
df_listened.sample(5)

Unnamed: 0,track_id,song_name,artist,genre,year,times_played,timestamp,user_id
69,TREXKDE128F423C5B8,Autosuggestion,Joy Division,80s | british | new_wave | post_punk,1988.0,1,2025-10-21,U0005
393,TRFPLAL128F93453F2,Rock Star,Hole,grunge,2010.0,2,2025-10-21,U0004
398,TRVWDBV12903CEACE0,City Noise,Scarling.,rock | alternative | female_vocalists | altern...,2006.0,3,2025-10-23,U0005
89,TRMDYTH128F933639C,Wrapped Up,Olly Murs,pop | dance | male_vocalists,2015.0,1,2025-10-24,U0001
321,TRMPKHZ128F9300224,Chestnut Mare,The Byrds,rock | classic_rock | folk | 60s | country | o...,2003.0,2,2025-10-22,U0004


In [19]:
# Persist the generated listening data as CSV, leaving the row index out of the file.
df_listened.to_csv("Docs/user_listened_data.csv", index=False)