In [1]:
import pandas as pd
import random
from datetime import datetime, timedelta

In [25]:
# Load Docs/Lastfm.csv into a DataFrame; the file uses ';' as its field separator.
df = pd.read_csv("Docs/Lastfm.csv", sep=';')

In [26]:
# Discard rows missing any of the required fields, then keep a single row per track_id.
required_cols = ["track_id", "song_name", "artist", "genre", "year"]
df = df.dropna(subset=required_cols).drop_duplicates(subset=["track_id"])

In [27]:
# Draw 300 rows with a fixed seed, then renumber them 0..299.
sampled_rows = df.sample(n=300, random_state=42)
df_300 = sampled_rows.reset_index(drop=True)

In [28]:
# Mark the 300 sampled songs as played exactly once.
df_300["times_played"] = 1

# Pick 200 *other* songs to act as repeated listens (times_played between 2 and 5).
# df_300 had its index reset, so its labels (0..299) no longer identify the sampled
# rows inside df; dropping by index would remove the wrong rows (or raise KeyError).
# Exclude the sampled songs by track_id instead, which is unique after drop_duplicates.
already_sampled = df["track_id"].isin(df_300["track_id"])
df_remaining = df[~already_sampled].reset_index(drop=True)
df_repeat = df_remaining.sample(n=200, random_state=1).copy()
# NOTE: the `random` module is not seeded in this cell, so these counts vary per run.
df_repeat["times_played"] = [random.randint(2, 5) for _ in range(len(df_repeat))]

# Stack both groups into a single listening-history frame with a fresh 0..N-1 index.
df_listened = pd.concat([df_300, df_repeat], ignore_index=True)


In [29]:
def random_date(start, end):
    """Return `start` shifted forward by a random whole number of days.

    The offset is drawn uniformly from 0 to ``(end - start).days`` inclusive,
    so the result keeps `start`'s time of day and never goes past `end`.
    """
    span_days = (end - start).days
    offset_days = random.randint(0, span_days)
    return start + timedelta(days=offset_days)

In [30]:
# Window from which each row's listening date is drawn (both ends inclusive).
start_date = datetime(2025, 10, 21)
end_date = datetime(2025, 10, 28)

# One random ISO date string (YYYY-MM-DD) per row of df_listened.
iso_dates = []
for _ in range(len(df_listened)):
    picked = random_date(start_date, end_date)
    iso_dates.append(picked.date().isoformat())
df_listened["timestamp"] = iso_dates

In [31]:
# Build the pool of 5 user ids: U0001 .. U0005.
user_ids = []
for i in range(1, 6):
    user_ids.append(f"U{str(i).zfill(4)}")

# Give every row a user id picked at random from the pool.
row_count = len(df_listened)
df_listened["user_id"] = [random.choice(user_ids) for _ in range(row_count)]

In [32]:
# Print a summary of df_listened: its columns, non-null counts and dtypes.
df_listened.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   track_id      500 non-null    object 
 1   song_name     500 non-null    object 
 2   artist        500 non-null    object 
 3   genre         500 non-null    object 
 4   year          500 non-null    float64
 5   times_played  500 non-null    int64  
 6   timestamp     500 non-null    object 
 7   user_id       500 non-null    object 
dtypes: float64(1), int64(1), object(6)
memory usage: 31.4+ KB


In [35]:
# 1. Make sure 'times_played' is numeric (values that cannot be parsed become NaN).
df_listened["times_played"] = pd.to_numeric(df_listened["times_played"], errors="coerce")

# 2. Normalise 'timestamp' to ISO date strings (YYYY-MM-DD). The column already holds
#    ISO dates, so parse it with that explicit format instead of dayfirst=True, which
#    does not describe year-first strings.
df_listened["timestamp"] = (
    pd.to_datetime(df_listened["timestamp"], format="%Y-%m-%d").dt.date.astype(str)
)

# 3. Cast every remaining column to string. The names in the exclusion list must match
#    the column names exactly, otherwise 'times_played' would be turned back into text.
for col in df_listened.columns:
    if col not in ["times_played", "timestamp"]:
        df_listened[col] = df_listened[col].astype(str)


In [36]:
# Swap every comma in 'genre' for " |", so the tags read "a | b | c".
genre_piped = df_listened["genre"].str.replace(",", " |")
df_listened["genre"] = genre_piped

# Show 5 random rows as a quick visual check.
df_listened.sample(5)

Unnamed: 0,track_id,song_name,artist,genre,year,times_played,timestamp,user_id
162,TRRJELJ128F429DB3A,Third Stream,4hero,electronic | jazz | drum_and_bass,1998.0,1,2025-10-26,U0005
245,TRGLFSX128F422C071,Vicarious,Tool,rock | alternative | metal | alternative_rock ...,2006.0,1,2025-10-24,U0004
60,TRFEPFQ128EF356BA0,Fly Me To The Moon (In Other Words),Bobby Womack,soul | cover,2005.0,1,2025-10-21,U0004
223,TRAPHGM128F4278969,Ode To Boy,Yazoo,electronic | 80s | new_wave | synthpop,1999.0,1,2025-10-22,U0001
298,TRTUMEF12903CF59C6,Inner Incineration,Napalm Death,death_metal | grindcore,1990.0,1,2025-10-23,U0003


In [37]:
# Write the listening history to CSV; index=False keeps the row index out of the file.
df_listened.to_csv("Docs/user_listened_data.csv", index=False)