In [11]:
from faker import Faker
import pandas as pd
import numpy as np

In [9]:
# Initialize Faker
fake = Faker()

# Define the number of users
num_users = 2000

# Generate user data
user_data = {
    'user_id': [fake.unique.uuid4() for _ in range(num_users)],
    'first_name': [fake.first_name_female() if i % 2 == 0 else fake.first_name_male() for i in range(num_users)],
    'last_name': [fake.last_name() for _ in range(num_users)],
    'age': [fake.random_int(min=16, max=78) for _ in range(num_users)],
    'sex': ['F' if i % 2 == 0 else 'M' for i in range(num_users)],
    # Placeholder for portrait_graph filenames
    'portrait_graph': [f'user_{i+1}.png' for i in range(num_users)]
}

# Create a DataFrame
users_df = pd.DataFrame(user_data)

# Save the DataFrame to a CSV file
users_df.to_csv('./data/users.csv', index=False)

In [10]:
users = pd.read_csv('./data/users.csv')
users.head()

Unnamed: 0,user_id,first_name,last_name,age,sex,portrait_graph
0,2dc0cb6c-d4f7-4a95-a9b2-daede98e819a,Nichole,Wolf,47,F,user_1.png
1,0d1905e1-91b7-4a27-b804-9c975c105ce3,Jason,Golden,29,M,user_2.png
2,835564e4-eada-4e91-879f-7eadc24f5bb4,Katherine,Snyder,27,F,user_3.png
3,b136d87d-8d51-4a9d-b655-29fb3bc816b8,Robert,Marsh,48,M,user_4.png
4,19678c5c-9d86-4c9e-a0e4-4fa173a8bb5a,Tonya,Hart,58,F,user_5.png


In [13]:
spotify_data_df = pd.read_csv('./data/spotify_data.csv')

In [19]:
# Define number of user-song relations
num_relations = 10000

# Assuming each user listens to multiple songs and each song has on average 5 listeners
num_songs = num_relations // 5

# Select a subset of songs
song_ids = spotify_data_df['song_id'].sample(n=num_songs, replace=False).tolist()
song_ids *= 5  # Each song appears 5 times

# Shuffle the song_ids to randomize distribution
np.random.shuffle(song_ids)

# Generate listening counts, ensuring that 50% of them are greater than 10
listening_counts = np.concatenate([np.random.randint(1, 11, num_relations // 2),
                                   np.random.randint(11, 500, num_relations // 2)])

# Shuffle the listening_counts to randomize distribution
np.random.shuffle(listening_counts)

# Select user_ids, allowing repetition since a user can listen to multiple songs
user_ids = np.random.choice(users['user_id'], size=num_relations, replace=True)

# Create the user-songs DataFrame
user_songs_df = pd.DataFrame({
    'user_id': user_ids,
    'song_id': song_ids,
    'listening_count': listening_counts
})

# Save the DataFrame to a CSV file
user_songs_df.to_csv('./data/user_songs.csv', index=False)

# The file 'user_songs.csv' is now saved with the dummy data

In [20]:
user_songs = pd.read_csv('./data/user_songs.csv')
user_songs.head()

Unnamed: 0,user_id,song_id,listening_count
0,baefc413-5ca5-40d4-b85e-000e2e80a003,0h083GWZ2W5scJM6JgwETQ,3
1,4da237ee-f206-4a74-866b-04c3c19ecef0,3QWlqqpc4EOm52xipzSehQ,3
2,9f10a3cb-6cd2-4d0a-8318-8f7eba13ec05,174BJkBMuLak3wSvYHASpo,9
3,b71a3b34-0f5f-4018-af95-95ccdb1a1bf9,0LCgPftb1BoxaAmKe8RwY8,1
4,5a5631d5-fdd6-461c-9f06-72de9fb76062,2WinyCfKidqagBGRVPtsTR,27


In [21]:
user_songs['listening_count'].max()

499

In [25]:
# Load the user IDs from users.csv
user_ids = users['user_id'].tolist()

# Initialize a list to hold the user-friend pairs
user_friend_pairs = []

# Set the minimum and maximum number of friends per user
min_friends = 3
max_friends = 20

# Generate the user-friend relationships ensuring the constraints are met
for user_id in user_ids:
    # Randomly determine the number of friends for this user within the range
    num_friends = np.random.randint(min_friends, max_friends + 1)
    
    # Randomly select the friends
    friends = np.random.choice([uid for uid in user_ids if uid != user_id], size=num_friends, replace=False)
    
    # Add user-friend pairs to the list, avoiding self-friendship
    for friend_id in friends:
        user_friend_pairs.append((user_id, friend_id))

        
# Since the relationships are bidirectional, we need to create the reverse pairs
reverse_pairs = [(b, a) for (a, b) in user_friend_pairs]

# Combine both lists and create a DataFrame
all_user_friend_pairs = user_friend_pairs + reverse_pairs

# Convert the list of tuples into a DataFrame
user_friends_df = pd.DataFrame(all_user_friend_pairs, columns=['user_id', 'friend_id'])

# Remove any duplicate pairs that might have occurred
user_friends_df = user_friends_df.drop_duplicates().reset_index(drop=True)

# If we have more pairs than needed, we sample the DataFrame
if len(user_friends_df) > 24000:
    user_friends_df = user_friends_df.sample(n=24000, random_state=1).reset_index(drop=True)

# Save the DataFrame to a CSV file
user_friends_df.to_csv('./data/user_friends.csv', index=False)


In [26]:
user_friends_df.head()

Unnamed: 0,user_id,friend_id
0,dff6f67f-6de5-4a0a-85cc-47b7c0d76cf3,07a9ecf4-e76a-46a7-8901-56f2cf142cdc
1,19c210f8-693d-4824-a9e2-7d9c6494b0a0,325a00b8-db7e-49e1-9a40-caf503cf99c5
2,5a5631d5-fdd6-461c-9f06-72de9fb76062,6ddbd57a-cb18-4c1a-9dd7-1569c51621f8
3,79714cee-9768-4721-999a-0444b7db6705,1a855f68-9edd-4523-87ee-beeb27a7274c
4,93dd8a29-5472-46fe-8fd8-28c4ffb4ea51,32e38ea8-4052-49ba-afd9-cc22db17178a
