In [9]:
import numpy as np
from scipy.sparse.linalg import svds

# Sample User-Item Interaction Matrix
R = np.array([
    [5, 3, 0, 1],
    [4, 0, 0, 1],
    [1, 1, 0, 5],
    [1, 0, 0, 4],
    [0, 1, 5, 4],
])

# Mean normalization
user_ratings_mean = np.mean(R, axis=1)
R_demeaned = R - user_ratings_mean.reshape(-1, 1)

# Perform SVD
U, sigma, Vt = svds(R_demeaned, k=2)

# Convert sigma to a diagonal matrix
sigma = np.diag(sigma)

# Reconstruct the ratings matrix
predicted_ratings = np.dot(np.dot(U, sigma), Vt) + user_ratings_mean.reshape(-1, 1)

print("Original User-Item Interaction Matrix:\n", R)
print("\nPredicted Ratings:\n", np.round(predicted_ratings, 2))


Original User-Item Interaction Matrix:
 [[5 3 0 1]
 [4 0 0 1]
 [1 1 0 5]
 [1 0 0 4]
 [0 1 5 4]]

Predicted Ratings:
 [[ 5.1   2.82  0.09  0.99]
 [ 3.24  1.35 -0.71  1.11]
 [ 1.3   0.47  0.28  4.96]
 [ 0.92  0.15 -0.08  4.01]
 [-0.45  1.81  4.58  4.07]]


In [10]:
import pandas as pd

# Specify the path to your CSV file
csv_file_path = 'steam-200k.csv'  # Change this to your file path

# Read the CSV file into a DataFrame
data = pd.read_csv(csv_file_path)
data.columns = ['ID', 'Game', 'Stat', 'Play Time', 'N']
data.drop(columns=['N'], inplace=True)
# Display the first few rows of the DataFrame
all_IDS = data["ID"].unique()
all_games = data["Game"].unique()

print(len(all_games))


single_player = data[data.iloc[:, 0] == all_IDS[0]]
print(single_player)



5155
           ID                                     Game      Stat  Play Time
0   151603712               The Elder Scrolls V Skyrim      play      273.0
1   151603712                                Fallout 4  purchase        1.0
2   151603712                                Fallout 4      play       87.0
3   151603712                                    Spore  purchase        1.0
4   151603712                                    Spore      play       14.9
..        ...                                      ...       ...        ...
60  151603712             HuniePop Original Soundtrack  purchase        1.0
61  151603712            The Banner Saga - Mod Content  purchase        1.0
62  151603712   The Elder Scrolls V Skyrim - Dawnguard  purchase        1.0
63  151603712  The Elder Scrolls V Skyrim - Dragonborn  purchase        1.0
64  151603712  The Elder Scrolls V Skyrim - Hearthfire  purchase        1.0

[65 rows x 4 columns]


We are going to filter out this different purchases and non purchases we will only look at the play time

In [11]:
without_purchase = data[data.iloc[:, 2] == "play"]
with_purchase = data[data.iloc[:, 2] == "purchase"]
print(with_purchase)

               ID                        Game      Stat  Play Time
1       151603712                   Fallout 4  purchase        1.0
3       151603712                       Spore  purchase        1.0
5       151603712           Fallout New Vegas  purchase        1.0
7       151603712               Left 4 Dead 2  purchase        1.0
9       151603712                    HuniePop  purchase        1.0
...           ...                         ...       ...        ...
199989  128470551                Fallen Earth  purchase        1.0
199991  128470551                 Magic Duels  purchase        1.0
199993  128470551                 Titan Souls  purchase        1.0
199995  128470551  Grand Theft Auto Vice City  purchase        1.0
199997  128470551                        RUSH  purchase        1.0

[129510 rows x 4 columns]


In [12]:
# Define the shape
shape = (len(all_IDS), len(all_games)) 
k = 0
# Create a 2D array filled with k
array_2d = np.full(shape, k)
print(array_2d)


[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [13]:

top_ids = all_IDS[:len(all_IDS)//2]
top_bottom = all_IDS[len(all_IDS)//2:]


In [14]:
import time

In [15]:

def is_empty_play_time(series):
    return (series.empty and
            series.name == 'Play Time' and
            series.dtype == 'float64')
alpha = []
beta = []

'''start_time = time.time()
all_games1 = all_games
all_IDS1 = all_IDS[:2]

for id_index, id in enumerate(all_IDS1):
    for game_index, game in enumerate(all_games1):
        x = without_purchase[(without_purchase.iloc[:, 0] == id) & (without_purchase['Game'] == game)]['Play Time']
        if not is_empty_play_time(x):
            array_2d[id_index][game_index] = float(x)
        else:
            array_2d[id_index][game_index] = 0
end_time = time.time()

diff = end_time - start_time
beta.append(diff)
print(diff)
print(array_2d)
'''





"start_time = time.time()\nall_games1 = all_games\nall_IDS1 = all_IDS[:2]\n\nfor id_index, id in enumerate(all_IDS1):\n    for game_index, game in enumerate(all_games1):\n        x = without_purchase[(without_purchase.iloc[:, 0] == id) & (without_purchase['Game'] == game)]['Play Time']\n        if not is_empty_play_time(x):\n            array_2d[id_index][game_index] = float(x)\n        else:\n            array_2d[id_index][game_index] = 0\nend_time = time.time()\n\ndiff = end_time - start_time\nbeta.append(diff)\nprint(diff)\nprint(array_2d)\n"

In [16]:
"""

def process_play_times(without_purchase, all_IDS, all_games):
    # Create ID and Game lookup dictionaries for faster indexing
    id_to_index = {id_: idx for idx, id_ in enumerate(all_IDS)}
    game_to_index = {game: idx for idx, game in enumerate(all_games)}
    
    # Initialize array with zeros
    array_2d = np.zeros((len(all_IDS), len(all_games)))
    
    # Get the ID column name
    id_col = without_purchase.columns[0]  # Assuming ID is first column
    
    # Instead of pivot, use groupby and unstack to handle duplicates
    # We'll take the sum of play times for duplicate entries
    pivot_df = (without_purchase
                .groupby([id_col, 'Game'])['Play Time']
                .sum()
                .unstack(fill_value=0))
    
    # Transfer values to array_2d using the lookup dictionaries
    for id_ in pivot_df.index:
        if id_ in id_to_index:
            i = id_to_index[id_]
            for game in pivot_df.columns:
                if game in game_to_index:
                    j = game_to_index[game]
                    array_2d[i, j] = pivot_df.loc[id_, game]
    
    return array_2d





start_time = time.time()
array_2d2 = process_play_times(with_purchase, all_IDS, all_games)
end_time = time.time()
diff = end_time - start_time

print(diff)
"""

"\n\ndef process_play_times(without_purchase, all_IDS, all_games):\n    # Create ID and Game lookup dictionaries for faster indexing\n    id_to_index = {id_: idx for idx, id_ in enumerate(all_IDS)}\n    game_to_index = {game: idx for idx, game in enumerate(all_games)}\n    \n    # Initialize array with zeros\n    array_2d = np.zeros((len(all_IDS), len(all_games)))\n    \n    # Get the ID column name\n    id_col = without_purchase.columns[0]  # Assuming ID is first column\n    \n    # Instead of pivot, use groupby and unstack to handle duplicates\n    # We'll take the sum of play times for duplicate entries\n    pivot_df = (without_purchase\n                .groupby([id_col, 'Game'])['Play Time']\n                .sum()\n                .unstack(fill_value=0))\n    \n    # Transfer values to array_2d using the lookup dictionaries\n    for id_ in pivot_df.index:\n        if id_ in id_to_index:\n            i = id_to_index[id_]\n            for game in pivot_df.columns:\n                i

In [17]:
import numpy as np

def process_play_times(without_purchase, all_IDS, all_games):
    # Create ID and Game lookup dictionaries for fast indexing
    id_to_index = {id_: idx for idx, id_ in enumerate(all_IDS)}
    game_to_index = {game: idx for idx, game in enumerate(all_games)}
    
    # Initialize array with zeros
    array_2d = np.zeros((len(all_IDS), len(all_games)))
    
    # Get the ID column name (assuming ID is first column)
    id_col = without_purchase.columns[0]
    
    # Group by ID and Game and sum the 'Play Time'
    grouped = (without_purchase
               .groupby([id_col, 'Game'])['Play Time']
               .sum()
               .reset_index())
    
    # Map the 'ID' and 'Game' columns to indices
    id_indices = grouped[id_col].map(id_to_index).values
    game_indices = grouped['Game'].map(game_to_index).values
    play_times = grouped['Play Time'].values
    
    # Use numpy's advanced indexing to assign values to the 2D array
    array_2d[id_indices, game_indices] = play_times
    
    return array_2d

start_time = time.time()
array_2d2 = process_play_times(with_purchase, all_IDS, all_games)
end_time = time.time()
diff = end_time - start_time

print(diff)


0.1621875762939453


In [18]:
print(array_2d2)
print(array_2d2.shape)

[[0. 1. 1. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
(12393, 5155)


In [19]:
np.savetxt('output_array.csv', array_2d2, delimiter=',', fmt='%d')