In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Read Data

In [2]:
# Load the train / validation / test rating splits of three MovieLens variants
# and stack the three splits of each variant into one combined frame.
# Every split is read as a headerless, comma-separated file with four columns.
rating_read_opts = dict(sep=',', header=None, names=["user_id", "item_id", "rating", "timestamp"])

# MovieLens 100K
mlhundredk_train_df = pd.read_csv('ThesisData/ml-100k/ml-100k_time/ml-100k.train.rating', **rating_read_opts)
mlhundredk_val_df = pd.read_csv('ThesisData/ml-100k/ml-100k_time/ml-100k.valid.rating', **rating_read_opts)
mlhundredk_test_df = pd.read_csv('ThesisData/ml-100k/ml-100k_time/ml-100k.test.rating', **rating_read_opts)
# pd.concat replaces DataFrame.append (removed in pandas 2.0); like append it
# keeps each split's original row index.
mlhundredk_df = pd.concat([mlhundredk_train_df, mlhundredk_val_df, mlhundredk_test_df])

# MovieLens 1M
mlonem_train_df = pd.read_csv('ThesisData/ml-1m/ml-1m_time/ml-1m.train.rating', **rating_read_opts)
mlonem_val_df = pd.read_csv('ThesisData/ml-1m/ml-1m_time/ml-1m.valid.rating', **rating_read_opts)
mlonem_test_df = pd.read_csv('ThesisData/ml-1m/ml-1m_time/ml-1m.test.rating', **rating_read_opts)
mlonem_df = pd.concat([mlonem_train_df, mlonem_val_df, mlonem_test_df])

# MovieLens Latest Small
mllatsmall_train_df = pd.read_csv('ThesisData/ml-latest-small/ml-latest-small_time/ml-latest-small.train.rating', **rating_read_opts)
mllatsmall_val_df = pd.read_csv('ThesisData/ml-latest-small/ml-latest-small_time/ml-latest-small.valid.rating', **rating_read_opts)
mllatsmall_test_df = pd.read_csv('ThesisData/ml-latest-small/ml-latest-small_time/ml-latest-small.test.rating', **rating_read_opts)
mllatsmall_df = pd.concat([mllatsmall_train_df, mllatsmall_val_df, mllatsmall_test_df])

# Identify User Groups
We identify three user groups: users with the fewest ratings ($Low$), users with the most ratings ($High$), and users whose number of ratings is closest to the median ($Med$). These three user groups are constructed for five different datasets: Douban, Hetrec-MovieLens, MovieLens 1M, Ciao, and Jester.
## Douban

In [43]:
# Build the low / med / high user groups for the Douban ratings, export the
# three user lists and the ratings of the selected users, then print counts.
# NOTE(review): `db_df` is not created in the visible part of this notebook —
# confirm its loading cell runs before this one on a fresh kernel.

# Number of ratings per user, ordered from fewest to most.
user_popularity = db_df.groupby("user_id").size().sort_values(ascending=True)
# Each group holds 5% of the distinct users, rounded to a whole number.
n_5p_users = int(np.round(db_df["user_id"].nunique() * 0.05))  # TODO (original author): change to 0.10

# head()/tail() select by position and, unlike a `[-n:]` slice, give an empty
# group (not every user) when n_5p_users is 0.
low_users = user_popularity.head(n_5p_users).index.tolist()
# Users whose rating count has the smallest absolute distance to the median.
med_users = (
    (user_popularity - user_popularity.median())
    .abs()
    .sort_values(ascending=True)
    .head(n_5p_users)
    .index.tolist()
)
high_users = user_popularity.tail(n_5p_users).index.tolist()
print("Constructed low (%d users), med (%d users), high (%d users) usergroups." % (len(low_users), len(med_users), len(high_users)))

# One user id per line, without header or index column.
pd.DataFrame(low_users).to_csv("data/User Groups/db_low.userlist", header=False, index=False)
pd.DataFrame(med_users).to_csv("data/User Groups/db_med.userlist", header=False, index=False)
pd.DataFrame(high_users).to_csv("data/User Groups/db_high.userlist", header=False, index=False)

# Ratings of the selected users, stacked in low, med, high order.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df = pd.concat([
    db_df[db_df["user_id"].isin(low_users)],
    db_df[db_df["user_id"].isin(med_users)],
    db_df[db_df["user_id"].isin(high_users)],
])
df.to_csv("data/User Groups/db_ratings.txt", index=False)

# Summary of the exported subset: distinct users, distinct items, rating rows.
n_users = df["user_id"].nunique()
n_items = df["item_id"].nunique()
n_ratings = len(df)
print("|U|: %d, |I|: %d, |R|: %d" % (n_users, n_items, n_ratings))

Constructed low (125 users), med (125 users), high (125 users) usergroups.
|U|: 375, |I|: 32191, |R|: 266517


## Hetrec-MovieLens

In [44]:
# Build the low / med / high user groups for the Hetrec-MovieLens ratings,
# export the three user lists and the ratings of the selected users, then
# print counts.
# NOTE(review): `html_df` is not created in the visible part of this notebook —
# confirm its loading cell runs before this one on a fresh kernel.

# Number of ratings per user, ordered from fewest to most.
user_popularity = html_df.groupby("user_id").size().sort_values(ascending=True)
# Each group holds 5% of the distinct users, rounded to a whole number.
n_5p_users = int(np.round(html_df["user_id"].nunique() * 0.05))

# head()/tail() select by position and, unlike a `[-n:]` slice, give an empty
# group (not every user) when n_5p_users is 0.
low_users = user_popularity.head(n_5p_users).index.tolist()
# Users whose rating count has the smallest absolute distance to the median.
med_users = (
    (user_popularity - user_popularity.median())
    .abs()
    .sort_values(ascending=True)
    .head(n_5p_users)
    .index.tolist()
)
high_users = user_popularity.tail(n_5p_users).index.tolist()
print("Constructed low (%d users), med (%d users), high (%d users) usergroups." % (len(low_users), len(med_users), len(high_users)))

# One user id per line, without header or index column.
pd.DataFrame(low_users).to_csv("data/User Groups/ht-ml_low.userlist", header=False, index=False)
pd.DataFrame(med_users).to_csv("data/User Groups/ht-ml_med.userlist", header=False, index=False)
pd.DataFrame(high_users).to_csv("data/User Groups/ht-ml_high.userlist", header=False, index=False)
# Ratings of the selected users, stacked in low, med, high order.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df = pd.concat([
    html_df[html_df["user_id"].isin(low_users)],
    html_df[html_df["user_id"].isin(med_users)],
    html_df[html_df["user_id"].isin(high_users)],
])
df.to_csv("data/User Groups/ht-ml_ratings.txt", index=False)

# Summary of the exported subset: distinct users, distinct items, rating rows.
n_users = df["user_id"].nunique()
n_items = df["item_id"].nunique()
n_ratings = len(df)
print("|U|: %d, |I|: %d, |R|: %d" % (n_users, n_items, n_ratings))

Constructed low (106 users), med (106 users), high (106 users) usergroups.
|U|: 318, |I|: 9553, |R|: 207943


## MovieLens 1M

In [45]:
# Build the low / med / high user groups for the MovieLens 1M ratings held in
# `ml_df`, export the three user lists and the ratings of the selected users,
# then print counts.
# NOTE(review): `ml_df` is not created in the visible part of this notebook —
# confirm its loading cell runs before this one on a fresh kernel.

# Number of ratings per user, ordered from fewest to most.
user_popularity = ml_df.groupby("user_id").size().sort_values(ascending=True)
# Each group holds 5% of the distinct users, rounded to a whole number.
n_5p_users = int(np.round(ml_df["user_id"].nunique() * 0.05))

# head()/tail() select by position and, unlike a `[-n:]` slice, give an empty
# group (not every user) when n_5p_users is 0.
low_users = user_popularity.head(n_5p_users).index.tolist()
# Users whose rating count has the smallest absolute distance to the median.
med_users = (
    (user_popularity - user_popularity.median())
    .abs()
    .sort_values(ascending=True)
    .head(n_5p_users)
    .index.tolist()
)
high_users = user_popularity.tail(n_5p_users).index.tolist()
print("Constructed low (%d users), med (%d users), high (%d users) usergroups." % (len(low_users), len(med_users), len(high_users)))

# One user id per line, without header or index column.
pd.DataFrame(low_users).to_csv("data/User Groups/ml_low.userlist", header=False, index=False)
pd.DataFrame(med_users).to_csv("data/User Groups/ml_med.userlist", header=False, index=False)
pd.DataFrame(high_users).to_csv("data/User Groups/ml_high.userlist", header=False, index=False)
# Ratings of the selected users, stacked in low, med, high order.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df = pd.concat([
    ml_df[ml_df["user_id"].isin(low_users)],
    ml_df[ml_df["user_id"].isin(med_users)],
    ml_df[ml_df["user_id"].isin(high_users)],
])
df.to_csv("data/User Groups/ml_ratings.txt", index=False)

# Summary of the exported subset: distinct users, distinct items, rating rows.
n_users = df["user_id"].nunique()
n_items = df["item_id"].nunique()
n_ratings = len(df)
print("|U|: %d, |I|: %d, |R|: %d" % (n_users, n_items, n_ratings))

Constructed low (302 users), med (302 users), high (302 users) usergroups.
|U|: 906, |I|: 3613, |R|: 275119


## Ciao

In [46]:
# Build the low / med / high user groups for the Ciao ratings, export the
# three user lists and the ratings of the selected users, then print counts.
# NOTE(review): `ciao_df` is not created in the visible part of this notebook —
# confirm its loading cell runs before this one on a fresh kernel.

# Number of ratings per user, ordered from fewest to most.
user_popularity = ciao_df.groupby("user_id").size().sort_values(ascending=True)
# Each group holds 5% of the distinct users, rounded to a whole number.
n_5p_users = int(np.round(ciao_df["user_id"].nunique() * 0.05))

# head()/tail() select by position and, unlike a `[-n:]` slice, give an empty
# group (not every user) when n_5p_users is 0.
low_users = user_popularity.head(n_5p_users).index.tolist()
# Users whose rating count has the smallest absolute distance to the median.
med_users = (
    (user_popularity - user_popularity.median())
    .abs()
    .sort_values(ascending=True)
    .head(n_5p_users)
    .index.tolist()
)
high_users = user_popularity.tail(n_5p_users).index.tolist()
print("Constructed low (%d users), med (%d users), high (%d users) usergroups." % (len(low_users), len(med_users), len(high_users)))

# One user id per line, without header or index column.
pd.DataFrame(low_users).to_csv("data/User Groups/ciao_low.userlist", header=False, index=False)
pd.DataFrame(med_users).to_csv("data/User Groups/ciao_med.userlist", header=False, index=False)
pd.DataFrame(high_users).to_csv("data/User Groups/ciao_high.userlist", header=False, index=False)
# Ratings of the selected users, stacked in low, med, high order.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df = pd.concat([
    ciao_df[ciao_df["user_id"].isin(low_users)],
    ciao_df[ciao_df["user_id"].isin(med_users)],
    ciao_df[ciao_df["user_id"].isin(high_users)],
])
df.to_csv("data/User Groups/ciao_ratings.txt", index=False)

# Summary of the exported subset: distinct users, distinct items, rating rows.
n_users = df["user_id"].nunique()
n_items = df["item_id"].nunique()
n_ratings = len(df)
print("|U|: %d, |I|: %d, |R|: %d" % (n_users, n_items, n_ratings))

Constructed low (369 users), med (369 users), high (369 users) usergroups.
|U|: 1107, |I|: 60132, |R|: 107807


## Jester

In [47]:
# Build the low / med / high user groups for the Jester ratings, export the
# three user lists and the ratings of the selected users, then print counts.
# FIXME: the recorded output of this cell is
# "NameError: name 'jester_df' is not defined" — no visible cell creates
# `jester_df`, so it must be loaded before this cell can run.

# Number of ratings per user, ordered from fewest to most.
user_popularity = jester_df.groupby("user_id").size().sort_values(ascending=True)
# Each group holds 5% of the distinct users, rounded to a whole number.
n_5p_users = int(np.round(jester_df["user_id"].nunique() * 0.05))

# head()/tail() select by position and, unlike a `[-n:]` slice, give an empty
# group (not every user) when n_5p_users is 0.
low_users = user_popularity.head(n_5p_users).index.tolist()
# Users whose rating count has the smallest absolute distance to the median.
med_users = (
    (user_popularity - user_popularity.median())
    .abs()
    .sort_values(ascending=True)
    .head(n_5p_users)
    .index.tolist()
)
high_users = user_popularity.tail(n_5p_users).index.tolist()
print("Constructed low (%d users), med (%d users), high (%d users) usergroups." % (len(low_users), len(med_users), len(high_users)))

# One user id per line, without header or index column.
pd.DataFrame(low_users).to_csv("data/User Groups/jester_low.userlist", header=False, index=False)
pd.DataFrame(med_users).to_csv("data/User Groups/jester_med.userlist", header=False, index=False)
pd.DataFrame(high_users).to_csv("data/User Groups/jester_high.userlist", header=False, index=False)
# Ratings of the selected users, stacked in low, med, high order.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df = pd.concat([
    jester_df[jester_df["user_id"].isin(low_users)],
    jester_df[jester_df["user_id"].isin(med_users)],
    jester_df[jester_df["user_id"].isin(high_users)],
])
df.to_csv("data/User Groups/jester_ratings.txt", index=False)

# Summary of the exported subset: distinct users, distinct items, rating rows.
n_users = df["user_id"].nunique()
n_items = df["item_id"].nunique()
n_ratings = len(df)
print("|U|: %d, |I|: %d, |R|: %d" % (n_users, n_items, n_ratings))

NameError: name 'jester_df' is not defined

### Datasets used in the user profile minimization thesis:

## MovieLens 100K

In [3]:
# Build the low / med / high user groups for the combined MovieLens 100K
# ratings (`mlhundredk_df`), export the three user lists and the ratings of
# the selected users, then print counts.

# Number of ratings per user, ordered from fewest to most.
user_popularity = mlhundredk_df.groupby("user_id").size().sort_values(ascending=True)
# Despite its name, n_5p_users is 15% of the distinct users in this cell; the
# name is kept so anything that reads it afterwards keeps working.
n_5p_users = int(np.round(mlhundredk_df["user_id"].nunique() * 0.15))

# head()/tail() select by position and, unlike a `[-n:]` slice, give an empty
# group (not every user) when n_5p_users is 0.
low_users = user_popularity.head(n_5p_users).index.tolist()
# Users whose rating count has the smallest absolute distance to the median.
med_users = (
    (user_popularity - user_popularity.median())
    .abs()
    .sort_values(ascending=True)
    .head(n_5p_users)
    .index.tolist()
)
high_users = user_popularity.tail(n_5p_users).index.tolist()
print("Constructed low (%d users), med (%d users), high (%d users) usergroups." % (len(low_users), len(med_users), len(high_users)))

# One user id per line, without header or index column.
# NOTE(review): the user lists go to the "time/" subfolder while the ratings
# file below is written one level up — confirm this split is intended.
pd.DataFrame(low_users).to_csv("data/User Groups/time/ml100k_low.userlist", header=False, index=False)
pd.DataFrame(med_users).to_csv("data/User Groups/time/ml100k_med.userlist", header=False, index=False)
pd.DataFrame(high_users).to_csv("data/User Groups/time/ml100k_high.userlist", header=False, index=False)
# Ratings of the selected users, stacked in low, med, high order.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df = pd.concat([
    mlhundredk_df[mlhundredk_df["user_id"].isin(low_users)],
    mlhundredk_df[mlhundredk_df["user_id"].isin(med_users)],
    mlhundredk_df[mlhundredk_df["user_id"].isin(high_users)],
])
df.to_csv("data/User Groups/ml100k_ratings.txt", index=False)

# Summary of the exported subset: distinct users, distinct items, rating rows.
n_users = df["user_id"].nunique()
n_items = df["item_id"].nunique()
n_ratings = len(df)
print("|U|: %d, |I|: %d, |R|: %d" % (n_users, n_items, n_ratings))
# Show the per-user rating counts (pandas truncates the printed Series).
print(user_popularity)

Constructed low (141 users), med (141 users), high (141 users) usergroups.
|U|: 423, |I|: 1658, |R|: 54525
user_id
380     20
852     20
642     20
258     20
629     20
      ... 
37     518
442    540
58     636
650    685
402    737
Length: 943, dtype: int64


## MovieLens 1M

In [5]:
# Build the low / med / high user groups for the combined MovieLens 1M
# ratings (`mlonem_df`), export the three user lists and the ratings of the
# selected users, then print counts.

# Number of ratings per user, ordered from fewest to most.
user_popularity = mlonem_df.groupby("user_id").size().sort_values(ascending=True)
# Each group holds 5% of the distinct users, rounded to a whole number.
n_5p_users = int(np.round(mlonem_df["user_id"].nunique() * 0.05))

# head()/tail() select by position and, unlike a `[-n:]` slice, give an empty
# group (not every user) when n_5p_users is 0.
low_users = user_popularity.head(n_5p_users).index.tolist()
# Users whose rating count has the smallest absolute distance to the median.
med_users = (
    (user_popularity - user_popularity.median())
    .abs()
    .sort_values(ascending=True)
    .head(n_5p_users)
    .index.tolist()
)
high_users = user_popularity.tail(n_5p_users).index.tolist()
print("Constructed low (%d users), med (%d users), high (%d users) usergroups." % (len(low_users), len(med_users), len(high_users)))

# One user id per line, without header or index column.
pd.DataFrame(low_users).to_csv("data/User Groups/ml1m_low.userlist", header=False, index=False)
pd.DataFrame(med_users).to_csv("data/User Groups/ml1m_med.userlist", header=False, index=False)
pd.DataFrame(high_users).to_csv("data/User Groups/ml1m_high.userlist", header=False, index=False)
# Ratings of the selected users, stacked in low, med, high order.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df = pd.concat([
    mlonem_df[mlonem_df["user_id"].isin(low_users)],
    mlonem_df[mlonem_df["user_id"].isin(med_users)],
    mlonem_df[mlonem_df["user_id"].isin(high_users)],
])
df.to_csv("data/User Groups/ml1m_ratings.txt", index=False)

# Summary of the exported subset: distinct users, distinct items, rating rows.
n_users = df["user_id"].nunique()
n_items = df["item_id"].nunique()
n_ratings = len(df)
print("|U|: %d, |I|: %d, |R|: %d" % (n_users, n_items, n_ratings))

Constructed low (302 users), med (302 users), high (302 users) usergroups.
|U|: 906, |I|: 3613, |R|: 275119


## MovieLens Latest Small

In [7]:
# Build the low / med / high user groups for the combined MovieLens Latest
# Small ratings (`mllatsmall_df`), export the three user lists and the ratings
# of the selected users, then print counts.

# Number of ratings per user, ordered from fewest to most.
user_popularity = mllatsmall_df.groupby("user_id").size().sort_values(ascending=True)
# Despite its name, n_5p_users is 20% of the distinct users in this cell; the
# name is kept so anything that reads it afterwards keeps working.
n_5p_users = int(np.round(mllatsmall_df["user_id"].nunique() * 0.20))

# head()/tail() select by position and, unlike a `[-n:]` slice, give an empty
# group (not every user) when n_5p_users is 0.
low_users = user_popularity.head(n_5p_users).index.tolist()
# Users whose rating count has the smallest absolute distance to the median.
med_users = (
    (user_popularity - user_popularity.median())
    .abs()
    .sort_values(ascending=True)
    .head(n_5p_users)
    .index.tolist()
)
high_users = user_popularity.tail(n_5p_users).index.tolist()
print("Constructed low (%d users), med (%d users), high (%d users) usergroups." % (len(low_users), len(med_users), len(high_users)))

# One user id per line, without header or index column.
pd.DataFrame(low_users).to_csv("data/User Groups/mlls_low.userlist", header=False, index=False)
pd.DataFrame(med_users).to_csv("data/User Groups/mlls_med.userlist", header=False, index=False)
pd.DataFrame(high_users).to_csv("data/User Groups/mlls_high.userlist", header=False, index=False)
# Ratings of the selected users, stacked in low, med, high order.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df = pd.concat([
    mllatsmall_df[mllatsmall_df["user_id"].isin(low_users)],
    mllatsmall_df[mllatsmall_df["user_id"].isin(med_users)],
    mllatsmall_df[mllatsmall_df["user_id"].isin(high_users)],
])
df.to_csv("data/User Groups/mlls_ratings.txt", index=False)

# Summary of the exported subset: distinct users, distinct items, rating rows.
n_users = df["user_id"].nunique()
n_items = df["item_id"].nunique()
n_ratings = len(df)
print("|U|: %d, |I|: %d, |R|: %d" % (n_users, n_items, n_ratings))

Constructed low (122 users), med (122 users), high (122 users) usergroups.
|U|: 366, |I|: 9348, |R|: 77075
