# Data generation and pre-processing #

In [1]:
%pip install lenskit




In [6]:
from lenskit.datasets import ML1M
from lenskit import batch, topn, util
from lenskit import crossfold as xf
from lenskit.algorithms import Recommender, als, item_knn as knn
from lenskit import topn

In [7]:
import pandas as pd
import tqdm

### Load the desired dataset (ML100K / ML1M / MovieLens)

In [8]:
ml1m = ML1M('ml-1m\ml-1m')
ratings = ml1m.ratings
#ratings.drop(['timestamp'], axis=1, inplace=True)          
ratings.head()

print(len(ratings))

1000209


### Ensure no missing values are present

In [49]:
data = ratings.copy()

unique_id = data.user.unique()
dictID = {}
i = 1
for k in unique_id:
    dictID[k] = i
    i += 1
data['userID'] = data.user.map(dictID)

unique_i_id = data.item.unique()
dictID_i = {}
j = 1
for k in unique_i_id:
    dictID_i[k] = j
    j += 1
data['itemID'] = data.item.map(dictID_i)

data = data [['userID', 'itemID', 'rating', 'user', 'item', 'timestamp']]
del data['user']
del data['item']
print(len(data.userID.unique()))
print(len(data.itemID.unique()))

# user_info = {k: v for k, v in user_info.items() if k in filter_users_list}
data = data.rename(columns={'userID': 'user', 'itemID': 'item'})
data 

610
9724


Unnamed: 0,user,item,rating,timestamp
0,1,1,4.0,964982703
1,1,2,4.0,964981247
2,1,3,4.0,964982224
3,1,4,5.0,964983815
4,1,5,5.0,964982931
...,...,...,...,...
100831,610,3121,4.0,1493848402
100832,610,2036,5.0,1493850091
100833,610,3122,5.0,1494273047
100834,610,1393,5.0,1493846352


In [50]:
print(data.item.max())
data.item.value_counts()

9724


21      329
233     317
17      307
35      279
167     278
       ... 
7973      1
8853      1
4755      1
6802      1
8895      1
Name: item, Length: 9724, dtype: int64

In [51]:
print(data.head())
print("unique movies id's: ",len(data["item"].unique())) 
print("amount of ratings: ",len(data["item"]))
print("amount of users: ",len(data["user"].unique()))
print("Amount of duplicates = ",len(data["item"].unique()) - len(data["item"].unique()))

   user  item  rating  timestamp
0     1     1     4.0  964982703
1     1     2     4.0  964981247
2     1     3     4.0  964982224
3     1     4     5.0  964983815
4     1     5     5.0  964982931
unique movies id's:  9724
amount of ratings:  100836
amount of users:  610
Amount of duplicates =  0


### Splitting the data into test, validation and training sets *at random*

In [11]:
import lenskit.crossfold as xf

print(data)

# Decrease user_id and item_id by 1 in order to start counting from 0
data['user']-=1
data['item']-=1

for i, tp in enumerate(xf.partition_users(data, 1, xf.SampleFrac(0.1))):
    tp.train.to_csv(r'C:\Users\fleur\Thesis B3\RQ0\RobustnessOfMetaMF-master\RobustnessOfMetaMF-master\ThesisData\ml-1m\ml-1m\ml-1m.trainval.rating', index = False)
    trainVal = tp.train
    tp.test.to_csv(r'C:\Users\fleur\Thesis B3\RQ0\RobustnessOfMetaMF-master\RobustnessOfMetaMF-master\ThesisData\ml-1m\ml-1m\ml-1m.test.rating', index = False)
    test = tp.test
    
print(len(test))
print(len(trainVal))

         user  item  rating
0           1     1     5.0
1           1     2     3.0
2           1     3     3.0
3           1     4     4.0
4           1     5     5.0
...       ...   ...     ...
1000204  6040   773     1.0
1000205  6040  1107     5.0
1000206  6040   366     5.0
1000207  6040   153     4.0
1000208  6040    27     4.0

[1000209 rows x 3 columns]
99950
900259


In [12]:
for i, tp in enumerate(xf.partition_users(trainVal, 1, xf.SampleFrac(0.1))):
    tp.train.to_csv(r'C:\Users\fleur\Thesis B3\RQ0\RobustnessOfMetaMF-master\RobustnessOfMetaMF-master\ThesisData\ml-1m\ml-1m\ml-1m.train.rating', index = False)
    train = tp.train
    tp.test.to_csv(r'C:\Users\fleur\Thesis B3\RQ0\RobustnessOfMetaMF-master\RobustnessOfMetaMF-master\ThesisData\ml-1m\ml-1m\ml-1m.valid.rating', index = False)
    val = tp.test
    
print(len(val))
print(len(train))
print(len(train)+len(val))

89976
810283
900259


### Splitting the data into test, validation and training sets *based on time*

In [4]:
import lenskit.crossfold as xf

m = xf.LastFrac(0.1, col='timestamp')
print(len(m))

TypeError: object of type 'LastFrac' has no len()

In [52]:
import lenskit.crossfold as xf

print(data)

# Decrease user_id and item_id by 1
data['user']-=1
data['item']-=1

for i, tp in enumerate(xf.partition_users(data, 1, xf.LastFrac(0.1, col='timestamp'))):
    tp.train.to_csv(r'C:\Users\fleur\Thesis B3\RQ0\RobustnessOfMetaMF-master\RobustnessOfMetaMF-master\ThesisData\ml-latest-small\ml-latest-small_time\ml-latest-small.trainval.rating', index = False)
    trainVal = tp.train
    tp.test.to_csv(r'C:\Users\fleur\Thesis B3\RQ0\RobustnessOfMetaMF-master\RobustnessOfMetaMF-master\ThesisData\ml-latest-small\ml-latest-small_time\ml-latest-small.test.rating', index = False)
    test = tp.test
    
print(len(test))
print(len(trainVal))

        user  item  rating   timestamp
0          1     1     4.0   964982703
1          1     2     4.0   964981247
2          1     3     4.0   964982224
3          1     4     5.0   964983815
4          1     5     5.0   964982931
...      ...   ...     ...         ...
100831   610  3121     4.0  1493848402
100832   610  2036     5.0  1493850091
100833   610  3122     5.0  1494273047
100834   610  1393     5.0  1493846352
100835   610  2874     3.0  1493846415

[100836 rows x 4 columns]
10093
90743


In [53]:
for i, tp in enumerate(xf.partition_users(trainVal, 1, xf.LastFrac(0.1, col='timestamp'))):
    tp.train.to_csv(r'C:\Users\fleur\Thesis B3\RQ0\RobustnessOfMetaMF-master\RobustnessOfMetaMF-master\ThesisData\ml-latest-small\ml-latest-small_time\ml-latest-small.train.rating', index = False)
    train = tp.train
    tp.test.to_csv(r'C:\Users\fleur\Thesis B3\RQ0\RobustnessOfMetaMF-master\RobustnessOfMetaMF-master\ThesisData\ml-latest-small\ml-latest-small_time\ml-latest-small.valid.rating', index = False)
    val = tp.test
    
print(len(val))
print(len(train))
print(len(train)+len(val))

9071
81672
90743


#### ^ After this step, delete the item,user,rating line from the data manually

### Making an itemlist 

In [54]:
items = data.copy()
print(len(items))

items.drop(['user', 'rating', 'timestamp'], axis=1, inplace=True)
sort = items.sort_values(items.columns[0], ascending = True)
itemlist = sort.drop_duplicates(subset = ['item'], keep = 'first')

print(len(itemlist))

itemlist.to_csv(r'C:\Users\fleur\Thesis B3\RQ0\RobustnessOfMetaMF-master\RobustnessOfMetaMF-master\ThesisData\ml-latest-small\ml-latest-small_time\ml-latest-small.itemlist', header=None, index=None, sep=' ', mode='w')

100836
9724


### Making a userlist

In [55]:
users = data.copy()
print(len(users))

users.drop(['item', 'rating', 'timestamp'], axis=1, inplace=True)
sort = users.sort_values(users.columns[0], ascending = True)
userlist = sort.drop_duplicates(subset = ['user'], keep = 'first')

print(len(userlist))

userlist.to_csv(r'C:\Users\fleur\Thesis B3\RQ0\RobustnessOfMetaMF-master\RobustnessOfMetaMF-master\ThesisData\ml-latest-small\ml-latest-small_time\ml-latest-small.userlist', header=None, index=None, sep=' ', mode='w')

100836
610
