In [1]:
from google.colab import drive
drive.mount('/content/drive')
path = "drive/MyDrive/Projet/"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
import zipfile
import ast

#df_interactions_test = pd.read_csv(path + "interactions_test.csv")
df_interactions_train = pd.read_csv(path + "interactions_train.csv.zip")
#df_RAW_interactions = pd.read_csv(path + "RAW_interactions.csv.zip")
#df_RAW_recipes = pd.read_csv(path + "RAW_recipes.csv.zip")
#print(df_RAW_interactions.shape)
#print(df_RAW_recipes.shape)
print(df_interactions_train.shape)


(698901, 6)


In [3]:
df_interactions_train.head()

Unnamed: 0,user_id,recipe_id,date,rating,u,i
0,2046,4684,2000-02-25,5.0,22095,44367
1,2046,517,2000-02-25,5.0,22095,87844
2,1773,7435,2000-03-13,5.0,24732,138181
3,1773,278,2000-03-13,4.0,24732,93054
4,2046,3431,2000-04-07,5.0,22095,101723


In [4]:
df_interactions_train

Unnamed: 0,user_id,recipe_id,date,rating,u,i
0,2046,4684,2000-02-25,5.0,22095,44367
1,2046,517,2000-02-25,5.0,22095,87844
2,1773,7435,2000-03-13,5.0,24732,138181
3,1773,278,2000-03-13,4.0,24732,93054
4,2046,3431,2000-04-07,5.0,22095,101723
...,...,...,...,...,...,...
698896,926904,457971,2018-12-18,5.0,13681,141067
698897,2002312797,27208,2018-12-18,5.0,14897,99787
698898,1290903,131607,2018-12-18,5.0,11605,76163
698899,226867,363072,2018-12-18,5.0,3604,29101


In [5]:
# On garde les 100 recettes qui ont le plus de vote
df_interactions_train["count"] = df_interactions_train.groupby("recipe_id").transform('count')['user_id']
recipeId = df_interactions_train.drop_duplicates('recipe_id').sort_values(
    'count', ascending=False).iloc[:100]['recipe_id']
df_interactions_train = df_interactions_train[df_interactions_train['recipe_id'].isin(recipeId)].reset_index(drop=True)

# On garde les 100 utilisateurs qui ont le plus voté
df_interactions_train["count"] = df_interactions_train.groupby("user_id").transform('count')['recipe_id']
userId = df_interactions_train.drop_duplicates('user_id').sort_values(
    'count', ascending=False).iloc[:20001]['user_id']
df_interactions_train = df_interactions_train[df_interactions_train['user_id'].isin(userId)].reset_index(drop=True)

In [31]:
testset = df_interactions_train[-10:]#.sample(frac=0.1, replace=False)
trainset = df_interactions_train[:-10]#[~df_interactions_train.index.isin(testset.index)]

In [32]:
testset

Unnamed: 0,user_id,recipe_id,date,rating,u,i,count
35833,374891,37336,2018-10-23,5.0,6273,138882,2
35834,2001346577,27208,2018-10-25,5.0,5202,99787,2
35835,2001402443,34382,2018-10-29,5.0,6274,89924,1
35836,377929,67256,2018-11-02,5.0,3876,55772,6
35837,928171,25885,2018-11-03,5.0,14204,56425,6
35838,7173268,32844,2018-11-29,5.0,2714,171328,2
35839,2001158027,46877,2018-12-01,5.0,10601,114402,3
35840,315055,69173,2018-12-09,5.0,863,147374,27
35841,1623620,150384,2018-12-12,5.0,9723,6486,5
35842,2002312797,27208,2018-12-18,5.0,14897,99787,1


In [33]:
import torch
from torch.utils.data import Dataset, DataLoader

user_list = df_interactions_train.user_id.unique()
item_list = df_interactions_train.recipe_id.unique()
user2id = {w: i for i, w in enumerate(user_list)}
item2id = {w: i for i, w in enumerate(item_list)}

class Ratings_Datset(Dataset):
    def __init__(self, df):
        self.df = df.reset_index()

    def __len__(self):
        return len(self.df)
  
    def __getitem__(self, idx):
        user = user2id[self.df['user_id'][idx]]
        user = torch.tensor(user, dtype=torch.long)
        item = item2id[self.df['recipe_id'][idx]]
        item = torch.tensor(item, dtype=torch.long)
        rating = torch.tensor(self.df['rating'][idx], dtype=torch.float)
        return user, item, rating


trainloader = DataLoader(Ratings_Datset(trainset), batch_size=512, shuffle=True ,num_workers=2)
testloader = DataLoader(Ratings_Datset(testset), batch_size=64, num_workers=2)

In [34]:
from tqdm.notebook import tqdm
import torch
import torch.nn as nn

class NCF(nn.Module):
        
    def __init__(self, n_users, n_items, n_factors=8):
        super().__init__()
        self.user_embeddings = torch.nn.Embedding(n_users, n_factors)
        self.item_embeddings = torch.nn.Embedding(n_items, n_factors)
        self.predictor = torch.nn.Sequential(
            nn.Linear(in_features=n_factors*2, out_features=64),
            nn.Linear(in_features=64, out_features=32),
            nn.Linear(in_features=32, out_features=1),
            nn.Sigmoid()
        )
        
        
    def forward(self, user, item):
        

        u = self.user_embeddings(user)
        i = self.item_embeddings(item)

        # Concat the two embedding layers
        z = torch.cat([u, i], dim=-1)
        return self.predictor(z)

In [35]:
from tqdm.notebook import tqdm
import torch
import torch.nn as nn
from statistics import mean


def train(model, optimizer, trainloader, epochs=30):
    criterion = nn.MSELoss(reduction='mean')
    t = tqdm(range(epochs))
    for epoch in t:
        corrects = 0
        total = 0
        train_loss = []
        for users, items, r in trainloader:
            users = users.cuda()
            items = items.cuda()
            r = r.cuda() / 5
            y_hat = model(users, items)
            loss = criterion(y_hat, r.unsqueeze(1).float())
            train_loss.append(loss.item())
            total += r.size(0)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            t.set_description(f"loss: {mean(train_loss)}")

In [36]:
n_user = df_interactions_train.user_id.nunique()
n_items = df_interactions_train.recipe_id.nunique()
model = NCF(n_user, n_items).cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
train(model, optimizer, trainloader, epochs=5)

  0%|          | 0/5 [00:00<?, ?it/s]

In [37]:
import math
import tensorflow as tf

def test(model, testloader, m_eval=False):
    running_mae = 0
    with torch.no_grad():
        corrects = 0
        total = 0
        for users, items, r in testloader:
            users = users.cuda()
            items = items.cuda()
            y = r.cuda() / 5
            y_hat = model(users, items).flatten()
            error = torch.abs(y_hat - y).sum().data
            
            running_mae += error
            total += y.size(0)
    
    mae = running_mae/total
    return mae * 5

In [38]:
for key, value in user2id.items():
  if key == 363891:
    print('ok')

In [39]:
# Il faut changer user2id et item2id pour que ca marche pour l'ensemble test
test(model, testloader)

tensor(0.4075, device='cuda:0')

In [42]:
users, recipes, r = next(iter(testloader))
users = users.cuda()
recipes = recipes.cuda()
r = r.cuda()

y = model(users, recipes)*5
print("ratings:", r[:10].data)
print("predictions:", y.flatten()[:10].data)

ratings: tensor([5., 5., 5., 5., 5., 5., 5., 5., 5., 5.], device='cuda:0')
predictions: tensor([4.5999, 4.4783, 4.5551, 4.6662, 4.6632, 4.6742, 4.5569, 4.5954, 4.5774,
        4.5582], device='cuda:0')


In [30]:
torch.save(model.state_dict(),'/content/weights.pth')