In [125]:
import torch
import torch_geometric
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv, Linear
from torch_geometric.nn import global_mean_pool
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import torch.nn.functional as F
from sklearn.feature_extraction.text import CountVectorizer

In [126]:
class GNN(torch.nn.Module):
    def __init__(self, num_features, hidden_channels):
        super(GNN, self).__init__()
        self.conv1 = GCNConv(num_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.conv3 = GCNConv(hidden_channels, hidden_channels)
        self.linear = Linear(hidden_channels, num_features)

    def forward(self, x, edge_index):
        x = F.leaky_relu(self.conv1(x, edge_index))
        x = F.leaky_relu(self.conv2(x, edge_index))
        x = self.conv3(x, edge_index)
        x = self.linear(x)
        return x

In [140]:
def preprocess_data(user_likes_df, all_restaurants_df, preferred_min_stars):
    all_data = pd.concat([user_likes_df, all_restaurants_df], ignore_index=True)
    all_data['star_diff'] = all_data['Star'] - preferred_min_stars
    all_data.drop_duplicates()
    price_dict = {'$': 1, '$$': 2, '$$$': 3, '$$$$': 4, 'n/a': 0}
    all_data['Price'] = all_data['Price'].map(price_dict)
    
    le_area = LabelEncoder()
    all_data['Area_encoded'] = le_area.fit_transform(all_data['Area'])
    
    category_vectorizer = CountVectorizer(tokenizer=lambda x: x.split(','), lowercase=False, token_pattern=None)
    category_encoded = category_vectorizer.fit_transform(all_data['Category'])
    category_df = pd.DataFrame(category_encoded.toarray(), columns=category_vectorizer.get_feature_names_out())
    
    service_vectorizer = CountVectorizer(tokenizer=lambda x: x.split(','), lowercase=False, token_pattern=None)
    service_encoded = service_vectorizer.fit_transform(all_data['Services'])
    service_df = pd.DataFrame(service_encoded.toarray(), columns=service_vectorizer.get_feature_names_out())
    
    scaler = MinMaxScaler()
    all_data[['Star_normalized', 'Price_normalized', 'star_diff_normalized']] = scaler.fit_transform(all_data[['Star', 'Price', 'star_diff']])
    
    feature_df = pd.concat([
        all_data[['Star_normalized', 'Price_normalized', 'star_diff_normalized', 'Area_encoded']],
        category_df,
        service_df
    ], axis=1)
    
    features = feature_df.values
    
    return torch.FloatTensor(features), all_data


In [141]:
def create_graph(features, user_likes_df, all_restaurants_df):
    num_user_likes = len(user_likes_df)
    num_total = len(features)
    
    edge_index = []
    for i in range(num_user_likes):
        for j in range(num_user_likes, num_total):
            edge_index.append([i, j])
            edge_index.append([j, i])
    
    edge_index = torch.LongTensor(edge_index).t().contiguous()
    
    return Data(x=features, edge_index=edge_index)

In [142]:
def train_model(model, graph_data, num_epochs=100):
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    criterion = torch.nn.MSELoss()

    model.train()
    for epoch in range(num_epochs):
        optimizer.zero_grad()
        out = model(graph_data.x, graph_data.edge_index)
        loss = criterion(out, graph_data.x)
        loss.backward()
        optimizer.step()
        
        if (epoch + 1) % 10 == 0:
            print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item():.4f}')

In [151]:
def get_recommendations(model, graph_data, user_likes_df, all_restaurants_df, top_k=100):
    model.eval()
    with torch.no_grad():
        node_embeddings = model(graph_data.x, graph_data.edge_index)
    
    user_embeddings = node_embeddings[:len(user_likes_df)]
    restaurant_embeddings = node_embeddings[len(user_likes_df):]
    
    similarities = torch.mm(user_embeddings.mean(dim=0).unsqueeze(0), restaurant_embeddings.t())
    
    #_, indices = similarities.topk(min(len(all_restaurants_df), similarities.size(1)))
    _, indices = similarities.topk(top_k)
    print(len(indices))
    
    final_indices = []
    seen_restaurants = set(user_likes_df['Name'])
    
    for idx in indices.squeeze():
        restaurant_name = all_restaurants_df.iloc[idx.item()]['Name']
        if restaurant_name not in seen_restaurants:
            final_indices.append(idx.item())
            if len(final_indices) == top_k:
                break
    
    return all_restaurants_df.iloc[final_indices]

In [152]:
preferred_min_stars = 4.0
data_file = 'Restaurants_Seattle.csv'
user_data_file = 'Sample_User.xlsx'

user_data = pd.read_excel(user_data_file)
data = pd.read_csv(data_file)

features, processed_data = preprocess_data(user_data, data, preferred_min_stars)
graph_data = create_graph(features, user_data, processed_data)

model = GNN(num_features=features.shape[1], hidden_channels=64)
train_model(model, graph_data)

recommendations = get_recommendations(model, graph_data, user_data, processed_data, top_k=5)
display(recommendations)


Epoch 10/100, Loss: nan
Epoch 20/100, Loss: nan
Epoch 30/100, Loss: nan
Epoch 40/100, Loss: nan
Epoch 50/100, Loss: nan
Epoch 60/100, Loss: nan
Epoch 70/100, Loss: nan
Epoch 80/100, Loss: nan
Epoch 90/100, Loss: nan
Epoch 100/100, Loss: nan
1


Unnamed: 0,Name,Star,Stars_count,Price,Area,Category,Services,Searched City,star_diff,Area_encoded,Star_normalized,Price_normalized,star_diff_normalized


In [None]:
# add method to generate multiple restaurants quickly on same model