In [6]:
import torch
import pickle
import numpy as np
import torch.nn as nn
import torch.optim as optim
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder 
from sklearn.metrics import accuracy_score

import json

from sentence_transformers import SentenceTransformer
# Load the 'all-MiniLM-L6-v2' SentenceTransformer; its encode() method is
# called further down on the review texts.
model = SentenceTransformer('all-MiniLM-L6-v2')

ModuleNotFoundError: No module named 'torch'

In [2]:
business_file = "yelp_dataset/yelp_academic_dataset_business.json"

# business_id -> raw 'categories' string, for every business whose
# 'categories' field is not null. Each line of the file is parsed as one
# JSON object.
categories = {}
with open(business_file, mode='r', encoding='utf-8') as f:
    for business in map(json.loads, f):
        if business['categories'] is None:
            continue
        categories[business['business_id']] = business['categories']

In [3]:
reviews_file = "yelp_dataset/yelp_academic_dataset_review.json"

# business_id -> review text. The dict is keyed by business only, so when a
# business appears on several lines each one overwrites the previous and just
# the last text read is kept.
reviews = {}
with open(reviews_file, mode='r', encoding='utf-8') as f:
    for review in map(json.loads, f):
        reviews[review['business_id']] = review['text']

In [4]:
# category name -> list of review texts of the businesses tagged with it.
pred_dict = {}

for biz_id, cats in categories.items():
    text = reviews.get(biz_id)
    if text is None:
        # No review was loaded for this business, so it has nothing to
        # contribute (a plain reviews[biz_id] would raise KeyError here).
        continue
    for c in cats.split(','):
        # split(',') keeps the whitespace around each comma; without strip()
        # ' Food' and 'Food' would end up as two different categories.
        c = c.strip()
        if not c:
            continue
        pred_dict.setdefault(c, []).append(text)

In [17]:
# pred_dict['Bubble Tea']

In [8]:
# embeddings = model.encode(pred_dict['Bubble Tea'])
# Y: category names, in pred_dict order.
# X: for each category, the result of encoding its list of review texts,
#    aligned index-for-index with Y.
Y = list(pred_dict)
X = [model.encode(texts) for texts in pred_dict.values()]

# #Print the embeddings
# for sentence, embedding in zip(pred_dict['Bubble Tea'], embeddings):
#     print("Sentence:", sentence)
#     print("Embedding:", embedding)
#     print("")

In [3]:
# Switches for the on-disk cache of the encoded reviews (X) and their
# category names (Y). As configured, nothing is written and both are read.
SAVE_CACHE = False
LOAD_CACHE = True

if SAVE_CACHE:
    with open('Sentences.pkl', 'wb') as f:
        pickle.dump(X, f)

    with open('Labels.pkl', 'wb') as r:
        pickle.dump(Y, r)

if LOAD_CACHE:
    # NOTE: unpickling can execute arbitrary code; only load files that were
    # written by this notebook.
    with open('Sentences.pkl', 'rb') as f:
        X = pickle.load(f)
    with open('Labels.pkl', 'rb') as r:
        Y = pickle.load(r)

In [5]:
# Flatten the per-category groups into two parallel lists: one vector per
# review in X_updated and, at the same position, its category in Y_updated.
place_holder = zip(X, Y)
X_updated, Y_updated = [], []

for sentences, label in place_holder:
    rows = [np.array(sentence) for sentence in sentences]
    X_updated.extend(rows)
    Y_updated.extend([label] * len(rows))



In [6]:
# Stack the per-review vectors into a single 2-D array (one row per review)
# and show its shape as a sanity check.
features = np.array(X_updated)
features.shape

(668592, 384)

In [7]:
# Category label for each row of `features` (Y_updated was filled in lockstep
# with X_updated); show its shape as a sanity check.
labels = np.array(Y_updated)
labels.shape

(668592,)

In [143]:
# NOTE(review): the output recorded under this cell, (668592, 2454), does not
# match the 1-D shape shown by the previous cell; its execution count (143)
# suggests it was run later, after `labels` had been one-hot encoded below.
labels.shape

(668592, 2454)

In [8]:
# One-hot encoder for the category labels, created with default settings.
enc = OneHotEncoder()

In [9]:
# The encoder is fitted on, and then applied to, the labels presented as a
# single column.
# NOTE: this rebinds `labels` from the 1-D label array to the encoded matrix,
# so re-running only this cell would operate on the already-encoded data.
label_column = labels.reshape(-1, 1)
labels = enc.fit(label_column).transform(label_column)

In [10]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split repeatable between runs.
split = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split

In [142]:
# Display the training targets to check their type and shape.
y_train

<534873x2454 sparse matrix of type '<class 'numpy.float64'>'
	with 534873 stored elements in Compressed Sparse Row format>

In [11]:
X_train = torch.tensor(X_train, dtype=torch.float32).to(torch.int64)
X_test = torch.tensor(X_test, dtype=torch.float32).to(torch.int64)
y_train = torch.tensor(y_train.todense(), dtype=torch.long)
y_test = torch.tensor(y_test.todense(), dtype=torch.long)

In [12]:
class CategoryPredictor(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(CategoryPredictor, self).__init__()
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.fc1 = nn.Linear(hidden_size, 128)
        self.fc2 = nn.Linear(128, output_size)
        self.relu = nn.ReLU()
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, x):
        x = self.embedding(x)
        x = torch.mean(x, dim=1)  # Average pooling over the sequence length
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        x = self.softmax(x)
        return x

In [13]:
class run_model:
    def __init__(self, input_size, hidden_size, output_size):
        self.model = CategoryPredictor(input_size, hidden_size, output_size)
        self.criterion = nn.CrossEntropyLoss()
        self.optimizer = optim.Adam(self.model.parameters(), lr=0.001)
        

    def run(self, epochs, X_train_tfidf, y_train):
        num_epochs = epochs
        for epoch in range(num_epochs):
            outputs = self.model(X_train_tfidf)
            loss = self.criterion(outputs, y_train)
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
        print("Done")

    def evaluate(self, X_test_tfidf, y_test):
        with torch.no_grad():
            self.model.eval()
            predictions = self.model(X_test_tfidf)
            _, predicted_labels = torch.max(predictions, 1)
            return accuracy_score(y_test, predicted_labels.numpy())

In [None]:
input_size = X_train.shape[0]
hidden_size = 50 
output_size = labels.shape[1]

model = run_model(input_size, hidden_size, output_size)
model.run(10, X_train, y_train)
accuracy = model.evaluate(X_test, y_test)