In [64]:
import torch
import pickle
import numpy as np
import torch.nn as nn
import torch.optim as optim
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder 
from sklearn.metrics import accuracy_score, zero_one_loss
from torch.utils.data import DataLoader, TensorDataset

import json

from sentence_transformers import SentenceTransformer
transformer = SentenceTransformer('all-MiniLM-L6-v2')

In [27]:
business_file = "yelp_dataset/yelp_academic_dataset_business.json"

categories = {}

# Get busines_id and categories
with open(business_file, 'r', encoding='utf-8') as f:
    for line in f:
        # Load JSON data
        business = json.loads(line)
        
        if business['categories'] is not None:
            categories[business['business_id']] = business['categories']

In [28]:
reviews_file = "yelp_dataset/yelp_academic_dataset_review.json"

reviews = {}

# Get business_id and reviews
with open(reviews_file, 'r', encoding='utf-8') as f:
    for line in f:
        # Load JSON data
        review = json.loads(line)
        
        reviews[review['business_id']] = review['text'] # normalizing

In [29]:
pred_dict = {}

for biz_id, cats in categories.items():
    for c in cats.split(','):
        if c not in pred_dict.keys():
            pred_dict[c] = []
        pred_dict[c].append(reviews[biz_id])

In [30]:
for x, reviews in pred_dict.items():
    lst = []
    for review in reviews:
        for y in review.split('.'):
            if '!' in y:
                for xm in y.split('!'):
                    lst.append(xm.strip())
                continue
            lst.append(y.strip())
        pred_dict[x] = lst

In [31]:
len(pred_dict)

2454

Decrease the reviews per category by 50%, if one left, delete category. Also decrease categories by 50%

In [32]:
c = int(len(pred_dict.keys()) * 0.5)
to_delete = list(pred_dict.keys())[0: c]

for k in to_delete:
    pred_dict.pop(k)

In [33]:
len(pred_dict)

1227

In [34]:
# temp = pred_dict.copy();
# for category, reviews in temp.items():
#     n = len(reviews)
#     if n== 3:
#         pred_dict.pop(category)
#     pred_dict[category] = reviews[0: int(n/2)]

In [37]:
# embeddings = model.encode(pred_dict['Bubble Tea'])
X = []
Y = []

for x, y in pred_dict.items():
    X.append(model.encode(y))
    Y.append(x)

In [38]:

# #Print the embeddings
# for sentence, embedding in zip(pred_dict['Bubble Tea'], embeddings):
#     print("Sentence:", sentence)
#     print("Embedding:", embedding)
#     print("")

The data has already been processed into a pkl file for ease:

In [39]:
if True:
    with open('Sentences.pkl', 'wb') as f:
        pickle.dump(X, f)
    
    with open('Labels.pkl', 'wb') as r:
        pickle.dump(Y, r)

if True:
    with open('Sentences.pkl', 'rb') as f:
        X = pickle.load(f)
    with open('Labels.pkl', 'rb') as r:
        Y = pickle.load(r)

In [40]:
place_holder = zip(X, Y)
X_updated = []
Y_updated = []

for sentences, label in place_holder:
    for sentence in sentences:
        s = np.array(sentence)
        X_updated.append(s)
        Y_updated.append(label)

In [41]:
features = np.array(X_updated)
features.shape

(234553, 384)

In [47]:
labels = np.array(Y_updated)
labels.shape

(234553,)

In [48]:
enc = OneHotEncoder()

In [49]:
enc.fit(labels.reshape(-1, 1))
labels = enc.transform(labels.reshape(-1, 1))

In [50]:
labels.shape

(234553, 1227)

In [51]:
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

In [52]:
y_train

<187642x1227 sparse matrix of type '<class 'numpy.float64'>'
	with 187642 stored elements in Compressed Sparse Row format>

In [53]:
X_train = torch.tensor(X_train, dtype=torch.float32).to(torch.int64)
X_test = torch.tensor(X_test, dtype=torch.float32).to(torch.int64)
y_train = torch.tensor(y_train.todense(), dtype=torch.float32)
y_test = torch.tensor(y_test.todense(), dtype=torch.float32)

In [67]:
train_dataset = TensorDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, num_workers=4)
test_dataset = TensorDataset(X_test, y_test)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=True, num_workers=4)

In [68]:
class CategoryPredictor(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(CategoryPredictor, self).__init__()
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.fc1 = nn.Linear(hidden_size, 128)
        self.fc2 = nn.Linear(128, output_size)
        self.relu = nn.ReLU()
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, x):
        x = self.embedding(x)
        x = torch.mean(x, dim=1)  # Average pooling over the sequence length
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        x = self.softmax(x)
        return x

In [69]:
class run_model:
    def __init__(self, input_size, hidden_size, output_size, train_loader, test_loader):
        self.model = CategoryPredictor(input_size, hidden_size, output_size)
        self.criterion = nn.CrossEntropyLoss()
        self.optimizer = optim.Adam(self.model.parameters(), lr=0.001)
        self.train_loader = train_loader
        self.test_loader = test_loader


    def run(self, epochs):
        num_epochs = epochs
        for epoch in range(num_epochs):
            print("Start:", epoch)
            for small_x, small_y in self.train_loader:
                self.optimizer.zero_grad()
                # print("Start Forward Pass:", epoch)
                outputs = self.model(small_x)
                # print("Finished Forward Pass:", epoch)
                loss = self.criterion(outputs, torch.argmax(small_y, dim=1))
                # print("Starting Backpropagation:", epoch)
                loss.backward()
                # print("Finished Backpropagation:", epoch)
                self.optimizer.step()
            print("Epoch Fin:", epoch)

        print("Done")

    def evaluate(self, X_test_tfidf, y_test):
        with torch.no_grad():
            self.model.eval()
            predictions = self.model(X_test_tfidf)
            _, predicted_labels = torch.max(predictions, 1)
            return zero_one_loss(torch.argmax(y_test, dim=1), predicted_labels.numpy())

In [70]:
input_size = X_train.shape[0]
hidden_size = 50 
output_size = labels.shape[1]

model = run_model(input_size, hidden_size, output_size, train_loader, test_loader)

In [71]:
model.run(5)

Start: 0
Start Forward Pass: 0
Finished Forward Pass: 0
Starting Backpropagation: 0
Finished Backpropagation: 0
Start Forward Pass: 0
Finished Forward Pass: 0
Starting Backpropagation: 0
Finished Backpropagation: 0
Start Forward Pass: 0
Finished Forward Pass: 0
Starting Backpropagation: 0
Finished Backpropagation: 0
Start Forward Pass: 0
Finished Forward Pass: 0
Starting Backpropagation: 0
Finished Backpropagation: 0
Start Forward Pass: 0
Finished Forward Pass: 0
Starting Backpropagation: 0
Finished Backpropagation: 0
Start Forward Pass: 0
Finished Forward Pass: 0
Starting Backpropagation: 0
Finished Backpropagation: 0
Start Forward Pass: 0
Finished Forward Pass: 0
Starting Backpropagation: 0
Finished Backpropagation: 0
Start Forward Pass: 0
Finished Forward Pass: 0
Starting Backpropagation: 0
Finished Backpropagation: 0
Start Forward Pass: 0
Finished Forward Pass: 0
Starting Backpropagation: 0
Finished Backpropagation: 0
Start Forward Pass: 0
Finished Forward Pass: 0
Starting Backprop

In [74]:
error = model.evaluate(X_test, y_test)

In [76]:
1 - error

0.010487945258041864