In [None]:
import csv
import json
import os
import re

import nltk
import pandas as pd
import tensorflow as tf
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sqlalchemy import create_engine, Column, Integer, String, Float
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.layers import Input, Dense, Dropout, GlobalAveragePooling1D, Lambda
from tensorflow.keras.models import Model
from transformers import BertTokenizer
from transformers import TFBertModel

In [None]:
# Paths of the data files this notebook reads and writes (relative to the working directory).
filename_pkl = 'restaurants_reviews.pkl'  # pickled restaurant-review DataFrame (written and re-read below)
filename_tskv = 'geo-reviews-dataset-2023.tskv'  # raw input passed to tsv2json
filename_json = 'geo-reviews-dataset-2023.json'  # JSON output of tsv2json, loaded with pandas

In [None]:
def tsv2json(input_file, output_file):
    """Convert a TSKV file (tab-separated ``key=value`` fields) into a JSON array.

    Every non-blank input line becomes one JSON object whose keys and values
    are the whitespace-stripped parts of each field, split on the first ``=``.
    Blank lines and empty fields (caused by trailing or doubled tabs) are
    skipped instead of aborting the whole conversion.

    Args:
        input_file: Path of the UTF-8 encoded TSKV file to read.
        output_file: Path of the JSON file to write; overwritten if it exists.

    Raises:
        ValueError: If a non-empty field contains no ``=`` separator. The
            message names the offending line number and field.
    """
    records = []

    with open(input_file, 'r', encoding='utf-8') as src:
        for line_no, line in enumerate(src, start=1):
            # A blank line (typically at end of file) carries no record.
            if not line.strip():
                continue

            record = {}
            for field in line.rstrip('\r\n').split('\t'):
                # Trailing or doubled tabs yield empty fields; ignore them.
                if not field.strip():
                    continue
                # Split on the first '=' only, so values may contain '='.
                key, sep, value = field.partition('=')
                if not sep:
                    raise ValueError(
                        f"{input_file}, line {line_no}: field {field!r} has no '=' separator"
                    )
                record[key.strip()] = value.strip()
            records.append(record)

    with open(output_file, 'w', encoding='utf-8') as dst:
        # ensure_ascii=False keeps non-ASCII text readable in the output file.
        json.dump(records, dst, ensure_ascii=False, indent=4)

In [None]:
# Convert the raw TSKV dump to JSON; the output file is rewritten on every run of this cell.
tsv2json(filename_tskv,filename_json)

In [None]:
# Load the converted dataset.
df = pd.read_json(filename_json, encoding='utf-8')

# Keep only rows whose 'rubrics' value mentions restaurants (case-insensitive).
# na=False: a missing 'rubrics' value counts as "no match"; without it the mask
# would contain NaN and boolean indexing would raise.
# .copy(): later cells add and overwrite columns on this frame, so it must be an
# independent DataFrame rather than a slice of `df`.
is_restaurant = df['rubrics'].str.contains('Ресторан', case=False, na=False)
df_restaurants = df[is_restaurant].copy()

In [None]:
def clean_text(text):
    """Normalise one raw review string.

    Processing steps, in order:
      1. strip HTML markup with BeautifulSoup's ``html.parser``;
      2. delete every character that is neither a word character nor whitespace;
      3. lower-case the result.

    Args:
        text: Raw review text. ``None`` and NaN (a missing review) are treated
            as empty text instead of raising inside the HTML parser.

    Returns:
        The cleaned, lower-cased string ('' for a missing value).
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    without_markup = BeautifulSoup(text, "html.parser").get_text()
    without_punctuation = re.sub(r'[^\w\s]', '', without_markup)
    return without_punctuation.lower()



In [None]:
# Clean every review, then keep the first row for each distinct cleaned text.
# assign()/drop_duplicates() return a new frame instead of mutating in place:
# the cell no longer writes into a filtered slice of `df` (SettingWithCopyWarning)
# and gives the same result when it is re-run.
df_restaurants = (
    df_restaurants
    .assign(text=df_restaurants['text'].apply(clean_text))
    .drop_duplicates(subset=['text'])
)

# Persist the cleaned frame so later cells can restart from it.
df_restaurants.to_pickle(filename_pkl)


In [None]:
# Download NLTK's 'punkt' resource before tokenising.
# NOTE(review): presumably required by word_tokenize; newer NLTK releases may
# ask for 'punkt_tab' instead - confirm against the installed version.
nltk.download('punkt')


def tokenize_text(text):
    """Return the word tokens that NLTK's ``word_tokenize`` produces for ``text``."""
    words = word_tokenize(text)
    return words


# Replace the existing index with a fresh 0..n-1 range (in place).
df_restaurants.reset_index(drop=True, inplace=True)
df_restaurants['tokens'] = df_restaurants['text'].apply(tokenize_text)

# Show the first rows of the cleaned text next to their tokens.
preview = df_restaurants[['text', 'tokens']].head()
print(preview)

In [None]:
def rating_to_label(rating):
    """Map a numeric rating onto a binary label.

    Args:
        rating: Numeric rating of a review.

    Returns:
        1 when ``rating >= 4``, 0 when ``rating <= 2``, and ``None`` for
        anything else (ratings strictly between 2 and 4, or NaN).
    """
    if rating >= 4:
        return 1
    if rating <= 2:
        return 0
    # Neither bound matched: the rating carries no label.
    return None
# Attach a label to every review, drop the rows whose rating maps to no label,
# and overwrite the pickle with the labelled frame.
labelled = df_restaurants.assign(label=df_restaurants['rating'].apply(rating_to_label))
df_restaurants = labelled.dropna(subset=['label'])
df_restaurants.to_pickle(filename_pkl)

In [None]:
# Reload the frame written by the earlier to_pickle() calls and display it.
# NOTE(review): unpickling can execute arbitrary code - only load files this notebook produced.
df_restaurants = pd.read_pickle(filename_pkl)
df_restaurants

In [None]:
texts = df_restaurants['text'].tolist()
labels = df_restaurants['label'].tolist()

# NOTE(review): 'bert-base-uncased' is an English checkpoint, while the reviews
# appear to be Russian (the rubric filter matches 'Ресторан'). A multilingual
# checkpoint may tokenise them better - confirm. It is not changed here because
# the model cells load the same checkpoint name.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# padding='max_length' pads every sequence to exactly 100 tokens, matching the
# fixed (100,) shape declared by the model's Input layers. padding=True would
# only pad to the longest sequence present, which can be shorter than 100.
tokens = tokenizer(texts, padding='max_length', truncation=True, max_length=100, return_tensors="tf")
input_ids = tokens['input_ids']
attention_masks = tokens['attention_mask']

# train_test_split works on array-likes, so convert the TF tensors to NumPy first.
input_ids_np = input_ids.numpy()
attention_masks_np = attention_masks.numpy()

# Split ids, masks and labels in a single call so the three stay row-aligned by
# construction instead of relying on two calls sharing the same random_state.
(train_inputs, test_inputs,
 train_masks, test_masks,
 train_labels, test_labels) = train_test_split(
    input_ids_np, attention_masks_np, labels, test_size=0.2, random_state=42)

# Back to tensors for Keras.
train_inputs = tf.convert_to_tensor(train_inputs)
test_inputs = tf.convert_to_tensor(test_inputs)
train_masks = tf.convert_to_tensor(train_masks)
test_masks = tf.convert_to_tensor(test_masks)
train_labels = tf.convert_to_tensor(train_labels)
test_labels = tf.convert_to_tensor(test_labels)


In [None]:
# Pretrained BERT encoder; bert_layer() below reads it from module scope.
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

# Symbolic model inputs: fixed-length (100) int32 sequences of token ids and
# their attention mask. These names replace any earlier `input_ids` variable.
input_ids = Input(shape=(100,), dtype=tf.int32, name='input_ids')
attention_mask = Input(shape=(100,), dtype=tf.int32, name='attention_mask')

def bert_layer(inputs):
    """Run the module-level ``bert_model`` on an ``[input_ids, attention_mask]`` pair.

    Returns element 0 of the model output - presumably the per-token hidden
    states, (batch, 100, 768) to match the ``output_shape`` given to the
    Lambda layer; TODO confirm.
    """
    input_ids, attention_mask = inputs
    return bert_model(input_ids=input_ids, attention_mask=attention_mask)[0]

# NOTE(review): wrapping the encoder in a Lambda likely means Keras does not
# track BERT's weights (so they would be neither trained nor saved with the
# model) - TODO confirm.
bert_output = Lambda(bert_layer, output_shape=(100, 768))([input_ids, attention_mask]) 
# Average the per-token vectors into one vector per review.
# NOTE(review): no mask is passed, so padded positions are presumably included
# in the average - confirm.
x = GlobalAveragePooling1D()(bert_output)
x = Dropout(0.3)(x)
# Single sigmoid unit producing a score in (0, 1) for the binary label.
output = Dense(1, activation='sigmoid')(x)

In [None]:
# Assemble the two-input classifier from the layers defined in the previous cell.
model = Model(inputs=[input_ids, attention_mask], outputs=output)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Stop once val_loss has not improved for 3 epochs and restore the best weights.
early_stopping = EarlyStopping(monitor='val_loss', restore_best_weights=True, patience=3)

# Save the whole model (not only its weights) each time val_loss reaches a new minimum.
model_checkpoint = ModelCheckpoint(
    filepath='model_reviews.keras',
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    save_weights_only=False,
    verbose=1,
)

# NOTE(review): the test split is also used as validation data, so early stopping
# and checkpoint selection see the same data the final evaluation reports on.
train_features = [train_inputs, train_masks]
validation_set = ([test_inputs, test_masks], test_labels)

history = model.fit(
    train_features,
    train_labels,
    validation_data=validation_set,
    batch_size=16,
    epochs=50,
    callbacks=[early_stopping, model_checkpoint],
)

In [None]:
# Reload the checkpoint written by ModelCheckpoint during training. custom_objects
# supplies the Python function wrapped by the Lambda layer (presumably looked up by name).
model = tf.keras.models.load_model('model_reviews.keras', custom_objects={'bert_layer': bert_layer})

In [None]:
# Re-create the pretrained encoder that bert_layer() reads from module scope.
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

# Score the reloaded model on the held-out split and report both metrics.
eval_inputs = [test_inputs, test_masks]
loss, accuracy = model.evaluate(eval_inputs, test_labels)
print(f'Loss: {loss}', f'Accuracy: {accuracy}', sep='\n')

In [None]:
# Database location. The default is the original local SQLite file; set the
# REVIEWS_DB_URL environment variable to point at another database without
# editing the notebook (the default path exists only on one machine).
DEFAULT_DB_URL = 'sqlite:///C:/Users/Alex/Documents/projects/restaurants_reviews/reviews.db'
engine = create_engine(os.environ.get('REVIEWS_DB_URL', DEFAULT_DB_URL))

Base = declarative_base()


class ClassificationResult(Base):
    """ORM mapping for the ``classification_results`` table.

    Columns:
        id: Integer primary key.
        review_text: String column - presumably the review that was classified.
        predicted_rating: Float column - presumably the model's output for it.
    """

    __tablename__ = 'classification_results'

    id = Column(Integer, primary_key=True)
    review_text = Column(String)
    predicted_rating = Column(Float)


# Create any tables that do not exist yet, then open a session bound to the engine.
Base.metadata.create_all(engine)
Session = sessionmaker(bind=engine)
session = Session()