In [47]:
import data_functions

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report

In [41]:
# Locations of the train and test splits, each stored as a JSON Lines file.
TRAIN_JSONL = 'data/restaurant_filtered_train.jsonl'
TEST_JSONL = 'data/restaurant_filtered_test.jsonl'

# Read both splits through the project's JSONL loader.
filtered_train_data = data_functions.load_jsonl(TRAIN_JSONL)
filtered_test_data = data_functions.load_jsonl(TEST_JSONL)

In [45]:
def flatten_aspects(records):
    """Expand each record into one example per annotated aspect.

    Args:
        records: iterable of dicts. Each dict is indexed with 'text' and
            'aspects'; every item of 'aspects' is indexed with 'aspect'
            and 'polarity'.

    Returns:
        A list of dicts with the keys 'text', 'aspect' and 'polarity', one
        per (record, aspect) pair, in input order. A record that lists
        several aspects therefore contributes its text several times.

    Raises:
        KeyError: if a record or aspect item lacks one of the keys above.
    """
    return [
        {'text': record['text'], 'aspect': item['aspect'], 'polarity': item['polarity']}
        for record in records
        for item in record['aspects']
    ]


# Simplest method: give every data entry exactly one aspect and one sentiment
# by repeating the text once per annotated aspect. Both splits go through the
# same helper so they cannot drift apart.
simplified_train_data = flatten_aspects(filtered_train_data)
simplified_test_data = flatten_aspects(filtered_test_data)

In [46]:
# Number of single-aspect examples in each split, train first, then test.
print(*(len(split) for split in (simplified_train_data, simplified_test_data)))

3472 954


In [48]:
# Display the first flattened training example to check its
# text / aspect / polarity layout.
simplified_train_data[0]

{'text': 'But the staff was so horrible to us.',
 'aspect': 'service',
 'polarity': 'negative'}

In [49]:
# Collect the raw text of every example, keeping the order of each split.
train_texts, test_texts = (
    [example['text'] for example in split]
    for split in (simplified_train_data, simplified_test_data)
)

In [57]:
# Vectorize the text data using a TF-IDF representation (default settings).
# The vectorizer is fitted on the training texts only; the test texts are then
# transformed with that already-fitted vectorizer, so X_train and X_test share
# the same feature columns.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train_texts)
X_test = vectorizer.transform(test_texts)

## Aspect classification

In [62]:
# Aspect label of every example (still a string here), aligned with the texts.
train_aspects, test_aspects = (
    [example['aspect'] for example in split]
    for split in (simplified_train_data, simplified_test_data)
)

In [63]:
# Show the distinct aspect labels present in the training split.
set(train_aspects)

{'ambience', 'food', 'other', 'price', 'service'}

In [64]:
# Integer id assigned to each aspect label.
aspect_mapping = {'ambience':0, 'food':1, 'other':2, 'price':3, 'service':4}

# Encode straight from the flattened examples instead of from train_aspects /
# test_aspects. Overwriting those names with their own encoded form made this
# cell raise KeyError when it was run a second time; reading the untouched
# source data keeps the cell safe to re-run and yields the same arrays.
train_aspects = np.array([aspect_mapping[example['aspect']] for example in simplified_train_data])
test_aspects = np.array([aspect_mapping[example['aspect']] for example in simplified_test_data])

In [65]:
# Show the distinct values now held in train_aspects after encoding.
set(train_aspects)

{0, 1, 2, 3, 4}

In [66]:
# Fit a linear-kernel support vector classifier that predicts the aspect id
# of an example from its TF-IDF features.
aspect_svm = SVC(C=1.0, kernel='linear')
aspect_svm.fit(X=X_train, y=train_aspects)

In [81]:
# Predict an aspect id for every test example with the fitted aspect SVM.
aspect_pred = aspect_svm.predict(X_test)

In [82]:
# Per-class precision / recall / F1 of the aspect classifier on the test split.
# Rows are keyed by the integer ids defined in aspect_mapping.
print("Aspect Extraction Classification Report:")
aspect_report = classification_report(y_true=test_aspects, y_pred=aspect_pred)
print(aspect_report)

Aspect Extraction Classification Report:
              precision    recall  f1-score   support

           0       0.65      0.44      0.52       103
           1       0.73      0.77      0.75       395
           2       0.71      0.79      0.75       216
           3       0.57      0.36      0.44        77
           4       0.60      0.66      0.63       163

    accuracy                           0.69       954
   macro avg       0.65      0.60      0.62       954
weighted avg       0.68      0.69      0.68       954



## Sentiment classification

In [74]:
# Polarity label of every example (still a string here), aligned with the texts.
train_sentiments, test_sentiments = (
    [example['polarity'] for example in split]
    for split in (simplified_train_data, simplified_test_data)
)

In [75]:
# Show the distinct polarity labels present in the training split.
set(train_sentiments)

{'negative', 'neutral', 'positive'}

In [77]:
# Integer id assigned to each polarity label.
sentiment_mapping = {'negative':0, 'neutral':1, 'positive':2}

# Encode straight from the flattened examples instead of from train_sentiments /
# test_sentiments. Overwriting those names with their own encoded form made
# this cell raise KeyError when it was run a second time; reading the untouched
# source data keeps the cell safe to re-run and yields the same arrays.
train_sentiments = np.array([sentiment_mapping[example['polarity']] for example in simplified_train_data])
test_sentiments = np.array([sentiment_mapping[example['polarity']] for example in simplified_test_data])

In [78]:
# Show the distinct values now held in train_sentiments after encoding.
set(train_sentiments)

{0, 1, 2}

In [79]:
# Fit a second linear-kernel support vector classifier, this time predicting
# the polarity id from the same TF-IDF features.
# NOTE(review): X_train is built from the text alone, so the model never sees
# which aspect an example refers to. A text repeated for several aspects can
# presumably appear with different polarity labels - confirm against the data.
sentiment_svm = SVC(C=1.0, kernel='linear')
sentiment_svm.fit(X=X_train, y=train_sentiments)

In [80]:
# Predict a polarity id for every test example with the fitted sentiment SVM.
sentiment_pred = sentiment_svm.predict(X_test)

In [83]:
# Per-class precision / recall / F1 of the sentiment classifier on the test
# split. Rows are keyed by the integer ids defined in sentiment_mapping.
# The header previously read "Aspect Extraction", a leftover from the aspect
# section that mislabelled these sentiment metrics.
print("Sentiment Classification Report:")
print(classification_report(test_sentiments, sentiment_pred))

Aspect Extraction Classification Report:
              precision    recall  f1-score   support

           0       0.71      0.61      0.66       215
           1       0.55      0.31      0.39        94
           2       0.83      0.92      0.87       645

    accuracy                           0.79       954
   macro avg       0.70      0.61      0.64       954
weighted avg       0.77      0.79      0.78       954

