In [1]:
import xml.etree.ElementTree as ET 
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.preprocessing import MultiLabelBinarizer 
from sklearn.svm import SVC 
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import f1_score, classification_report
import time

In [2]:
def parse_semeval_xml(file_path):
    """Read sentences and their aspect-category labels from an XML file.

    The function looks at every ``<sentence>`` element that is a direct child
    of the document root. A sentence is kept only if it has a ``<text>`` child
    whose content is non-empty after stripping whitespace; the text itself is
    stored unmodified (not stripped).

    Args:
        file_path: Path (or file object) accepted by ``ET.parse``.

    Returns:
        A ``(sentences, labels)`` tuple of two parallel lists. ``sentences[i]``
        is the raw text of the i-th kept sentence and ``labels[i]`` is the list
        of ``category`` attribute values taken from its
        ``<aspectCategories>/<aspectCategory>`` children, or ``[]`` when the
        sentence has no ``<aspectCategories>`` element.
    """
    document_root = ET.parse(file_path).getroot()

    sentences, labels = [], []

    for sentence_node in document_root.findall('sentence'):
        text_node = sentence_node.find('text')
        raw_text = text_node.text if text_node is not None else None

        # Skip sentences whose text is missing, empty, or whitespace-only.
        if not raw_text or not raw_text.strip():
            continue

        sentences.append(raw_text)

        category_group = sentence_node.find('aspectCategories')
        if category_group is None:
            # Keep labels aligned with sentences even when nothing is annotated.
            labels.append([])
        else:
            labels.append(
                [node.get('category') for node in category_group.findall('aspectCategory')]
            )

    return sentences, labels

In [3]:
# Locations of the restaurant-review XML files, relative to the notebook.
train_path = './datasets/Restaurants_Train.xml'
test_path = './datasets/Restaurants_Test_Data_phaseB.xml'

# Each call returns a (sentences, labels) pair of parallel lists.
train_texts, train_labels_str = parse_semeval_xml(file_path=train_path)
test_texts, test_labels_str = parse_semeval_xml(file_path=test_path)

# Report dataset sizes and show one parsed example as a sanity check.
print(f"Loaded {len(train_texts)} training sentences and {len(test_texts)} test sentences.")
print("Example training sentence:", train_texts[1])
print("Example training labels:", train_labels_str[1])

Loaded 3044 training sentences and 800 test sentences.
Example training sentence: To be completely fair, the only redeeming factor was the food, which was above average, but couldn't make up for all the other deficiencies of Teodora.
Example training labels: ['food', 'anecdotes/miscellaneous']


In [4]:
# Fixed list of aspect categories; passing it as `classes` fixes the column
# order of the binary indicator matrices produced below.
CATEGORIES = [
    'food',
    'service',
    'price',
    'ambience',
    'anecdotes/miscellaneous',
]

mlb = MultiLabelBinarizer(classes=CATEGORIES)
mlb.fit(train_labels_str)

# One row per sentence, one 0/1 column per category.
train_labels = mlb.transform(train_labels_str)
test_labels = mlb.transform(test_labels_str)

print('labels have been binarized')
print(f'example binarized training labels: {train_labels[1]}')
print(f'label mapping: {list(mlb.classes_)}')

labels have been binarized
example binarized training labels: [1 0 0 0 1]
label mapping: ['food', 'service', 'price', 'ambience', 'anecdotes/miscellaneous']


In [5]:
# TF-IDF features over lowercased text with English stop words removed;
# max_features limits the vocabulary to at most 5000 terms.
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english', lowercase=True)

In [6]:
# Fit the vectorizer on the training text only, then reuse the fitted
# vectorizer for the test text so both matrices share the same columns.
X_train_tfidf = vectorizer.fit_transform(raw_documents=train_texts)
X_test_tfidf = vectorizer.transform(raw_documents=test_texts)

print(f'shape of training data matrix: {X_train_tfidf.shape}')

shape of training data matrix: (3044, 4152)


In [7]:
# Linear-kernel SVM used as the base estimator; OneVsRestClassifier fits one
# copy of it per label column.
# probability=True makes SVC fit an extra cross-validated calibration step that
# shuffles the data, so random_state is pinned to keep the resulting
# probability estimates reproducible across kernel restarts.
svm_classifier = SVC(kernel='linear', probability=True, C=1.0, random_state=42)
ovr_classifier = OneVsRestClassifier(svm_classifier)

In [8]:
print('starting baseline model training')
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()

# Fits the one-vs-rest classifier on the TF-IDF matrix and the binarized labels.
ovr_classifier.fit(X_train_tfidf, train_labels)

end_time = time.perf_counter()
print(f'training finished in {end_time - start_time:.2f} seconds')

starting baseline model training
training finished in 5.27 seconds


In [9]:
print('evaluating model on test set')

# Binary indicator matrix of predicted categories, one row per test sentence.
y_pred = ovr_classifier.predict(X_test_tfidf)

# Micro-averaged F1 over all (sentence, category) decisions.
f1 = f1_score(test_labels, y_pred, average='micro')
print(f'F1-Score: {f1}')

print('\n--- Classification Report (Per Category) ---')
# The report also computes per-sample averages, which are undefined for a
# sentence with no predicted (or no true) labels. zero_division=0 scores those
# cases as 0 -- the same value the default uses -- without emitting an
# UndefinedMetricWarning into the cell output.
print(classification_report(test_labels, y_pred, target_names=CATEGORIES, zero_division=0))

evaluating model on test set
F1-Score: 0.8238453276047261

--- Classification Report (Per Category) ---
                         precision    recall  f1-score   support

                   food       0.94      0.84      0.89       418
                service       0.96      0.79      0.87       172
                  price       0.98      0.70      0.82        83
               ambience       0.92      0.58      0.71       118
anecdotes/miscellaneous       0.81      0.65      0.72       234

              micro avg       0.92      0.75      0.82      1025
              macro avg       0.92      0.71      0.80      1025
           weighted avg       0.92      0.75      0.82      1025
            samples avg       0.82      0.77      0.78      1025



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [10]:
import pandas as pd 

In [11]:
print('saving model predictions to CSV')
# One column per category (named after mlb.classes_), one row per prediction;
# the row index is not written to the file.
baseline_predicitons_df = pd.DataFrame(data=y_pred, columns=mlb.classes_)
baseline_predicitons_df.to_csv(path_or_buf='svm_predictions.csv', index=False)

saving model predictions to CSV
