In [1]:
import pandas as pd
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import ast

# Read Data

In [2]:
# Function to convert the string representation of lists to actual lists
def convert_to_list(row):
    return ast.literal_eval(row)

# # Sample data
# data = =pd.read_csv() pd.DataFrame({
#     'text': ["Serves really good sushi .",
#              "Not the biggest portions but adequate .",
#              "Green Tea creme brulee is a must !",
#              "It has great sushi and even better service .",
#              "The entire staff was extremely accomodating and tended to my every need .",
#              "The owner is belligerent to guests that have a complaint ."],
#     'aspect': [['sushi'], ['portions', 'portions'], ['Green Tea creme brulee'], ['sushi', 'service'], ['staff'], ['owner']],
#     'sentiment': [['POS'], ['NEU', 'NEU'], ['POS'], ['POS', 'POS'], ['POS'], ['NEG']]
# })
data = pd.read_csv("processed_ASTE-Data-V2/16res_train_original.csv")
# Apply the converter function to the 'Aspect_Category' column
data['aspect'] = data['aspect'].apply(convert_to_list)
data['sentiment'] = data['sentiment'].apply(convert_to_list)

# Convert sentiment labels to numerical representations
sentiments = {'POS': 0, 'NEU': 1, 'NEG': 2}
data['sentiment'] = data['sentiment'].apply(lambda x: [sentiments[s] for s in x])

# Flatten the aspect and sentiment lists for multi-label classification
data_flat = data.explode('aspect').explode('sentiment').reset_index(drop=True)


In [3]:
data_flat

Unnamed: 0,text,aspect,opinion,sentiment
0,Judging from previous posts this used to be a ...,place,['good'],2
1,"We , there were four of us , arrived at noon -...",staff,['rude'],2
2,The food was lousy - too sweet or too salty an...,food,"['lousy', 'too sweet', 'too salty', 'tiny']",2
3,The food was lousy - too sweet or too salty an...,food,"['lousy', 'too sweet', 'too salty', 'tiny']",2
4,The food was lousy - too sweet or too salty an...,food,"['lousy', 'too sweet', 'too salty', 'tiny']",2
...,...,...,...,...
2979,bad staff,staff,['bad'],2
2980,I generally like this place .,place,['like'],0
2981,The food is good .,food,['good'],0
2982,The design of the space is good .,space,['good'],0


# Preprocessing

In [4]:
# Preprocessing
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Encode text and aspects
inputs = tokenizer(data_flat['text'].tolist(), padding=True, truncation=True, return_tensors="pt")
aspect_embeddings = model(**inputs)['last_hidden_state']

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


RuntimeError: [enforce fail at C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\c10\core\impl\alloc_cpu.cpp:72] data. DefaultCPUAllocator: not enough memory: you tried to allocate 843350016 bytes.

In [None]:
aspect_embeddings.shape

torch.Size([857, 92, 768])

In [None]:
data_flat


Unnamed: 0,text,aspect,opinion,sentiment
0,Judging from previous posts this used to be a ...,place,['good'],2
1,"We , there were four of us , arrived at noon -...",staff,['rude'],2
2,The food was lousy - too sweet or too salty an...,food,"['lousy', 'too sweet', 'too salty', 'tiny']",2
3,The food was lousy - too sweet or too salty an...,food,"['lousy', 'too sweet', 'too salty', 'tiny']",2
4,The food was lousy - too sweet or too salty an...,food,"['lousy', 'too sweet', 'too salty', 'tiny']",2
...,...,...,...,...
2979,bad staff,staff,['bad'],2
2980,I generally like this place .,place,['like'],0
2981,The food is good .,food,['good'],0
2982,The design of the space is good .,space,['good'],0


In [None]:
# Prepare training and testing data
X_train, X_test, y_train, y_test = train_test_split(aspect_embeddings, data_flat['sentiment'], test_size=0.2, random_state=42)

# Train an SVM classifier
classifier = SVC()
classifier.fit(X_train, y_train)


ValueError: Found input variables with inconsistent numbers of samples: [857, 2984]

In [None]:

# Make predictions
y_pred = classifier.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

In [None]:


# Predict the sentiment for new inputs
new_text = ["The sushi was amazing but the service was slow."]
new_aspect = [['sushi', 'service']]

new_inputs = tokenizer(new_text, padding=True, truncation=True, return_tensors="pt")
new_aspect_embeddings = model(**new_inputs)['last_hidden_state']

new_pred = classifier.predict(new_aspect_embeddings)
print(f"Predicted sentiment: {new_pred}")