# Train a model on linguistic features in order to classify

In [1]:
import os
import json
import numpy as np
import joblib

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import accuracy_score, classification_report

# Data pre-processing

In [2]:
basline_directory = "baseline"
dataset_dir = 'vua_dataset'
train_file = os.path.join(dataset_dir, 'vua20_metaphor_train.json')
test_file = os.path.join(dataset_dir, 'vua20_metaphor_test.json')

In [3]:
# Load the bert embeddings
bert_embeddings = np.load(os.path.join('bert_embeddings', 'bert_embeddings.npz'))

In [4]:
X_train = bert_embeddings['train_embeddings']
X_test = bert_embeddings['test_embeddings']
y_train = bert_embeddings['train_labels']
y_test = bert_embeddings['test_labels']

In [5]:
with open(train_file, 'r', encoding='utf-8') as f:
    train_data = [json.loads(line) for line in f]
with open(test_file, 'r', encoding='utf-8') as f:
    test_data = [json.loads(line) for line in f]

In [6]:
def prepare_features(dataset, embeddings, vectorizer=None):
    pos_features = [{'POS': item['POS'], 'FGPOS': item['FGPOS']} for item in dataset]
    labels = np.array([item['label'] for item in dataset])
    if vectorizer is None:
        vectorizer = DictVectorizer(sparse=False)
        pos_features_vectorized = vectorizer.fit_transform(pos_features)
    else:
        pos_features_vectorized = vectorizer.transform(pos_features)
    combined_features = np.hstack([embeddings, pos_features_vectorized])
    return combined_features, labels, vectorizer

In [7]:
train_combined_features, train_labels, vectorizer = prepare_features(train_data, X_train)

In [8]:
test_combined_features, test_labels, _ = prepare_features(test_data, X_test, vectorizer)

# Models using only parts of speech 

In [44]:
def train_model(model_class, X_train, y_train, **kwargs):
    model = model_class(**kwargs)
    model.fit(X_train, y_train)
    return model

In [45]:
def predict_and_evaluate(model, X_test, y_test):
    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    report = classification_report(y_test, predictions)
    return predictions, accuracy, report

In [46]:
def print_results(accuracy, report):
    print(f"Accuracy: {accuracy}")
    print("Classification Report:")
    print(report)

## Logistic Regression

In [47]:
lr_model_pos = train_model(LogisticRegression, X_train, y_train, max_iter=1000, class_weight='balanced')

In [48]:
lr_pos_predictions, lr_pos_accuracy, lr_pos_report = predict_and_evaluate(lr_model_pos, X_test, y_test)
print_results(lr_pos_accuracy, lr_pos_report)

Accuracy: 0.6442151739052081
Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.64      0.75     18214
           1       0.29      0.65      0.40      3982

    accuracy                           0.64     22196
   macro avg       0.59      0.65      0.57     22196
weighted avg       0.79      0.64      0.68     22196



## Random Forest

In [49]:
rf_model_pos = train_model(RandomForestClassifier, X_train, y_train, n_estimators=100, class_weight='balanced', random_state=42, n_jobs=-1)

In [50]:
rf_pos_predictions, rf_pos_accuracy, rf_pos_report = predict_and_evaluate(rf_model_pos, X_test, y_test)
print_results(rf_pos_accuracy, rf_pos_report)

Accuracy: 0.8205983060010813
Classification Report:
              precision    recall  f1-score   support

           0       0.82      1.00      0.90     18214
           1       0.50      0.00      0.00      3982

    accuracy                           0.82     22196
   macro avg       0.66      0.50      0.45     22196
weighted avg       0.76      0.82      0.74     22196



## KNN

In [51]:
knn_model_pos = train_model(KNeighborsClassifier, X_train, y_train, n_neighbors=5, weights='distance')

In [55]:
knn_pos_predictions, knn_pos_accuracy, knn_pos_report = predict_and_evaluate(knn_model_pos, X_test, y_test)
print_results(knn_pos_accuracy, knn_pos_report)

Accuracy: 0.8190664984681925
Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.97      0.90     18214
           1       0.48      0.13      0.20      3982

    accuracy                           0.82     22196
   macro avg       0.66      0.55      0.55     22196
weighted avg       0.77      0.82      0.77     22196



## SVM

In [53]:
svm_model_pos = train_model(LinearSVC, X_train, y_train, max_iter=1000, class_weight='balanced')

In [54]:
svm_pos_predictions, svm_pos_accuracy, svm_pos_report = predict_and_evaluate(svm_model_pos, X_test, y_test)
print_results(svm_pos_accuracy, svm_pos_report)

Accuracy: 0.6436745359524239
Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.64      0.75     18214
           1       0.28      0.65      0.40      3982

    accuracy                           0.64     22196
   macro avg       0.59      0.65      0.57     22196
weighted avg       0.78      0.64      0.68     22196



In [56]:
os.makedirs("baseline_models", exist_ok=True)

# Save models
joblib.dump(lr_model_pos, "baseline_models/logistic_regression_pos.joblib")
joblib.dump(rf_model_pos, "baseline_models/random_forest_pos.joblib")
joblib.dump(knn_model_pos, "baseline_models/knn_pos.joblib")
joblib.dump(svm_model_pos, "baseline_models/svm_pos.joblib")

['baseline_models/svm_pos.joblib']

# Models using parts of speech and embeddings

## Logistic Regression

In [57]:
lr_model = train_model(LogisticRegression, train_combined_features, train_labels, max_iter=1000, class_weight='balanced')

In [71]:
lr_predictions, lr_accuracy, lr_report = predict_and_evaluate(lr_model, test_combined_features, test_labels)
print_results(lr_accuracy, lr_report)

Accuracy: 0.6033519553072626
Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.57      0.70     18214
           1       0.28      0.78      0.41      3982

    accuracy                           0.60     22196
   macro avg       0.60      0.67      0.56     22196
weighted avg       0.81      0.60      0.65     22196



## Random Forest

In [None]:
rf_model = train_model(RandomForestClassifier, train_combined_features, train_labels, n_estimators=100, class_weight='balanced', random_state=42, n_jobs=-1)

NameError: name 'train_model' is not defined

In [60]:
rf_predictions, rf_accuracy, rf_report = predict_and_evaluate(rf_model, test_combined_features, test_labels)
print_results(rf_accuracy, rf_report)

Accuracy: 0.8210488376284015
Classification Report:
              precision    recall  f1-score   support

           0       0.82      1.00      0.90     18214
           1       0.92      0.00      0.01      3982

    accuracy                           0.82     22196
   macro avg       0.87      0.50      0.45     22196
weighted avg       0.84      0.82      0.74     22196



## KNN

In [61]:
knn_model = train_model(KNeighborsClassifier, train_combined_features, train_labels, n_neighbors=5, weights='distance')

In [62]:
knn_predictions, knn_accuracy, knn_report = predict_and_evaluate(knn_model, test_combined_features, test_labels)
print_results(knn_accuracy, knn_report)

Accuracy: 0.8201477743737611
Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.97      0.90     18214
           1       0.50      0.13      0.21      3982

    accuracy                           0.82     22196
   macro avg       0.67      0.55      0.55     22196
weighted avg       0.78      0.82      0.77     22196



## SVM

In [63]:
svm_model = train_model(LinearSVC, train_combined_features, train_labels, max_iter=1000, class_weight='balanced')

In [64]:
svm_predictions, svm_accuracy, svm_report = predict_and_evaluate(svm_model, test_combined_features, test_labels)
print_results(svm_accuracy, svm_report)

Accuracy: 0.5969544061993152
Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.56      0.69     18214
           1       0.28      0.78      0.41      3982

    accuracy                           0.60     22196
   macro avg       0.60      0.67      0.55     22196
weighted avg       0.81      0.60      0.64     22196



In [65]:
os.makedirs("baseline_models", exist_ok=True)

# Save models
joblib.dump(lr_model, "baseline_models/logistic_regression_pos_and_embeddings.joblib")
joblib.dump(rf_model, "baseline_models/random_forest_pos_and_embeddings.joblib")
joblib.dump(knn_model, "baseline_models/knn_pos_and_embeddings.joblib")

['baseline_models/knn_pos_and_embeddings.joblib']

# Error Analysis

In [30]:
def get_misclassified(predictions, test_labels=test_labels):
    return set(i for i, (p, t) in enumerate(zip(predictions, test_labels)) if p != t)

In [73]:
# Collect misclassified indices for each model
lr_errors = get_misclassified(lr_predictions)
rf_errors = get_misclassified(rf_predictions)
knn_errors = get_misclassified(knn_predictions)
svm_errors = get_misclassified(svm_predictions)

# Intersection: indices misclassified by all models
common_errors = lr_errors & rf_errors & knn_errors & svm_errors
print(f"Num of common misclassified errors: {len(common_errors)}")
# Check if all common errors are on figurative (label 1) data points
all_figurative = all(test_data[i]['label'] == 1 for i in common_errors)
print(f"All common errors are figurative: {all_figurative}")
print(f"Number of figurative data points is: {sum(1 for entry in test_data if entry['label'] == 1)}")


# # Print sentences for these indices
# for i in common_errors:
#     entry = test_data[i]
#     print(
#         f"Sentence: {entry['sentence']}\n"
#         f"Word index: {entry['w_index']}\n"
#         f"POS: {entry['POS']}, FGPOS: {entry['FGPOS']}\n"
#         f"True label: {entry['label']}\n"
#         "-----"
#     )

Num of common misclassified errors: 800
All common errors are figurative: True
Number of figurative data points is: 3982


## Logistic Regression

In [74]:
print(f"Num of errors: {len(lr_errors)}")
print(f"Number of figurative classified as literal is: {sum(1 for i in lr_errors if test_data[i]['label'] == 1)}")

Num of errors: 8804
Number of figurative classified as literal is: 895


## Random Forest

In [75]:
print(f"Num of errors: {len(rf_errors)}")
print(f"Number of figurative classified as literal is: {sum(1 for i in rf_errors if test_data[i]['label'] == 1)}")

Num of errors: 3972
Number of figurative classified as literal is: 3971


## KNN

In [76]:
print(f"Num of errors: {len(knn_errors)}")
print(f"Number of figurative classified as literal is: {sum(1 for i in knn_errors if test_data[i]['label'] == 1)}")

Num of errors: 3992
Number of figurative classified as literal is: 3455


## SVM

In [77]:
print(f"Num of errors: {len(svm_errors)}")
print(f"Number of figurative classified as literal is: {sum(1 for i in svm_errors if test_data[i]['label'] == 1)}")

Num of errors: 8946
Number of figurative classified as literal is: 859
