In [67]:
import pandas as pd
import json
from nltk.corpus import wordnet as wn
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score

MAX_WORD_LENGHT=30
MAX_DESCRIPTION_LENGHT=300
MAX_WORDNET_HEIGHT=20
MAX_NUM_EXAMPLES=20
MAX_WN_PARENTS=10
MAX_WN_CHILDRENS=10

In [68]:
def json_file_to_df(filename):
    # Leggi il file JSON
    with open(filename, 'r') as file:
        data = file.read()
    # Parsifica il JSON in un DataFrame
    data = json.loads(data)
    # Converte gli oggetti non di tipo Series in Series
    for key, value in data.items():
        if not isinstance(value, pd.Series):
            data[key] = pd.Series(value)

    # Crea il DataFrame
    df = pd.DataFrame(data)
    return df

def elaborate_dataset(dataset):
    result = []
    for synset in dataset[:-2]:
        result.append([synset.split(":")[0].replace("Synset('","").replace("')",""),0])
    return result

def calc_basicness_score(dataset,df):
    for index, tag in enumerate(df[:-2]):
        if tag == "basic":
            dataset[index][1] += 1
    return dataset

def calc_avg(dataset, test_number):
     for index, score in enumerate(dataset):
         dataset[index][1] = dataset[index][1]/test_number
         if(dataset[index][1]>=0.5):
             dataset[index].append(1)
         else:
             dataset[index].append(0)
     return dataset

## Load the data and calculate avg scores for Basic/Advanced, we calculate the float score and the possible predicition (0/1) to use with SVM

In [69]:
df = json_file_to_df("dataset_basic_advanced_TLN2023/1.json")
dataset = elaborate_dataset(df["dataset"])
dataset = calc_basicness_score(dataset,df["answers"])
dataset = calc_basicness_score(dataset,json_file_to_df("dataset_basic_advanced_TLN2023/2.json")["answers"])
dataset = calc_basicness_score(dataset,json_file_to_df("dataset_basic_advanced_TLN2023/3.json")["answers"])
dataset = calc_basicness_score(dataset,json_file_to_df("dataset_basic_advanced_TLN2023/4.json")["answers"])
dataset = calc_basicness_score(dataset,json_file_to_df("dataset_basic_advanced_TLN2023/5.json")["answers"])
dataset = calc_basicness_score(dataset,json_file_to_df("dataset_basic_advanced_TLN2023/6.json")["answers"])
dataset = calc_basicness_score(dataset,json_file_to_df("dataset_basic_advanced_TLN2023/7.json")["answers"])
dataset = calc_basicness_score(dataset,json_file_to_df("dataset_basic_advanced_TLN2023/8.json")["answers"])
dataset = calc_basicness_score(dataset,json_file_to_df("dataset_basic_advanced_TLN2023/9.json")["answers"])
dataset = calc_basicness_score(dataset,json_file_to_df("dataset_basic_advanced_TLN2023/10.json")["answers"])
dataset = calc_avg(dataset,10)

print(dataset)


[['war.n.01', 1.0, 1], ['fiefdom.n.01', 0.0, 0], ['bed.n.03', 1.0, 1], ['return_on_invested_capital.n.01', 0.0, 0], ['texture.n.02', 0.8, 1], ['news.n.01', 1.0, 1], ['look.n.02', 1.0, 1], ['caddy.n.01', 0.2, 0], ['weeder.n.01', 0.0, 0], ['avenue.n.02', 0.4, 0], ['adar.n.01', 0.0, 0], ['bedtime.n.01', 0.8, 1], ['inversion.n.08', 0.7, 1], ['yak.n.01', 0.1, 0], ['breath.n.05', 0.9, 1], ['executive_clemency.n.01', 0.0, 0], ['muse.n.02', 0.3, 0], ['effect.n.06', 1.0, 1], ['quickening.n.02', 0.1, 0], ['sleeper.n.09', 0.5, 1], ['caravanning.n.01', 0.0, 0], ['jotter.n.01', 0.0, 0], ['armageddon.n.02', 0.2, 0], ['compass_point.n.01', 0.1, 0], ['blackwater_fever.n.01', 0.0, 0], ['respect.n.03', 1.0, 1], ['position.n.06', 1.0, 1], ['message.n.02', 1.0, 1], ['arrest.n.02', 0.8, 1], ['motivation.n.01', 0.9, 1], ['day.n.04', 1.0, 1], ['nose_cone.n.01', 0.0, 0], ['discussion.n.02', 0.9, 1], ['glow.n.05', 0.5, 1], ['alcalde.n.01', 0.0, 0], ['draft_board.n.01', 0.2, 0], ['multitude.n.03', 0.4, 0], ['ho

In [70]:
def normalize(value, min_val, max_val):
    return (value - min_val) / (max_val - min_val)

def extract_synset_features(synset):
    # Lunghezza della descrizione
    if(len(synset.definition())>MAX_DESCRIPTION_LENGHT):
        desc_length=1
    else:
        desc_length = normalize(len(synset.definition()),0,MAX_DESCRIPTION_LENGHT)

    # Numero di esempi
    if(len(synset.examples())>MAX_NUM_EXAMPLES):
        num_examples=1
    else:
        num_examples = normalize(len(synset.examples()),0,MAX_NUM_EXAMPLES)

    # Altezza dell'albero WordNet
    if(synset.max_depth()>MAX_WORDNET_HEIGHT):
        wd_height=1
    else:
        wd_height = normalize(synset.max_depth(),0,MAX_WORDNET_HEIGHT)

    # Lunghezza della parola
    if(len(synset.lemmas()[0].name())>MAX_WORD_LENGHT):
        word_length=1
    else:
        word_length = normalize(len(synset.lemmas()[0].name()),1,MAX_WORD_LENGHT)

    # Lunghezza media degli esempi
    if((sum(len(example) for example in synset.examples()) / num_examples if num_examples > 0 else 0)>MAX_DESCRIPTION_LENGHT):
        examples_avg_length=1
    else:
        examples_avg_length = normalize((sum(len(example) for example in synset.examples()) / num_examples if num_examples > 0 else 0),0,300)

    # Numero di padri
    if(len(synset.hypernym_paths()[0])>MAX_WN_PARENTS):
        num_parents=1
    else:
        num_parents = normalize(len(synset.hypernym_paths()[0]) - 1,0,MAX_WN_PARENTS)

    # Numero di figli
    if(len(synset.hyponyms())>MAX_WN_CHILDRENS):
        num_children=1
    else:
        num_children = normalize(len(synset.hyponyms()),0,MAX_WN_CHILDRENS)

    return [desc_length, num_examples, wd_height, word_length, examples_avg_length, num_parents, num_children]

def extract_fetures_from_dataset(dataset):
    for index, score in enumerate(dataset):
        synset = wn.synset(dataset[index][0])
        dataset[index].append(extract_synset_features(synset))

    return dataset


## Extract normalized features from wordnet

In [71]:
dataset = extract_fetures_from_dataset(dataset)
print(dataset)

[['war.n.01', 1.0, 1, [0.15, 0.05, 0.35, 0.06896551724137931, 1, 0.7, 0.9]], ['fiefdom.n.01', 0.0, 0, [0.12666666666666668, 0.0, 0.3, 0.20689655172413793, 0.0, 0.6, 0.0]], ['bed.n.03', 1.0, 1, [0.17666666666666667, 0.05, 0.25, 0.06896551724137931, 1, 0.5, 0.4]], ['return_on_invested_capital.n.01', 0.0, 0, [0.6766666666666666, 0.0, 0.3, 0.8620689655172413, 0.0, 0.6, 0.0]], ['texture.n.02', 0.8, 1, [0.11333333333333333, 0.05, 0.45, 0.20689655172413793, 1, 0.9, 0.0]], ['news.n.01', 1.0, 1, [0.15, 0.05, 0.25, 0.10344827586206896, 1, 0.5, 0.3]], ['look.n.02', 1.0, 1, [0.24333333333333335, 0.2, 0.4, 0.10344827586206896, 1, 0.8, 1]], ['caddy.n.01', 0.2, 0, [0.07, 0.0, 0.4, 0.13793103448275862, 0.0, 0.8, 0.0]], ['weeder.n.01', 0.0, 0, [0.10666666666666667, 0.0, 0.65, 0.1724137931034483, 0.0, 1, 0.0]], ['avenue.n.02', 0.4, 0, [0.09666666666666666, 0.0, 0.45, 0.1724137931034483, 0.0, 0.9, 0.0]], ['adar.n.01', 0.0, 0, [0.41333333333333333, 0.0, 0.35, 0.10344827586206896, 0.0, 0.7, 0.0]], ['bedtim

## Split test and training set

In [72]:
#We save the x_values with the labels to eventually check on wich instances it fails
x_values=[[row[0],row[1],row[3]] for row in dataset]
y_values=[row[2] for row in dataset]
train_features_with_name, test_features_with_name, train_labels, test_labels = train_test_split(x_values, y_values, test_size=0.2, random_state=22)
#We remove the labels from the features that we give to the SVM
train_features=[row[2] for row in train_features_with_name]
test_features=[row[2] for row in test_features_with_name]


## Train SVM

In [73]:
classifier = svm.SVC(kernel='linear')
classifier.fit(train_features, train_labels)

labels_pred=classifier.predict(test_features)
accuracy = accuracy_score(test_labels, labels_pred)
print("Accuracy:", accuracy)


Accuracy: 0.8118811881188119


## Print erroneus predictions with their original float score to check if the training set was unsure about their placement

In [74]:
for index,pred in enumerate(labels_pred):
    if(pred!=test_labels[index]):
        print(test_features_with_name[index][0],test_features_with_name[index][1])

esteem.n.01 0.1
holy_order.n.01 0.1
link.n.06 1.0
pinpoint.n.01 0.0
armageddon.n.02 0.2
breeze.n.01 0.2
hired_hand.n.01 0.0
exemplar.n.01 0.4
narrow.n.01 0.5
contingent.n.02 0.2
flower.n.03 1.0
committee.n.01 0.4
wiggliness.n.01 0.0
administrator.n.01 0.6
mull.n.01 0.1
footrace.n.01 0.0
infantry.n.01 0.2
workshop.n.02 0.5
prodigy.n.03 0.1
