# Laboratorio 1.5
Invece di ciclare su tutti i synset, si sfrutta il meccanismo del "genus": l'idea è localizzare un concetto a partire dal suo iperonimo, per poi aggiungere le proprietà che lo caratterizzano.

In [41]:
from nltk.corpus import wordnet as wn, brown, stopwords
from nltk import word_tokenize, SnowballStemmer
import spacy #pip3 install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gz # CONTROLLA VERSIONE CHE SERVE LA 3.0/3.1.0!
from nltk.stem import SnowballStemmer
import matplotlib.pylab as plt
import pandas as pd
import string
from operator import itemgetter


snow_stemmer = SnowballStemmer(language='english')

# Stop-word set: the NLTK English stop words plus every character in
# string.punctuation, so both are filtered by a single membership test.
stop_words = set(stopwords.words('english')) | set(string.punctuation)

In [42]:
def extract_corpus(path='dataset/defs.csv'):
    """Load a CSV corpus from *path*.

    The default definitions file is returned as a nested dict
    ``{column: {row_label: cell}}`` after dropping every row that contains a
    missing value and removing the ``Partecipante`` column.  Any other path
    is read as a plain comma-separated table and returned as a list of rows.

    Args:
        path: Location of the CSV file to read.

    Returns:
        A ``dict`` for the default definitions file, a ``list`` of row lists
        for every other path.
    """
    if path == 'dataset/defs.csv':
        df = pd.read_csv(path, header=0)
        # Rows are filtered *before* the participant column is removed, so a
        # missing participant id discards the row as well.
        df = df.dropna().drop(columns=['Partecipante'])
        return df.to_dict()

    # Every non-default path is handled as a generic table.  This is an
    # explicit fall-through rather than a second condition: a bare string
    # used as a condition is always true and only looks like a path check.
    df = pd.read_csv(path, header=0, sep=',')
    return df.values.tolist()

In [43]:
def sort_indexes(data):
    """Renumber every concept's definitions with consecutive integers.

    Args:
        data: Mapping ``{concept: {row_label: definition}}``; the row labels
            may have gaps.

    Returns:
        A new mapping with the same concepts whose inner dicts are keyed
        ``0, 1, 2, ...`` in the original iteration order.
    """
    return {
        concept: dict(enumerate(rows.values()))
        for concept, rows in data.items()
    }

In [44]:
def pre_processing(data):
    """Tokenise, lower-case and stop-word-filter every definition in place.

    Args:
        data: Mapping ``{concept: {index: definition_text}}``.

    Returns:
        The same ``data`` object, whose definition strings have been
        replaced by lists of tokens not present in ``stop_words``.
    """
    # Pass 1: lower-case each definition and split it into tokens.
    for definitions in data.values():
        for key, text in definitions.items():
            definitions[key] = word_tokenize(text.lower())

    # Pass 2: drop stop words and punctuation tokens.
    for definitions in data.values():
        for key, tokens in definitions.items():
            definitions[key] = [tok for tok in tokens if tok not in stop_words]

    return data

In [45]:
# Load the concept definitions into a dict and renumber them from 0
data = sort_indexes(extract_corpus())

# Pre-process the input: tokenise, lower-case and drop stop words
data = pre_processing(data)

{'Courage': {0: ['property', 'allows', 'face', 'situation', 'despite', 'feeling', 'fear'], 1: ['ability', 'face', 'fears', 'something', 'scars', 'us', 'makes', 'us', 'unpleasent'], 2: ['ability', 'face', 'thing', 'without', 'fear'], 3: ['inner', 'strength', 'thaht', 'allow', 'face', 'particular', 'situations'], 4: ['ability', 'control', 'fear'], 5: ['ability', 'control', 'fear', 'willing', 'deal', 'something', 'unpleasant'], 6: ['ability', 'avoid', 'fear', 'take', 'risky', 'actions'], 7: ['abiliity', 'make', 'choices', 'take', 'action', 'without', 'fear'], 8: ['able', 'something', 'fearful'], 9: ['ability', 'something', 'despite', 'frightened'], 10: ['ability', 'something', 'scares', 'people'], 11: ['feeling', 'allows', 'us', 'face', 'situations', 'considered', 'dangerous'], 12: ['ability', 'something', 'may', 'scary'], 13: ['ability', 'make', 'drastic', 'choices'], 14: ['ability', 'overcome', 'fear'], 15: ['characteristic', 'person', 'taking', 'risk'], 16: ['quality', 'able', 'things'

In [46]:
def generate_most_frequent_words(definitions, keep=10):
    """Return the ``keep`` most frequent tokens across all definitions.

    Args:
        definitions: Mapping ``{key: [token, ...]}``.  The keys are not
            interpreted, so they need not be consecutive integers.
        keep: Maximum number of words to return; values below 1 yield an
            empty result.

    Returns:
        A dict ``{word: total_count}`` ordered by descending count.  Words
        with equal counts keep the order in which they were first seen.
    """
    counts = {}
    for tokens in definitions.values():
        for word in tokens:
            counts[word] = counts.get(word, 0) + 1

    # sorted() is stable, so ties preserve first-occurrence order.
    ranked = sorted(counts.items(), key=itemgetter(1), reverse=True)
    return dict(ranked[:max(keep, 0)])

In [47]:
# Most frequent tokens (default keep=10) across the 'Apprehension' definitions
freq = generate_most_frequent_words(data['Apprehension'])
print(freq)

{'something': 10, 'fear': 10, 'anxiety': 10, 'state': 7, 'feeling': 6, 'happen': 5, 'bad': 4, 'mind': 3, 'mental': 3, 'person': 3}


In [48]:
def evaluate_corpus(corpus, concept, synsets):
    """Score candidate synsets by word overlap with a concept's definitions.

    Each synset's signature is the set of whitespace-separated words in its
    definition and examples.  The score is the number of signature words
    that also occur anywhere in the definitions stored under
    ``corpus[concept]``.

    Args:
        corpus: Mapping ``{concept: {index: [token, ...]}}``.
        concept: Key of ``corpus`` whose definitions are compared.
        synsets: Candidates exposing ``definition()`` and ``examples()``.

    Returns:
        ``{'synsets': [...], 'matches': [...]}``.  The lists are a running
        record: a synset is appended each time its overlap strictly exceeds
        the best seen so far, so ``matches`` is strictly increasing and the
        last entry is the overall best.  For an empty ``synsets`` list the
        sentinel ``{'synsets': ['error'], 'matches': [-1]}`` is returned.
    """
    if synsets == []:
        print(synsets)
        return {'synsets': ['error'], 'matches': [-1]}

    # The concept vocabulary does not depend on the candidate synset, so it
    # is built once instead of once per synset.
    vocabulary = set()
    for tokens in corpus[concept].values():
        vocabulary.update(tokens)

    best_overlap = -1
    senses = []
    matches = []

    for synset in synsets:
        signature = set(synset.definition().split())
        for example in synset.examples():
            signature.update(example.split())

        overlap = len(vocabulary & signature)

        # NOTE(review): this keeps every synset that sets a new maximum, not
        # only the final winner; generate_genus sums 'matches', so the shape
        # is preserved here.
        if overlap > best_overlap:
            senses.append(synset)
            matches.append(overlap)
            best_overlap = overlap

    return {'synsets': senses, 'matches': matches}

{'synsets': [Synset('condition.n.01')], 'matches': [3]}

In [None]:
# Score every WordNet synset of 'condition' against the 'Apprehension' definitions
test = wn.synsets('condition')
print(evaluate_corpus(data, 'Apprehension', test))

In [49]:
def generate_hypernyms(concept, max_depth=2, corpus=data):
    """Explore WordNet hyponyms of a concept's most frequent words.

    For each of the most frequent words in ``corpus[concept]`` the search
    starts from all of the word's synsets and descends one hyponym level per
    depth step.  At each depth the hyponyms reached from the first word that
    still has any are scored with ``evaluate_corpus``.

    Args:
        concept: Key of ``corpus`` to analyse.
        max_depth: Number of hyponym levels to descend.
        corpus: Mapping ``{concept: {index: [token, ...]}}``.

    Returns:
        ``{depth: evaluate_corpus_result}`` for every depth at which at
        least one word produced hyponyms.
    """
    # NOTE(review): despite the name, the walk follows hyponyms() (more
    # specific senses), not hypernyms - confirm this is intended.
    words = generate_most_frequent_words(corpus[concept])

    # Current level of the search for each word, kept as one flat list so a
    # synset is expanded once per level rather than once per starting synset.
    frontier = {word: wn.synsets(word) for word in words}
    winners = {}

    for depth in range(max_depth):
        for word in words:
            next_level = [
                hyponym
                for synset in frontier[word]
                for hyponym in synset.hyponyms()
            ]
            frontier[word] = next_level

            # Only the first word with hyponyms at this depth is recorded;
            # later words are still expanded because they may be the first
            # non-empty one at a deeper level.  Scoring is skipped for them
            # since its result would be discarded.
            if next_level and depth not in winners:
                winners[depth] = evaluate_corpus(corpus, concept, next_level)

    return winners

In [50]:
# Per-depth winners for 'Apprehension', descending four hyponym levels
print(generate_hypernyms('Apprehension', max_depth=4))

{0: {'synsets': [Synset('alarm.n.01'), Synset('apprehension.n.01')], 'matches': [1, 3]}, 1: {'synsets': [Synset('chill.n.04'), Synset('foreboding.n.01')], 'matches': [0, 2]}, 2: {'synsets': [Synset('presage.n.01'), Synset('hesitance.n.01')], 'matches': [1, 2]}, 3: {'synsets': [Synset('fact.n.03'), Synset('insecureness.n.01'), Synset('pass.n.08'), Synset('disturbance.n.02')], 'matches': [1, 2, 3, 4]}}


In [51]:
def generate_genus(corpus=data, depth=2, keep=10):
    """Pick, for every concept, the best-scoring hyponym level as its genus.

    For each concept the per-depth results of ``generate_hypernyms`` are
    compared by the sum of their ``'matches'`` and the synsets of the
    highest-scoring depth are kept (the shallowest depth wins ties).

    Args:
        corpus: Mapping ``{concept: {index: [token, ...]}}``.
        depth: Forwarded to ``generate_hypernyms`` as ``max_depth``.
        keep: Accepted for backward compatibility.
            NOTE(review): ``generate_hypernyms`` has no parameter to receive
            it and always uses the default number of frequent words, so this
            value currently has no effect on the result.

    Returns:
        ``{concept: [synset, ...]}``; the list is empty when no depth
        produced any candidate for that concept.
    """
    genus = {}

    for concept in corpus:
        levels = generate_hypernyms(concept, max_depth=depth, corpus=corpus)

        # Reset per concept so a concept without candidates does not inherit
        # the previous concept's synsets.
        best_synsets = []
        best_score = -1

        for level in levels.values():
            if not level['matches']:
                continue  # nothing was scored at this depth

            score = sum(level['matches'])
            if score > best_score:
                best_synsets = level['synsets']
                best_score = score

        genus[concept] = best_synsets

    return genus

In [52]:
# Genus candidates for every concept in the corpus, descending three levels
print(generate_genus(depth=3, keep=5))

{'Courage': [Synset('adaptability.n.01'), Synset('form.n.14'), Synset('penetration.n.04')], 'Paper': [Synset('carborundum.n.01'), Synset('steel_wool.n.01'), Synset('binder.n.02'), Synset('carbon_paper.n.01'), Synset('writing_paper.n.01')], 'Apprehension': [Synset('alarm.n.01'), Synset('apprehension.n.01')], 'Sharpener': [Synset('abrading_stone.n.01'), Synset('blade.n.09'), Synset('dibble.n.01'), Synset('wire_stripper.n.01')]}
