In [6]:
import re #pacchetto per espressioni regolari
import os #pacchetto per muoversi nelle cartelle

In [29]:
conllu_dir = 'profiling_output/11152/'

In [3]:
class Document:

    def __init__(self, document_path):
        self.document_path = document_path
        self._parse_doc_info(document_path)
        self.sentences = []
        self.features = None

    def _parse_doc_info(self, document_path):
        document_path = document_path.split('/')[-1]
        document_info = document_path.split('.')[0]
        document_info = document_info.split('#')
        self.split = document_info[0]
        self.genre = document_info[2]
        self.gender = document_info[3]

    def add_sentence(self, sentences):
        self.sentences.append(sentences)

    # Per dopo

    def get_num_tokens(self):
        num_words = 0
        for sentence in self.sentences:
            num_words = num_words + sentence.get_num_tokens()
        return num_words

    def get_num_chars(self):
        num_chars = 0
        for sentence in self.sentences:
            sentence_char_len = sentence.get_num_chars()
            num_chars = num_chars + sentence_char_len
        return num_chars

class Sentence:

    def __init__(self):
        self.tokens = []

    def add_token(self, token):
        self.tokens.append(token)

    # Per dopo

    def get_words(self):
        return [token.word for token in self.tokens]

    def get_lemmas(self):
        return [token.lemma for token in self.tokens]

    def get_pos(self):
        return [token.pos for token in self.tokens]

    def get_num_tokens(self):
        return len(self.tokens)

    def get_num_chars(self):
        num_chars = 0
        for token in self.tokens:
            num_chars = num_chars + token.get_num_chars()
        num_chars = num_chars + self.get_num_tokens() - 1 # contiamo anche gli spazi
        return num_chars

    def __str__(self):
        return ' '.join([token.word for token in self.tokens])

class Token:

    def __init__(self, word, lemma, pos):
        self.word = word
        self.lemma = lemma
        self.pos = pos


    # Per dopo

    def get_num_chars(self):
        return len(self.word)

In [4]:
def load_document_sentences(document):
    sentence = Sentence()
    for line in open(document.document_path, 'r'):
        if line[0].isdigit():  # se la riga inizia con un numero
            splitted_line = line.strip().split('\t')
            if '-' not in splitted_line[0]:  # se l'id della parola non contiene un trattino
                token = Token(splitted_line[1], splitted_line[2], splitted_line[3])
                sentence.add_token(token)
        if line == '\n':  # se la riga è vuota significa che la frase è finita
            document.add_sentence(sentence)
            sentence = Sentence()

In [31]:
all_documents = []
for file_name in os.listdir(conllu_dir):
    file_path = os.path.join(conllu_dir, file_name)
    document = Document(file_path)
    load_document_sentences(document)
    all_documents.append(document)

In [37]:
sample_document = all_documents[0]
for sentence in sample_document.sentences[:2]:
    tokens = [token.word for token in sentence.tokens]
    print(' - '.join(tokens))
    print('\n_________________\n')

" - Gay - Pride - ?

_________________

Solo - folklore - " - .

_________________



In [38]:
sample_sentence = sample_document.sentences[0]
print(sample_sentence)

" Gay Pride ?


In [39]:
def extract_word_ngrams_from_sentence(word_ngrams, sentence, el, n):
    # creiamo una lista con tutte le parole
    if el == 'word':
        all_words = sentence.get_words()
    elif el == 'lemma':
        all_words = sentence.get_lemmas()
    elif el == 'pos':
        all_words = sentence.get_pos()
    else:
        raise Exception(f'Invalid element {el}')
    

    # scorriamo la lista delle parole ed estraiamo gli n-grammi
    for i in range(0, len(all_words) - n + 1): # -n+1 serve per non uscire dal vettore
        ngram_words = all_words[i: i + n]
        ngram = f'{el.upper()}_{n}_' + '_'.join(ngram_words)
        # print(f'{i}: {ngram_words} -> {ngram}')
        if ngram not in word_ngrams:
            word_ngrams[ngram] = 1
        else:
            word_ngrams[ngram] += 1

    return word_ngrams

In [40]:
extract_word_ngrams_from_sentence(dict(), sample_sentence, 'word', 3)


In [41]:
def extract_char_ngrams_from_sentence(char_ngrams, sentence, n):
    # creiamo una lista con tutte le parole
    all_words = sentence.get_words()

    # creiamo una stringa che contenga tutte le parole separate tra spazi perchè vogliamo scorrere i caratteri
    all_words = ' '.join(all_words)
    # print(all_words)
    # all_words = all_words.lower()

    # scorriamo la stringa ed estraiamo gli n-grammi di caratteri
    for i in range(0, len(all_words) - n + 1):
        ngram_chars = all_words[i:i + n]
        ngram = f'CHAR_{n}_' + ngram_chars
        # print(f'{i}: {ngram_chars} -> {ngram}')

        if ngram not in char_ngrams:
            char_ngrams[ngram] = 1
        else:
            char_ngrams[ngram] += 1

    return char_ngrams

In [42]:
extract_char_ngrams_from_sentence(dict(), sample_sentence, 2);


In [43]:
print(sample_document.features)


None


In [44]:
def extract_documents_ngrams(all_documents):
    for document in all_documents:
        document_ngrams = dict()
        for sentence in document.sentences:
            extract_word_ngrams_from_sentence(document_ngrams, sentence, 'word', 1)
            extract_word_ngrams_from_sentence(document_ngrams, sentence, 'word', 2)
            extract_char_ngrams_from_sentence(document_ngrams, sentence, 1)
            extract_char_ngrams_from_sentence(document_ngrams, sentence, 2)

        document.features = document_ngrams

In [45]:
extract_documents_ngrams(all_documents)


In [46]:
sample_document.features

In [47]:
def normalize_ngrams(ngrams_dict, doc_len):
    for ngram in ngrams_dict:
        ngrams_dict[ngram] = ngrams_dict[ngram] / float(doc_len)
        
def extract_documents_ngrams_normalized(all_documents):
    for document in all_documents:
        word_unigrams = dict()
        word_bigrams = dict()
        char_unigrams = dict()
        char_bigrams = dict()
        for sentence in document.sentences:
            extract_word_ngrams_from_sentence(word_unigrams, sentence, 'word', 1)
            extract_word_ngrams_from_sentence(word_bigrams, sentence, 'word', 2)
            extract_char_ngrams_from_sentence(char_unigrams, sentence, 1)
            extract_char_ngrams_from_sentence(char_bigrams, sentence, 2)

        num_words = document.get_num_tokens()
        num_chars = document.get_num_chars()
        normalize_ngrams(word_unigrams, num_words)
        normalize_ngrams(word_bigrams, num_words)
        normalize_ngrams(char_unigrams, num_chars)
        normalize_ngrams(char_bigrams, num_chars)

        document_ngrams = word_unigrams | word_bigrams | char_unigrams | char_bigrams

        # user_features = dict()
        # for ngrams_dict in [word_unigrams, word_bigrams, char_unigrams, char_bigrams]:
        #     for ngram in ngrams_dict:
        #         user_features[ngram] = ngrams_dict[ngram]

        document.features = document_ngrams

In [48]:
extract_documents_ngrams_normalized(all_documents)


In [49]:
sample_document.features

{'WORD_1_"': 0.030982905982905984,
 'WORD_1_Gay': 0.004273504273504274,
 'WORD_1_Pride': 0.01282051282051282,
 'WORD_1_?': 0.002136752136752137,
 'WORD_1_Solo': 0.002136752136752137,
 'WORD_1_folklore': 0.004273504273504274,
 'WORD_1_.': 0.03205128205128205,
 'WORD_1_A': 0.004273504273504274,
 'WORD_1_Novara': 0.01282051282051282,
 'WORD_1_il': 0.06623931623931624,
 'WORD_1_sindaco': 0.014957264957264958,
 'WORD_1_leghista': 0.003205128205128205,
 'WORD_1_nega': 0.003205128205128205,
 'WORD_1_patrocinio': 0.009615384615384616,
 'WORD_1_ed': 0.002136752136752137,
 'WORD_1_è': 0.008547008547008548,
 'WORD_1_polemica': 0.002136752136752137,
 'WORD_1_Canelli': 0.007478632478632479,
 'WORD_1_,': 0.04700854700854701,
 'WORD_1_già': 0.0010683760683760685,
 'WORD_1_a': 0.029914529914529916,
 'WORD_1_centro': 0.0010683760683760685,
 'WORD_1_di': 0.05448717948717949,
 'WORD_1_le': 0.009615384615384616,
 'WORD_1_critiche': 0.0010683760683760685,
 'WORD_1_per': 0.00641025641025641,
 'WORD_1_no': 0

In [52]:
def train_test_split(all_documents):
    train_features_dict, train_labels = [], []
    test_features_dict, test_labels = [], []
    
    for document in all_documents:
        if document.split == 'training':
            train_features_dict.append(document.features)
            train_labels.append(document.gender)
        else:
            test_features_dict.append(document.features)
            test_labels.append(document.gender)
    
    return train_features_dict, train_labels, test_features_dict, test_labels

In [53]:
train_features_dict, train_labels, test_features_dict, test_labels = train_test_split(all_documents)

In [54]:
len(train_features_dict), len(train_labels), len(test_features_dict), len(test_labels)


(200, 200, 200, 200)

In [55]:
def get_num_features(features_dict):
    all_features = set()
    for document_feats in features_dict:
        all_features.update(list(document_feats.keys()))
    return len(all_features)

In [56]:
get_num_features(train_features_dict)
print(f'Numero features: {get_num_features(train_features_dict)}')

Numero features: 96238


In [57]:
def filter_features(train_features_dict, min_occurrences):
    # contiamo ogni feature in quanti user diversi compare
    features_counter = dict()
    for document_features_dict in train_features_dict:
        for feature in document_features_dict:
            if feature in features_counter:
                features_counter[feature] += 1
            else:
                features_counter[feature] = 1

    # per ogni user, togliamo le features che compaiono in meno di "min_occurrences" utenti
    for document_features_dict in train_features_dict:
        document_features = list(document_features_dict.keys())
        for feature in document_features:
            if features_counter[feature] < min_occurrences:
                document_features_dict.pop(feature)

    return train_features_dict

In [58]:
train_features_dict = filter_features(train_features_dict, 5)
print(f'Numero features dopo il filtro: {get_num_features(train_features_dict)}')

Numero features dopo il filtro: 5754


In [59]:
from sklearn.feature_extraction import DictVectorizer

vectorizer = DictVectorizer()
X_train = vectorizer.fit_transform(train_features_dict)

In [62]:
vectorizer.get_feature_names_out(X_train).tolist()


['CHAR_1_ ',
 'CHAR_1_!',
 'CHAR_1_"',
 'CHAR_1_%',
 'CHAR_1_&',
 "CHAR_1_'",
 'CHAR_1_(',
 'CHAR_1_)',
 'CHAR_1_,',
 'CHAR_1_-',
 'CHAR_1_.',
 'CHAR_1_/',
 'CHAR_1_0',
 'CHAR_1_1',
 'CHAR_1_2',
 'CHAR_1_3',
 'CHAR_1_4',
 'CHAR_1_5',
 'CHAR_1_6',
 'CHAR_1_7',
 'CHAR_1_8',
 'CHAR_1_9',
 'CHAR_1_:',
 'CHAR_1_;',
 'CHAR_1_?',
 'CHAR_1_A',
 'CHAR_1_B',
 'CHAR_1_C',
 'CHAR_1_D',
 'CHAR_1_E',
 'CHAR_1_F',
 'CHAR_1_G',
 'CHAR_1_H',
 'CHAR_1_I',
 'CHAR_1_J',
 'CHAR_1_K',
 'CHAR_1_L',
 'CHAR_1_M',
 'CHAR_1_N',
 'CHAR_1_O',
 'CHAR_1_P',
 'CHAR_1_Q',
 'CHAR_1_R',
 'CHAR_1_S',
 'CHAR_1_T',
 'CHAR_1_U',
 'CHAR_1_V',
 'CHAR_1_W',
 'CHAR_1_Y',
 'CHAR_1_Z',
 'CHAR_1_a',
 'CHAR_1_b',
 'CHAR_1_c',
 'CHAR_1_d',
 'CHAR_1_e',
 'CHAR_1_f',
 'CHAR_1_g',
 'CHAR_1_h',
 'CHAR_1_i',
 'CHAR_1_j',
 'CHAR_1_k',
 'CHAR_1_l',
 'CHAR_1_m',
 'CHAR_1_n',
 'CHAR_1_o',
 'CHAR_1_p',
 'CHAR_1_q',
 'CHAR_1_r',
 'CHAR_1_s',
 'CHAR_1_t',
 'CHAR_1_u',
 'CHAR_1_v',
 'CHAR_1_w',
 'CHAR_1_x',
 'CHAR_1_y',
 'CHAR_1_z',
 'CHAR_1_~',

In [63]:
from sklearn.preprocessing import MaxAbsScaler

scaler = MaxAbsScaler()
X_train = scaler.fit_transform(X_train)

In [64]:
from sklearn.svm import LinearSVC

svc = LinearSVC(dual=True, max_iter=10000)
svc.fit(X_train, train_labels)

In [65]:
X_test = vectorizer.transform(test_features_dict)
X_test = scaler.transform(X_test)

In [66]:
from sklearn.metrics import classification_report

test_predictions = svc.predict(X_test)
print(classification_report(test_labels, test_predictions, zero_division=0))

              precision    recall  f1-score   support

           F       0.49      0.56      0.52       100
           M       0.49      0.42      0.45       100

    accuracy                           0.49       200
   macro avg       0.49      0.49      0.49       200
weighted avg       0.49      0.49      0.49       200

