In [1]:
import sqlite3
import os
import sys

dim = 64   # embedding dimensionality: 32 or 64
dir = 'itwac'  # NOTE(review): shadows the builtin `dir` for the rest of the session

# Parent of the current working directory; the itwac files are resolved under it.
parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# SQLite and plain-text embedding stores share the same folder and base name.
sql_path = f'{parent_dir}/{dir}/itwac{dim}.sqlite'  #(1) best
txt_path = f'{parent_dir}/{dir}/itwac{dim}.txt'
# Directory listed later to collect the documents (relative to the working directory).
conllu_dir = 'data/profiling_output/11459'

In [2]:
# Sanity check: display the resolved SQLite path.
sql_path

'c:\\Users\\corra\\Documents\\GitHub\\ProgettoLinCompII\\Project-of-Computational-Linguistics-II/itwac/itwac64.sqlite'

In [3]:
# Size of the embedding vectors; also used as the length of the zero-vector
# fallback in the compute_*_mean functions.
# NOTE(review): duplicates `dim` from the first cell — keep the two values in sync.
embeddings_dim = 64
# Path of the text embeddings file, relative to `parent_dir`.
embeddings_path = f'itwac/itwac{embeddings_dim}.txt'

In [4]:
sys.path.append('..')  # Add parent directory to the Python path
# NOTE(review): wildcard import. The notebook uses load_word_embeddings,
# get_tokens_from_file and create_label_list from it; `np` is also used below
# but never imported explicitly, so it presumably arrives through this import
# as well — confirm before replacing it with explicit imports.
from utils.helpers import *

In [5]:
# Load the embedding table; with the current settings this is the same file as `txt_path`.
# The result is used below as a mapping from word to vector.
embeddings = load_word_embeddings(parent_dir+"/"+embeddings_path)

In [9]:
# Spot-check: look up the vector stored for one key to confirm the table loaded.
embeddings['veder-']

array([-0.21470731, -0.1392023 ,  0.27303648, -0.05340238, -0.03936952,
        0.02390084, -0.02741197, -0.12491986, -0.02866316,  0.00107179,
       -0.09525209,  0.05685699,  0.01190181, -0.09539247,  0.06602568,
       -0.0245655 , -0.09080959,  0.10820474, -0.04021769,  0.01245855,
        0.19386294, -0.17447612, -0.03580143,  0.14230289,  0.22230086,
        0.00798338, -0.07439804,  0.06270457, -0.00456899, -0.37990505,
        0.06431432,  0.00839787, -0.06637963,  0.06906799, -0.30072612,
        0.27667612, -0.06807792, -0.09943178, -0.10579097,  0.02905671,
        0.20786461,  0.01789608, -0.08334571,  0.01347961,  0.06287382,
        0.01383804, -0.00558291, -0.14960463, -0.12520191,  0.16182758,
        0.07295152,  0.01592852, -0.01849817,  0.0850869 ,  0.04188532,
       -0.05708888,  0.0698403 ,  0.02712907,  0.07868258, -0.01340355,
        0.14947875, -0.33357722, -0.07842311,  0.02836373])

In [10]:
# Collect the path of every entry in `conllu_dir`.
# os.listdir returns names in arbitrary, platform-dependent order. Features,
# labels and the cross-validation folds are all derived from this list, so the
# names are sorted to make the document order (and the results) reproducible.
all_documents_paths = [
    os.path.join(conllu_dir, file_name)
    for file_name in sorted(os.listdir(conllu_dir))
]

In [11]:
# One token list per document, in the same order as `all_documents_paths`.
all_documents = [
    get_tokens_from_file(path)
    for path in all_documents_paths
]

In [12]:
def compute_embeddings_mean(document_embeddings):
    """Average a collection of embedding vectors element-wise.

    Args:
        document_embeddings: sequence of equally sized vectors. The callers in
            this notebook only pass non-empty lists.

    Returns:
        The sum over axis 0 divided by the number of vectors.
    """
    n_vectors = len(document_embeddings)
    return np.sum(document_embeddings, axis=0) / n_vectors

def compute_all_embeddings_mean(document_tokens):
    """Mean embedding of every token whose word is in the embedding table.

    Reads the module-level `embeddings` mapping and `embeddings_dim`.

    Args:
        document_tokens: iterable of token mappings with a 'word' key.

    Returns:
        The mean vector of the matched embeddings, or a zero vector of length
        `embeddings_dim` when no word of the document is in the table.
    """
    known_vectors = [
        embeddings[token['word']]
        for token in document_tokens
        if token['word'] in embeddings
    ]

    if not known_vectors:
        return np.zeros(embeddings_dim)
    return compute_embeddings_mean(known_vectors)


def compute_filtered_embeddings_mean(document_tokens):
    """Mean embedding restricted to ADJ, NOUN and VERB tokens.

    Reads the module-level `embeddings` mapping and `embeddings_dim`.

    Args:
        document_tokens: iterable of token mappings with 'word' and 'pos' keys.

    Returns:
        The mean vector of the embeddings of tokens whose word is in the table
        and whose 'pos' is 'ADJ', 'NOUN' or 'VERB'; a zero vector of length
        `embeddings_dim` when no token qualifies.
    """
    content_pos = ['ADJ', 'NOUN', 'VERB']
    kept_vectors = []

    for token in document_tokens:
        word, pos = token['word'], token['pos']
        if not (word in embeddings and pos in content_pos):
            continue
        kept_vectors.append(embeddings[word])

    return compute_embeddings_mean(kept_vectors) if kept_vectors else np.zeros(embeddings_dim)


def compute_filtered_embeddings_sep_means(document_tokens):
    """Concatenate separate mean embeddings for ADJ, NOUN and VERB tokens.

    Reads the module-level `embeddings` mapping and `embeddings_dim`.

    Args:
        document_tokens: iterable of token mappings with 'word' and 'pos' keys.

    Returns:
        A flat array made of the ADJ mean, the NOUN mean and the VERB mean, in
        that order. A category with no matched token contributes a zero vector
        of length `embeddings_dim`.
    """
    # One bucket of vectors per part of speech of interest.
    vectors_by_pos = {'ADJ': [], 'NOUN': [], 'VERB': []}

    for token in document_tokens:
        word = token['word']
        pos = token['pos']
        if word in embeddings and pos in vectors_by_pos:
            vectors_by_pos[pos].append(embeddings[word])

    # The output layout is fixed: adjectives, then nouns, then verbs.
    per_pos_means = [
        compute_embeddings_mean(vectors) if vectors else np.zeros(embeddings_dim)
        for vectors in (vectors_by_pos['ADJ'], vectors_by_pos['NOUN'], vectors_by_pos['VERB'])
    ]

    return np.concatenate(per_pos_means, axis=None)

In [55]:
def extract_features(documents, feature_fn=None):
    """Build one feature vector per document.

    Args:
        documents: iterable of documents, each a sequence of tokens.
        feature_fn: callable mapping one document's tokens to its feature
            vector. Defaults to `compute_all_embeddings_mean`. The other
            strategies defined in this notebook,
            `compute_filtered_embeddings_mean` and
            `compute_filtered_embeddings_sep_means`, can be passed here
            instead of editing this function.

    Returns:
        A list with one feature vector per document, in input order.
    """
    # Resolved at call time so the default follows the current definition
    # of compute_all_embeddings_mean after a cell is re-run.
    if feature_fn is None:
        feature_fn = compute_all_embeddings_mean
    return [feature_fn(document_tokens) for document_tokens in documents]

In [56]:
# One feature vector per loaded document, in the order of `all_documents`.
all_features = extract_features(all_documents)


In [57]:
# (number of documents, length of the first feature vector)
len(all_features), len(all_features[0])


(274, 64)

In [58]:
# One label per document path; train_test_split later zips these with the features.
# NOTE(review): create_label_list comes from utils.helpers — presumably it derives
# the label from each path; confirm against the helper.
all_labels = create_label_list(all_documents_paths)


In [59]:
def train_test_split(all_features, all_labels, all_documents_paths):
    """Split features and labels into training and test sets by document path.

    A document goes to the training set when the substring 'training' occurs
    anywhere in its path; every other document goes to the test set.

    Args:
        all_features: feature vectors, one per document.
        all_labels: labels aligned with `all_features`.
        all_documents_paths: paths aligned with `all_features`.

    Returns:
        The tuple (train_features, train_labels, test_features, test_labels).
    """
    # Keyed by "is this a training document?" -> (features, labels).
    partitions = {True: ([], []), False: ([], [])}

    for features, label, document_path in zip(all_features, all_labels, all_documents_paths):
        feature_bucket, label_bucket = partitions['training' in document_path]
        feature_bucket.append(features)
        label_bucket.append(label)

    train_part, test_part = partitions[True], partitions[False]
    return train_part[0], train_part[1], test_part[0], test_part[1]


In [60]:
# Partition features and labels by document path, then show the four sizes.
train_features, train_labels, test_features, test_labels = train_test_split(all_features, all_labels, all_documents_paths)
len(train_features), len(train_labels), len(test_features), len(test_labels)

(200, 200, 74, 74)

In [61]:
from sklearn.preprocessing import MinMaxScaler

# Fit the min-max scaler on the training features only; the same fitted scaler
# is reused to transform the test features later in the notebook.
scaler = MinMaxScaler()
# train_features (a list of vectors) is passed directly, without an explicit np.stack.
X_train = scaler.fit_transform(train_features)

In [62]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC

# Define the parameter grid to search through
param_grid = {
    'C': [0.1, 0.01, 0.001],  # Regularization parameter
    'dual': [True, False]     # dual vs. primal formulation
}

# Linear SVM. With dual=True the solver shuffles the data using a pseudo-random
# generator, so a fixed seed is needed for the fitted model and the
# cross-validation scores to be reproducible across runs.
svc = LinearSVC(random_state=42)

# Grid search with 5-fold cross-validation; refit=True retrains the best
# configuration on the whole training set.
grid_search = GridSearchCV(estimator=svc, param_grid=param_grid, cv=5, refit=True)

# Fit the grid search on the scaled training features
grid_search.fit(X_train, train_labels)

# Mean cross-validated score of each parameter combination
mean_test_scores = grid_search.cv_results_['mean_test_score']

# Print the best parameters and best score
print("Best parameters found:", grid_search.best_params_)
print("Best score found:", grid_search.best_score_)

Best parameters found: {'C': 0.1, 'dual': True}
Best score found: 0.6699999999999999


In [63]:
# Scale the test features with the scaler that was fitted on the training data.
X_test = scaler.transform(test_features)
# Best estimator (model) found by the grid search.
best_model = grid_search.best_estimator_

# Predict labels for the test documents.
predictions = best_model.predict(X_test)

In [64]:
from sklearn.metrics import classification_report, ConfusionMatrixDisplay, confusion_matrix
# Alias for the predictions computed in the previous cell.
test_predictions = predictions
# Per-class precision/recall/F1 on the test set; zero_division=0 reports 0 for undefined metrics.
print(classification_report(test_labels, test_predictions, zero_division=0)) # pass output_dict=True to get a dict instead of text

              precision    recall  f1-score   support

           F       0.79      0.70      0.74        37
           M       0.73      0.81      0.77        37

    accuracy                           0.76        74
   macro avg       0.76      0.76      0.76        74
weighted avg       0.76      0.76      0.76        74

