In [1]:
import sqlite3
import os
import sys

dim = 32   #32 o 64 o 128
dir = 'itwac'

parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

sql_path = f'{parent_dir}/{dir}/itwac{dim}.sqlite'
txt_path = f'{parent_dir}/{dir}/itwac{dim}.txt'
conllu_dir = 'data/profiling_output/11462'

In [3]:
con = sqlite3.connect(sql_path)

In [4]:
cur = con.cursor()

In [5]:
with open(txt_path, 'w+') as out_file:
    for embedding in cur.execute("SELECT * FROM store"):
        str_embedding = [str(el) for el in embedding[:-1]]
        out_file.write('\t'.join(str_embedding)+'\n')

out_file.close()

In [2]:
embeddings_dim = dim
embeddings_path = f'itwac/itwac{embeddings_dim}.txt'

In [3]:
sys.path.append('..')  # Add parent directory to the Python path
from utils.helpers import *

In [4]:
embeddings = load_word_embeddings(parent_dir+"/"+embeddings_path) 

In [5]:
embeddings['vedersi']

array([ 0.10949107, -0.12286978,  0.11641154,  0.17017075,  0.24658015,
       -0.18262769,  0.30469814,  0.21135296,  0.1471802 , -0.01843514,
       -0.12676404,  0.05465349, -0.35320389,  0.02981323,  0.2520377 ,
        0.06308851, -0.0790691 , -0.13654596,  0.03764106, -0.11046565,
        0.08085832,  0.21672083, -0.07570078, -0.26205689, -0.11601808,
        0.38162684,  0.24362694, -0.04126684,  0.08113471, -0.04481747,
       -0.22317572,  0.13983752])

In [6]:
import os
all_documents_paths = []
for file_name in os.listdir(conllu_dir):
    file_path = os.path.join(conllu_dir, file_name)
    all_documents_paths.append(file_path)

In [7]:
all_documents = []

for document_path in all_documents_paths:
    document_tokens = get_tokens_from_file(document_path, postg=True)
    all_documents.append(document_tokens)

In [8]:
all_documents

[[{'word': 'Wow', 'pos': 'PROPN'},
  {'word': 'io', 'pos': 'PRON'},
  {'word': 'mi', 'pos': 'PRON'},
  {'word': 'chiamo', 'pos': 'VERB'},
  {'word': 'Giada', 'pos': 'PROPN'}],
 [{'word': 'Non', 'pos': 'ADV'},
  {'word': 'per', 'pos': 'ADP'},
  {'word': 'insultare', 'pos': 'VERB'},
  {'word': 'ma', 'pos': 'CCONJ'},
  {'word': 'quando', 'pos': 'SCONJ'},
  {'word': 'gridava', 'pos': 'VERB'},
  {'word': 'sembrava', 'pos': 'VERB'},
  {'word': 'una', 'pos': 'DET'},
  {'word': 'cornacchia', 'pos': 'NOUN'}],
 [{'word': 'Commuovente', 'pos': 'ADV'},
  {'word': '!!!', 'pos': 'PUNCT'},
  {'word': '!!', 'pos': 'PUNCT'}],
 [{'word': 'Renzi', 'pos': 'ADV'},
  {'word': 'non', 'pos': 'ADV'},
  {'word': 'sei', 'pos': 'AUX'},
  {'word': 'collocabile', 'pos': 'ADJ'},
  {'word': 'sulla', 'pos': '_'},
  {'word': 'scala', 'pos': 'NOUN'},
  {'word': 'degli', 'pos': '_'},
  {'word': 'esseri', 'pos': 'NOUN'},
  {'word': 'viventi', 'pos': 'ADJ'}],
 [{'word': 'Renzi', 'pos': 'ADV'},
  {'word': 'fai', 'pos': 'VER

In [122]:
def compute_embeddings_mean(document_embeddings):
    sum_array = np.sum(document_embeddings, axis=0)
    mean_array = np.divide(sum_array, len(document_embeddings))
    return mean_array

def compute_all_embeddings_mean(document_tokens):
    document_embeddings = []
    
    for token in document_tokens:
        word = token['word']
        if word in embeddings:
            document_embeddings.append(embeddings[word])
    
    if len(document_embeddings) == 0:
        mean_document_embeddings = np.zeros(embeddings_dim)
    else:
        mean_document_embeddings = compute_embeddings_mean(document_embeddings)
    return mean_document_embeddings

def compute_filtered_embeddings_mean(document_tokens):
    document_embeddings = []
    
    for token in document_tokens:
        word = token['word']
        pos = token['pos']
        if word in embeddings and pos in ['ADJ', 'NOUN', 'VERB', 'PREP']:
            document_embeddings.append(embeddings[word])
    
    if len(document_embeddings) == 0:
        mean_document_embeddings = np.zeros(embeddings_dim)
    else:
        mean_document_embeddings = compute_embeddings_mean(document_embeddings)
    return mean_document_embeddings


def compute_filtered_embeddings_sep_means(document_tokens):
    adj_embeddings = []
    noun_embeddings = []
    verb_embeddings = []
    prep_embeddings = []
    
    for token in document_tokens:
        word = token['word']
        pos = token['pos']
        if word in embeddings and pos in ['ADJ']:
            adj_embeddings.append(embeddings[word])
        elif word in embeddings and pos in ['NOUN']:
            noun_embeddings.append(embeddings[word])
        elif word in embeddings and pos in ['VERB']:
            verb_embeddings.append(embeddings[word])
        elif word in embeddings and pos in ['PREP']:
            prep_embeddings.append(embeddings[word])
    
    if len(adj_embeddings) == 0:
        mean_adj_embeddings = np.zeros(embeddings_dim)
    else:
        mean_adj_embeddings = compute_embeddings_mean(adj_embeddings)
        
    if len(noun_embeddings) == 0:
        mean_noun_embeddings = np.zeros(embeddings_dim)
    else:
        mean_noun_embeddings = compute_embeddings_mean(noun_embeddings)
        
    if len(verb_embeddings) == 0:
        mean_verb_embeddings = np.zeros(embeddings_dim)
    else:
        mean_verb_embeddings = compute_embeddings_mean(verb_embeddings)

    if len(prep_embeddings) == 0:
        mean_prep_embeddings = np.zeros(embeddings_dim)
    else:
        mean_prep_embeddings = compute_embeddings_mean(prep_embeddings) 
    
    
    mean_document_embeddings = np.concatenate([mean_adj_embeddings, mean_noun_embeddings, mean_verb_embeddings, mean_prep_embeddings], axis=None)
    return mean_document_embeddings

In [135]:
def extract_features(documents):
    dataset_features = []
    for document_tokens in documents:
        #document_embeddings = compute_all_embeddings_mean(document_tokens)
        #document_embeddings = compute_filtered_embeddings_mean(document_tokens)
        document_embeddings = compute_filtered_embeddings_sep_means(document_tokens)
        dataset_features.append(document_embeddings)
    return dataset_features

In [136]:
all_features = extract_features(all_documents)


In [137]:
len(all_features), len(all_features[0])


(8795, 256)

In [138]:
all_labels = create_label_list(all_documents_paths)


In [139]:
all_labels[:10]


['F', 'F', 'F', 'M', 'F', 'F', 'F', 'M', 'F', 'F']

In [140]:
def train_test_split(all_features, all_labels, all_documents_paths):
    train_features, train_labels = [], []
    test_features, test_labels = [], []
    
    for features, label,  document_path in zip(all_features, all_labels, all_documents_paths):
        if 'training' in document_path:
            train_features.append(features)
            train_labels.append(label)
        else:
            test_features.append(features)
            test_labels.append(label)
    return train_features, train_labels, test_features, test_labels


In [141]:
train_features, train_labels, test_features, test_labels = train_test_split(all_features, all_labels, all_documents_paths)
len(train_features), len(train_labels), len(test_features), len(test_labels)

(4395, 4395, 4400, 4400)

In [142]:
len(train_labels)

4395

In [143]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
# X_train = np.stack(train_features, axis=0)
X_train = scaler.fit_transform(train_features)

In [144]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC

# Define the parameter grid to search through
param_grid = {
    'C': [0.1, 0.01, 0.001],  # Regularization parameter
    'dual': [True, False]
}

# Initialize SVM with linear kernel
svc = LinearSVC()

# Initialize GridSearchCV with 5-fold cross-validation
grid_search = GridSearchCV(estimator=svc, param_grid=param_grid, cv=5, refit=True)

# Fit the grid search to your data
grid_search.fit(X_train, train_labels)

# Get mean test scores across folds
mean_test_scores = grid_search.cv_results_['mean_test_score']

# Print the best parameters and best score
print("Best parameters found:", grid_search.best_params_)
print("Best score found:", grid_search.best_score_)

Best parameters found: {'C': 0.1, 'dual': True}
Best score found: 0.5688282138794085


In [145]:
X_test = scaler.transform(test_features)
# Get the best estimator (model) found by grid search
best_model = grid_search.best_estimator_

# Now, you can use this best_model to make predictions on new data
# For example, if you have new data X_new, you can predict its labels as follows:
predictions = best_model.predict(X_test)

In [146]:
from sklearn.metrics import classification_report, ConfusionMatrixDisplay, confusion_matrix
test_predictions = predictions
print(classification_report(test_labels, test_predictions, zero_division=0)) # output_dict=True

              precision    recall  f1-score   support

           F       0.52      0.78      0.62      2200
           M       0.55      0.27      0.36      2200

    accuracy                           0.52      4400
   macro avg       0.53      0.52      0.49      4400
weighted avg       0.53      0.52      0.49      4400

