# Corpus File Preprocessing for Word2Vec:
## 1. Load and read the corpus.
## 2. Tokenize the text.
## 3. Lowercase all words.
## 4. Remove stop words and non-alphabetic words.
## 5. (Optional) Lemmatize words.
## 6. (Optional) Subsample frequent words.
## 7. Prepare the corpus for Word2Vec (list of tokenized sentences).
## 8. Train the Word2Vec model using Gensim or other frameworks.

In [None]:
import gensim
from gensim.models import Word2Vec

import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
#nltk.download('stopwords')
#nltk.download('punkt_tab')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

import string

import spacy

import pandas as pd

In [None]:
# Read the raw corpus in one go, then split it into sentences with NLTK's
# Italian sentence tokenizer.
corpus_path = r"/home/acarugat/MyRepo1/Testi/Manzoni/Corpus.txt"
with open(corpus_path, mode="r", encoding="utf-8") as file:
    raw_text = file.read()

corpus = sent_tokenize(raw_text, language="italian")

# Sanity check: show only the second sentence.
print(corpus[1:2])

In [None]:
# Lowercase every sentence and split it into word tokens (Italian rules).
tokenized_corpus = [word_tokenize(sentence.lower(), language='italian') for sentence in corpus]

# Characters treated as punctuation: ASCII punctuation plus the guillemets
# and the typographic apostrophe.
punteggiatura = string.punctuation+"«»’"

print (punteggiatura)

# Per-character lookup set. Testing `word not in punteggiatura` directly is a
# *substring* test against the string, which lets multi-character punctuation
# tokens such as "...", "``", "''" or "--" slip through because they are not
# contiguous substrings of it.
punctuation_chars = set(punteggiatura)

# Drop every token made up exclusively of punctuation characters (this also
# drops empty tokens, as the substring test did).
tokenized_corpus2 = [
    [word for word in sentence if not all(ch in punctuation_chars for ch in word)]
    for sentence in tokenized_corpus
]

tokenized_corpus = tokenized_corpus2
print (tokenized_corpus[0:10])

In [None]:
# Remove Italian stop words from every sentence; sentence boundaries are kept
# (a sentence may end up empty).
stop_words = set(stopwords.words("italian"))

cleaned_corpus = []
for sentence in tokenized_corpus:
    kept_tokens = [token for token in sentence if token not in stop_words]
    cleaned_corpus.append(kept_tokens)

print(cleaned_corpus[0:4])

In [None]:
#cleaned_corpus = [[word for word in sentence if word.isalpha()] for sentence in cleaned_corpus]
#print (cleaned_corpus[0:4])

In [None]:
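# NOTE: NLTK's WordNetLemmatizer only handles English; for this Italian corpus
# use the spaCy-based cell below instead.
#lemmatizer = WordNetLemmatizer()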

#lemmatized_corpus = [[lemmatizer.lemmatize(word) for word in sentence] for sentence in cleaned_corpus]
#print (lemmatized_corpus[0:100])

In [None]:
# Load the Italian model
#nlp = spacy.load("it_core_news_sm")
#lemmatized_corpus = []

#for sentence in cleaned_corpus:
    # Process each sentence with spaCy NLP pipeline
#    doc = nlp(" ".join(sentence))
    # Extract lemmatized words
#    lemmatized_sentence = [token.lemma_ for token in doc]
#    lemmatized_corpus.append(lemmatized_sentence)

In [None]:
#print ("Original Corpus = ", corpus)
#print ("Tokenized Corpus = ", tokenized_corpus)
#print ("Cleaned Corpus = ", cleaned_corpus)
#print ("Lemmatized Corpus = ", lemmatized_corpus[0:99])

In [None]:
# Persist the pre-processed corpus: one sentence per line, tokens separated
# by a single space.
with open(r"/home/acarugat/MyRepo1/Testi/Manzoni/Corpus-PP.txt", "w", encoding="utf-8") as file:
    file.writelines(" ".join(tokens) + "\n" for tokens in cleaned_corpus)

In [None]:
# Baseline Word2Vec model: 100-dimensional vectors, context window of 5,
# words seen fewer than 5 times are ignored, 4 worker threads.
w2v_params = dict(vector_size=100, window=5, min_count=5, workers=4)
model = Word2Vec(sentences=cleaned_corpus, **w2v_params)

# Store the trained model on disk (relative to the working directory).
model.save("word2vec_model.model")

In [None]:
# Inspect the model through the word "lago": its stored count, its nearest
# neighbours, and its similarity to "fiume".
vectors = model.wv
print(vectors.get_vecattr("lago", "count"))
print(vectors.most_similar("lago"))
print(vectors.similarity("lago", "fiume"))

In [None]:
# Retrain from scratch with 100 epochs, keeping the CBOW architecture (sg=0).
print(cleaned_corpus[10:100])
model = Word2Vec(
    sentences=cleaned_corpus,
    vector_size=100,
    window=5,
    min_count=5,
    workers=4,
    epochs=100,
    sg=0,
)

In [None]:
# Same three checks as for the baseline model, now on the 100-epoch CBOW model.
probe = "lago"
print(model.wv.get_vecattr(probe, "count"))
print(model.wv.most_similar(probe))
print(model.wv.similarity(probe, "fiume"))

In [None]:
# Train the Word2Vec model with more epochs, this time with the skip-gram
# architecture (sg=1) instead of CBOW
model = Word2Vec(sentences=cleaned_corpus, vector_size=100, window=5, min_count=5, workers=4, epochs=100, sg=1)

In [None]:
# Same three checks again, now on the skip-gram model.
keyed_vectors = model.wv
lago_count = keyed_vectors.get_vecattr("lago", "count")
print(lago_count)
lago_neighbours = keyed_vectors.most_similar("lago")
print(lago_neighbours)
lago_fiume_similarity = keyed_vectors.similarity("lago", "fiume")
print(lago_fiume_similarity)

In [None]:
from itertools import chain

# Concatenate all sentences into one flat list of tokens.
flattened_corpus = []
for sentence_tokens in cleaned_corpus:
    flattened_corpus.extend(sentence_tokens)

# Distinct tokens of the cleaned corpus (set-derived, so in no particular order).
distinct_tokens = set(flattened_corpus)
parole_presenti = list(distinct_tokens)

In [None]:
# The trained model's vocabulary holds each retained word exactly once, so it
# can be used directly as the list of unique tokens.
tokens_unici = [*model.wv.index_to_key]

# Preview: entries 1 through 9 (entry 0 is not shown).
print(tokens_unici[1:10])
    

In [None]:
# Build a word -> frequency table from the counts stored in the model's
# vocabulary; each word is echoed as it is processed.
tabella = []
for parola in tokens_unici:
    print(parola)
    frequenza = model.wv.get_vecattr(parola, "count")
    riga = {"Parola": parola, "Frequenza": frequenza}
    tabella.append(riga)

df = pd.DataFrame(tabella)

In [None]:
# Frequency distribution over all tokens of the cleaned corpus, plotted for
# the 10 most frequent ones (plain counts, not cumulative).
top_n = 10
freq_dist = nltk.FreqDist(flattened_corpus)
freq_dist.plot(top_n, cumulative=False)


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Shared vectorizer instance; it is re-fitted on every call of
# calcola_similarita.
vectorizer = TfidfVectorizer()


def calcola_similarita(a, b):
    """Return the TF-IDF similarity between the two texts *a* and *b*.

    The module-level ``vectorizer`` is fitted on just these two documents, so
    vocabulary and IDF weights come from the pair alone. The result is the
    off-diagonal entry of the 2x2 product of the TF-IDF matrix with its
    transpose; with the vectorizer's default L2 row normalisation that is the
    cosine similarity of the two documents.
    """
    doc_vectors = vectorizer.fit_transform([a, b])
    pairwise = (doc_vectors * doc_vectors.T).toarray()
    return pairwise[0, 1]

# Compare the two texts (Fermo e Lucia and the 1840 Promessi Sposi) and
# print their TF-IDF similarity.
file1_path = r"/home/acarugat/MyRepo1/Testi/Manzoni/Fermo-e-Lucia.txt"
file2_path = r"/home/acarugat/MyRepo1/Testi/Manzoni/PromessiSposi-1840.txt"

with open(file1_path, mode='r', encoding='utf-8') as file:
    testo1 = file.read()

with open(file2_path, mode='r', encoding='utf-8') as file:
    testo2 = file.read()

s = calcola_similarita(testo1, testo2)
print(s)


0.9785557014149964


In [None]:
import os
import pandas as pd
import numpy as np

# Collect every .txt document of the chosen collection into a DataFrame and
# allocate the square matrix that will hold the pairwise similarity scores.

# Parallel lists: filenames[k] is the path of the text stored in contents[k].
filenames = []
contents = []

# Ask which sub-folder of the corpus root should be analysed.
print ("Dimmi che documenti vuoi analizzare: Manzoni Presidenti Giornali o Parlamento")
line = input()

# Files that must not take part in the comparison.
excluded_files = {
    "Corpus.txt",
    "Corpus-PP.txt",
    "PromessiSposi-stopword.txt",
    "MattarellaFine2024-preprocessato.txt",
}

for root, dirs, files in os.walk(r"/home/acarugat/MyRepo1/Testi/"+line):
    for file in files:
        if file in excluded_files:
            continue
        print (file)
        if not file.endswith(".txt"):  # only plain-text files are loaded
            continue
        filepath = os.path.join(root, file)
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                text = f.read()
        except (OSError, UnicodeError) as e:
            # Best effort: report the unreadable file and move on.
            print(f"Error reading {filepath}: {e}")
            continue
        # Record path and content together, and only after a successful read.
        # Appending the path before reading left the two lists with different
        # lengths whenever the read failed, which made the DataFrame
        # construction below raise.
        filenames.append(filepath)
        contents.append(text)

# Create the dataset (DataFrame): one row per document.
dataset = pd.DataFrame({
    "filename": filenames,
    "content": contents
})

# Display the dataset
# print(dataset)

# N x N matrix (N = number of documents) for the score of text i vs text j.
M = np.zeros((dataset.shape[0], dataset.shape[0]))


In [None]:
from tqdm import tqdm

# Fill M with the pairwise similarity scores.
# calcola_similarita(a, b) and calcola_similarita(b, a) fit the same
# two-document TF-IDF model, so the score is symmetric: compute each pair once
# (j >= i) and mirror it. This roughly halves the number of TF-IDF fits.
# Iterating by position (instead of using iterrows() labels as matrix indices)
# also keeps the indexing correct if the DataFrame index is not 0..N-1.
n_docs = dataset.shape[0]
texts = dataset["content"].tolist()

for i in tqdm(range(n_docs), total=n_docs, desc='1st level'):
    for j in range(i, n_docs):
        score = calcola_similarita(texts[i], texts[j])
        M[i, j] = score
        M[j, i] = score

In [None]:
print (M)

# Label each document with its file name, without directory and extension.
# Deriving the label from the path itself (instead of a fixed position after
# splitting on '/') keeps it correct for files found in nested sub-folders and
# for a corpus root at a different depth; splitext also preserves dots inside
# the file name. `os` is imported in the cell that builds `dataset`.
labels = dataset.filename.map(lambda path: os.path.splitext(os.path.basename(path))[0])
print (labels)

# Wrap the score matrix in a DataFrame labelled on both axes.
similarity_df = pd.DataFrame(M, columns=labels, index=labels)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Upper-triangle mask that would hide the upper half of the heatmap.
# It is computed but currently not applied (the plot is drawn with mask=None);
# pass mask=mask below to enable it.
mask = np.triu(np.ones_like(similarity_df))

# Draw the annotated heatmap of the document-by-document similarity scores.
plt.figure(figsize=(12, 12))

tick_labels = similarity_df.columns
heatmap_options = dict(
    square=True,
    annot=True,
    robust=True,
    fmt='.2f',
    annot_kws={'size': 7, 'fontweight': 'bold'},
    yticklabels=tick_labels,
    xticklabels=tick_labels,
    cmap="YlGnBu",
    mask=None,
)
sns.heatmap(similarity_df, **heatmap_options)

plt.title('Heatmap delle similarità tra testi', fontdict={'fontsize': 24})
plt.show()