In [None]:
import re
import nltk
import spacy
from jupyter_core.version import pattern
from nltk.corpus import stopwords
from collections import Counter
from spacy.lang.en.stop_words import contractions
import random
import pandas as pd
import language_tool_python
import pprint

In [None]:
# Location of the cleaned corpus read by this notebook
filename_all_data_dict = "./Files/cleaned_dataset.csv"

# Read the CSV with explicit column names. Because header=None, the file's
# first row is loaded as ordinary data (row label 0) and is dropped right away.
# NOTE(review): presumably that first row is the CSV's own header line — confirm.
cleaned_data_df = pd.read_csv(
    filename_all_data_dict,
    header=None,
    names=['file', 'text'],
).drop(index=0)
cleaned_data_df

In [None]:
# Pull the 'text' and 'file' columns out as two parallel Python lists
text_list_preprocessed, file_list = (
    cleaned_data_df[column].tolist() for column in ('text', 'file')
)

In [None]:
# Show the first loaded text as a quick sanity check
pprint.pprint(text_list_preprocessed[0])

In [None]:
def remove_use_case(text):
    """Strip every 'use case seu...' fragment from ``text``.

    A match starts at the literal 'use case seu' and runs to the end of that
    line (the dot does not match a newline here); it is replaced by an empty
    string. Matching is case-sensitive.

    Args:
    - text (str): The input text.

    Returns:
    - str: The text with all matches removed.
    """
    return re.sub(r'(use case seu).*', '', text)

# Run remove_use_case over every document, updating the existing list in place
text_list_preprocessed[:] = [remove_use_case(document) for document in text_list_preprocessed]


In [None]:
# Mapping contraction -> expanded form, filled from the text file below
contraction_dictionary = {}

# The parsing below handles lines of the form  "key": "value",
with open('italian_contractions.txt', 'r', encoding='utf-8') as file:
    for line in file:
        # Drop surrounding whitespace, then any trailing commas
        line = line.strip().rstrip(',')

        # A line without a colon carries no key/value pair
        if ':' not in line:
            continue

        # Split at the first colon only, then trim whitespace and the
        # surrounding double quotes from both halves
        key, value = (part.strip().strip('"') for part in line.split(':', 1))

        contraction_dictionary[key] = value

# Report how many entries were loaded, followed by the mapping itself
print(len(contraction_dictionary),contraction_dictionary)

In [None]:
# Function to replace contractions
def expand_contractions(text, contraction_dict):
    """Replace every contraction found in ``text`` with its expansion.

    Contractions are processed one at a time, in the dictionary's iteration
    order, and each is matched case-sensitively between word boundaries.

    Args:
    - text (str): The text to process.
    - contraction_dict (dict): Maps each contraction to its expansion.

    Returns:
    - str: The text with the contractions expanded.
    """
    for contraction, expansion in contraction_dict.items():
        # An empty key would match at every word boundary and insert the
        # expansion all over the text, so it is skipped.
        if not contraction:
            continue
        # A callable replacement makes re.sub insert the expansion verbatim.
        # Passed as a plain string, a backslash inside the expansion would be
        # treated as a template escape or raise re.error.
        text = re.sub(
            rf'\b{re.escape(contraction)}\b',
            lambda _match, replacement=expansion: replacement,
            text,
        )
    return text

# Expand the contractions of every document in text_list_preprocessed
expanded_texts = [
    expand_contractions(document, contraction_dictionary)
    for document in text_list_preprocessed
]

In [None]:
def normalize_whitespace(text):
    """
    Collapse each run of whitespace into one space and trim both ends.

    Args:
    - text (str): The input text, possibly with repeated whitespace.

    Returns:
    - str: The text with single spaces between words and no leading or
      trailing whitespace.
    """
    # Every run of one or more whitespace characters becomes a single space
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()

In [None]:
def remove_any_special_characters(text):
    """Delete every character outside the allowed set.

    Allowed characters are digits, ASCII letters, the accented vowels
    àèéìòù in either case, and whitespace. Everything else, including
    punctuation, apostrophes and underscores, is removed.

    Args:
    - text (str): The input text.

    Returns:
    - str: The text containing only allowed characters.
    """
    # The whitelist keeps Italian accented vowels, e.g. the ù in "più"
    return re.sub(r"[^0-9a-zA-ZàèéìòùÀÈÉÌÒÙ\s]", '', text)

# Strip special characters, then collapse whitespace, for every document (in place)
expanded_texts[:] = [
    normalize_whitespace(remove_any_special_characters(document))
    for document in expanded_texts
]

# Inspect the first cleaned document
pprint.pprint(expanded_texts[0])


In [None]:
import spacy
from spellchecker import SpellChecker
from tqdm import tqdm

# Load the spaCy Italian model (used below to tokenize each text)
nlp = spacy.load('it_core_news_sm')

# Initialize one spell checker per language: a word is later treated as
# misspelled only when both the Italian and the English checker reject it
spell_it = SpellChecker(language='it')
spell_en = SpellChecker(language='en')
# Register 'panthera' with the English checker so it is treated as a known word
spell_en.word_frequency.add('panthera')

# Per-document results, keyed by the document's position in expanded_texts:
#   correction_dict[i][word]  -> value returned by spell_it.correction(word)
#   suggestions_dict[i][word] -> value returned by spell_it.candidates(word)
correction_dict = {}
suggestions_dict = {}

# Function to check whether a word is rejected by both spell checkers
def is_misspelled(word):
    """Return True when ``word`` is unknown to both the Italian and the
    English spell checker.

    NOTE(review): the membership test compares ``word`` with whatever
    ``unknown()`` returns; if the library lower-cases its input, capitalised
    tokens can never be reported as misspelled — confirm against the
    pyspellchecker documentation.
    """
    # Known in Italian: no need to consult the English checker
    if word not in spell_it.unknown([word]):
        return False
    return word in spell_en.unknown([word])

# Spell-check every document and record, per document index, the proposed
# correction and the candidate set for each misspelled token
for i, text in enumerate(tqdm(expanded_texts, desc="Checking text")):
    # Tokenize with spaCy and keep only the surface form of each token
    tokens = [token.text for token in nlp(text)]

    # Keep the tokens rejected by both the Italian and the English checker
    flagged_words = [token for token in tokens if is_misspelled(token)]

    # Fresh nested dictionaries for this document
    correction_dict[i] = {}
    suggestions_dict[i] = {}

    for flagged_word in flagged_words:
        # Most likely correction according to the Italian dictionary
        correction_dict[i][flagged_word] = spell_it.correction(flagged_word)

        # All candidates proposed by the Italian dictionary
        suggestions_dict[i][flagged_word] = spell_it.candidates(flagged_word)
    


In [None]:
def expand_suggestions(text, dictionary):
    """Replace misspelled words in ``text`` with their unambiguous suggestion.

    ``dictionary`` maps an arbitrary key to a sub-dictionary
    ``{word: suggestions}``; the sub-dictionaries of all keys are applied to
    ``text``. A word is replaced only when it is longer than three characters
    and its suggestions collection holds exactly one entry. Replacement is
    done on whole words only, so a misspelling is never rewritten inside a
    longer word.

    Args:
    - text (str): The text to correct.
    - dictionary (dict): Nested mapping ``{key: {word: suggestions}}`` where
      ``suggestions`` is a collection of candidate words or None.

    Returns:
    - str: The text with the selected words replaced.
    """
    # Collect word -> replacement once so the text is rewritten in one pass
    replacements = {}
    for suggestions in dictionary.values():
        for word, suggestion in suggestions.items():
            if suggestion is None or len(suggestion) != 1 or len(word) <= 3:
                continue
            # Take the single candidate itself; going through str(set) and
            # stripping braces/quotes corrupts candidates with an apostrophe
            sugg = str(next(iter(suggestion)))
            if sugg != word:
                # The first suggestion seen for a word wins
                replacements.setdefault(word, sugg)

    if not replacements:
        return text

    # Longest words first so the alternation prefers the longest match; the
    # look-arounds require a non-word character (or the text edge) on both sides
    alternation = '|'.join(
        re.escape(word) for word in sorted(replacements, key=len, reverse=True)
    )
    whole_word = re.compile(rf'(?<!\w)(?:{alternation})(?!\w)')

    # A callable replacement inserts the suggestion verbatim
    return whole_word.sub(lambda match: replacements[match.group(0)], text)

# Run the suggestion expansion on every preprocessed text; each call receives
# the complete suggestions_dict (the entries of all documents)
cleaned_expanded_texts = [expand_suggestions(text, suggestions_dict) for text in expanded_texts]
# Inspect the first corrected document
pprint.pprint(cleaned_expanded_texts[0])

In [None]:
def modify_apostrophe(text):
    """Replace every straight apostrophe (') in ``text`` with the
    typographic one (’).

    Args:
    - text (str): The input text.

    Returns:
    - str: The text with all straight apostrophes converted.
    """
    return text.replace("'", '’')

In [None]:
# Function to remove punctuation (the text is not lowercased here)
def remove_punctuation(text):
    """Replace every ASCII punctuation character in ``text`` with a space.

    The characters affected are exactly those in ``string.punctuation``.
    Other symbols, such as the typographic apostrophe (’), are left alone.

    Args:
    - text (str): The input text.

    Returns:
    - str: The text with each punctuation character turned into a space.
    """
    import string
    # Escape the punctuation before placing it in a character class:
    # unescaped, the sequence backslash + ']' is read as an escaped ']' and
    # the backslash itself is never matched.
    regex = '[' + re.escape(string.punctuation) + ']'
    return re.sub(regex, ' ', text)

# For each corrected text: convert straight apostrophes to typographic ones,
# replace punctuation with spaces, then collapse the whitespace
cleaned_text_list_to_save = [
    normalize_whitespace(remove_punctuation(modify_apostrophe(document)))
    for document in cleaned_expanded_texts
]

In [None]:
# Function to replace contractions
# NOTE: this re-defines expand_contractions, which an earlier cell of this
# notebook already defines; the later definition is the one in effect from
# this cell on.
def expand_contractions(text, contraction_dict):
    """Replace every contraction found in ``text`` with its expansion.

    Contractions are processed one at a time, in the dictionary's iteration
    order, and each is matched case-sensitively between word boundaries.

    Args:
    - text (str): The text to process.
    - contraction_dict (dict): Maps each contraction to its expansion.

    Returns:
    - str: The text with the contractions expanded.
    """
    for contraction, expansion in contraction_dict.items():
        # An empty key would match at every word boundary and insert the
        # expansion all over the text, so it is skipped.
        if not contraction:
            continue
        # A callable replacement makes re.sub insert the expansion verbatim.
        # Passed as a plain string, a backslash inside the expansion would be
        # treated as a template escape or raise re.error.
        text = re.sub(
            rf'\b{re.escape(contraction)}\b',
            lambda _match, replacement=expansion: replacement,
            text,
        )
    return text

# Second contraction pass, this time over the punctuation-free texts
cleaned_text_list_to_save = [
    expand_contractions(document, contraction_dictionary)
    for document in cleaned_text_list_to_save
]
# Inspect the first resulting document
pprint.pprint(cleaned_text_list_to_save[0])

In [None]:
# Helper for spotting contractions that are still present in a text, so they
# can be resolved before special characters are removed
def find_words_with_apostrophe(text):
    """
    Return every word of ``text`` that contains an apostrophe.

    Both the typographic (’) and the straight (') apostrophe are recognised.
    A match must begin and end at a word boundary.

    Parameters:
    text (str): The input text to scan.

    Returns:
    list: The matching words, in order of appearance (duplicates included).
    """
    return re.findall(r"\b\w*’\w*\b|\b\w*'\w*\b", text)

# Gather the apostrophe words of every document into one flat list
words_with_apostr = []
for text in cleaned_text_list_to_save:
    words_with_apostr += find_words_with_apostrophe(text)

# Deduplicate, then show what is left
unique_words_with_apostr = set(words_with_apostr)
print(unique_words_with_apostr)

In [None]:
# Number of distinct words that still contain an apostrophe
print(len(unique_words_with_apostr))
# Pretty-print the distinct words themselves
pprint.pprint(unique_words_with_apostr)

In [None]:
# Build the final table in a single step from the two parallel lists
# (file_list and cleaned_text_list_to_save both come from the same rows of
# cleaned_data_df). Constructing the frame once avoids growing it row by row
# with df.loc[i] = ..., which gets slower with every row added.
df = pd.DataFrame({'file': file_list, 'text': cleaned_text_list_to_save})

# Save the DataFrame to a CSV file (without the index column)
df.to_csv('./Files/final_dataset.csv', index=False)

# Output the DataFrame to verify
print(df)