## Calcolo Feature Semantiche

In [20]:
import pandas as pd
import numpy as np
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from nltk.corpus import opinion_lexicon
from nltk.tokenize import word_tokenize
import string
import pandas as pd
import numpy as np
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from nltk.corpus import opinion_lexicon
from nltk.tokenize import word_tokenize
import string
import spacy
from textblob import TextBlob
import os

# Initial setup: load the spaCy pipeline and download the NLTK resources
# (nltk.download reports "already up-to-date" when a package is present).
# `nlp` is the module-level spaCy pipeline used by pos_ratio_it.
nlp = spacy.load("it_core_news_sm")
# 'punkt' / 'punkt_tab' are downloaded for word_tokenize; 'opinion_lexicon'
# is the word list read by extract_features.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('opinion_lexicon')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\giand\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\giand\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package opinion_lexicon to
[nltk_data]     C:\Users\giand\AppData\Roaming\nltk_data...
[nltk_data]   Package opinion_lexicon is already up-to-date!


True

In [22]:
# === Linguistic feature extraction functions ===
# Lazily-built set of opinion-lexicon words; filled on first use by
# _subjective_lexicon() so the corpus is read once, not once per document.
_SUBJECTIVE_LEXICON = None


def _subjective_lexicon():
    """Return the positive + negative opinion-lexicon words as a cached frozenset."""
    global _SUBJECTIVE_LEXICON
    if _SUBJECTIVE_LEXICON is None:
        _SUBJECTIVE_LEXICON = frozenset(opinion_lexicon.positive()) | frozenset(
            opinion_lexicon.negative()
        )
    return _SUBJECTIVE_LEXICON


def _is_punctuation(token):
    """Return True when every character of ``token`` is in ``string.punctuation``."""
    # A plain ``token in string.punctuation`` is a *substring* test, which lets
    # multi-character tokens such as "..." or "``" slip through as words.
    return all(ch in string.punctuation for ch in token)


def extract_features(text):
    """Compute basic lexical statistics for one document.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    pandas.Series
        ``Num_Words``        number of non-punctuation tokens (lower-cased text),
        ``Avg_Word_Len``     mean length of those tokens (0 when there are none),
        ``Unique_Words``     number of distinct non-punctuation tokens,
        ``All_Caps_Count``   whitespace-separated words of the original text
                             for which ``str.isupper()`` is true,
        ``Subjective_Words`` tokens found in NLTK's opinion lexicon.
    """
    tokens = word_tokenize(text.lower())
    words = [tok for tok in tokens if not _is_punctuation(tok)]

    avg_word_len = np.mean([len(word) for word in words]) if words else 0
    # Case matters here, so this works on the original text, not the tokens.
    all_caps = sum(1 for word in text.split() if word.isupper())

    # NOTE(review): NLTK's opinion_lexicon appears to be an English word list,
    # so on Italian text this count will rarely be non-zero -- confirm intended.
    lexicon = _subjective_lexicon()
    subj_count = sum(1 for word in words if word in lexicon)

    return pd.Series({
        "Num_Words": len(words),
        "Avg_Word_Len": avg_word_len,
        "Unique_Words": len(set(words)),
        "All_Caps_Count": all_caps,
        "Subjective_Words": subj_count
    })

def punctuation_features(text):
    """Count selected punctuation marks in ``text``.

    Returns a pandas.Series with the number of '!' (``Exclam_Count``),
    '?' (``Question_Count``) and ',' (``Comma_Count``) characters.
    """
    tracked_marks = (
        ("Exclam_Count", '!'),
        ("Question_Count", '?'),
        ("Comma_Count", ','),
    )
    tallies = {label: text.count(mark) for label, mark in tracked_marks}
    return pd.Series(tallies)

def pos_ratio_it(text):
    """Return the share of nouns, verbs and adjectives in ``text``.

    The text is parsed with the module-level ``nlp`` pipeline. Each ratio is
    the number of tokens tagged NOUN / VERB / ADJ divided by the number of
    tokens that are neither punctuation nor whitespace; all ratios are 0
    when there is no such token.
    """
    tag_counts = {"NOUN": 0, "VERB": 0, "ADJ": 0}
    word_total = 0

    # Single pass over the parsed document.
    for tok in nlp(text):
        if not (tok.is_punct or tok.is_space):
            word_total += 1
        if tok.pos_ in tag_counts:
            tag_counts[tok.pos_] += 1

    def share(tag):
        # Guard against division by zero on empty / punctuation-only text.
        if not word_total:
            return 0
        return tag_counts[tag] / word_total

    return pd.Series({
        "Rapporto_Nomi": share("NOUN"),
        "Rapporto_Verbi": share("VERB"),
        "Rapporto_Aggettivi": share("ADJ")
    })

def sentiment_features(text):
    """Return TextBlob's polarity and subjectivity scores for ``text``.

    The result is a pandas.Series with the keys ``Polarity`` and
    ``Subjectivity``.
    """
    # NOTE(review): TextBlob is used with its default analyzer, which appears
    # to be English-oriented -- confirm the scores are meaningful for Italian.
    scores = TextBlob(text).sentiment
    result = {
        "Polarity": scores.polarity,
        "Subjectivity": scores.subjectivity,
    }
    return pd.Series(result)

# === Main function to process one dataset ===
def process_dataset(input_path, output_path, max_features=1000, k=20):
    """Build the feature table for one CSV dataset and save it.

    The input is a ';'-separated UTF-8 CSV that must contain the columns
    ``ID``, ``Titolo``, ``Contenuto``, ``Label`` and ``Predicted``. The output
    CSV (same separator/encoding) holds those columns followed by the selected
    TF-IDF terms and the lexical, punctuation, POS and sentiment features
    computed from ``Contenuto``.

    Parameters
    ----------
    input_path : str
        Path of the CSV to read.
    output_path : str
        Path of the CSV to write; its parent directory is created if missing.
    max_features : int, default 1000
        Vocabulary size cap passed to ``TfidfVectorizer``.
    k : int, default 20
        Number of TF-IDF terms kept by the chi-squared selection (clamped to
        the actual vocabulary size).

    Notes
    -----
    The vectorizer and the selector are fitted on *this* file only, so the
    ``TFIDF_*`` columns differ from one dataset to another, and the selection
    uses the ``Label`` column of the very file being processed.
    NOTE(review): if comparable columns across datasets (or a leakage-free
    test set) are required, fit both objects once on shared training data.
    """
    print(f"📥 Caricamento file: {input_path}")
    df = pd.read_csv(input_path, sep=';', encoding='utf-8')

    # Missing reviews would make TfidfVectorizer and the tokenizers raise;
    # treat them as empty documents. The original column is saved untouched.
    texts = df['Contenuto'].fillna('').astype(str)

    # TF-IDF (vocabulary fitted on this dataset)
    tfidf_vectorizer = TfidfVectorizer(max_features=max_features)
    tfidf_matrix = tfidf_vectorizer.fit_transform(texts)

    # Never ask for more terms than the vocabulary actually contains.
    n_selected = min(k, tfidf_matrix.shape[1])
    selector = SelectKBest(chi2, k=n_selected)
    tfidf_selected = selector.fit_transform(tfidf_matrix, df['Label'])

    vocabulary = tfidf_vectorizer.get_feature_names_out()
    selected_columns = [f"TFIDF_{vocabulary[i]}" for i in selector.get_support(indices=True)]
    # Reuse df's index so the concat below aligns row by row.
    selected_tfidf_df = pd.DataFrame(tfidf_selected.toarray(),
                                     columns=selected_columns,
                                     index=df.index)

    # Linguistic feature extraction
    features_basic = texts.apply(extract_features)
    features_punct = texts.apply(punctuation_features)
    features_pos = texts.apply(pos_ratio_it)
    features_sent = texts.apply(sentiment_features)

    # Join all feature groups side by side
    final_df = pd.concat([
        df[['ID', 'Titolo', 'Contenuto', 'Label', 'Predicted']],
        selected_tfidf_df,
        features_basic,
        features_punct,
        features_pos,
        features_sent
    ], axis=1)

    # Save the file (create the target folder on a fresh checkout)
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    final_df.to_csv(output_path, sep=";", index=False, encoding="utf-8")
    print(f"✅ File salvato: {output_path}")

# === Run on the two datasets ===
# Source CSV files, one per dataset.
input_paths = [
    "Computer_Generate/test set contenente fake (generate) e real.csv",
    "Human_Written/test set contenente fake (reali) e real.csv"
]

# Destination CSV files, in the same order as input_paths.
# NOTE(review): the second name says "(generate)" although its input is the
# "(reali)" file -- possibly a copy-paste slip; a later cell reads this exact
# name, so rename both together if it is changed.
output_paths = [
    "Test_GPT/Computer_test set contenente fake (generate) e real.csv",
    "Test_GPT/Human_test set contenente fake (generate) e real.csv"
]

# Process each (input, output) pair in turn.
for source_csv, target_csv in zip(input_paths, output_paths):
    process_dataset(source_csv, target_csv)

📥 Caricamento file: Computer_Generate/test set contenente fake (generate) e real.csv
✅ File salvato: Test_GPT/Computer_test set contenente fake (generate) e real.csv
📥 Caricamento file: Human_Written/test set contenente fake (reali) e real.csv
✅ File salvato: Test_GPT/Human_test set contenente fake (generate) e real.csv


In [23]:
# Reload the feature file written for the human-written dataset to inspect it.
pd.read_csv("Test_GPT/Human_test set contenente fake (generate) e real.csv", sep=";", encoding="utf-8")

Unnamed: 0,ID,Titolo,Contenuto,Label,Predicted,TFIDF_antenna,TFIDF_benissimo,TFIDF_casa,TFIDF_cassa,TFIDF_colore,...,All_Caps_Count,Subjective_Words,Exclam_Count,Question_Count,Comma_Count,Rapporto_Nomi,Rapporto_Verbi,Rapporto_Aggettivi,Polarity,Subjectivity
0,405,NOn compatibile con Fiat Punto Evo,la mia nuva kuga aveva come abbaglianti delle ...,0,0,0.0,0.0,0.000000,0.0,0.0,...,0.0,2.0,0,0,0,0.164179,0.164179,0.104478,0.000000,0.000000
1,5,Top di gamma,Decisamente il miglior robot aspirapolvere ma...,1,1,0.0,0.0,0.048162,0.0,0.0,...,1.0,3.0,0,0,0,0.231707,0.142276,0.081301,0.309524,0.595238
2,175,dopo 3/4 mesi non caricano più,ho acquistato per ben due volte queste cuffie ...,0,0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0,0,0,0.235294,0.105882,0.035294,-0.062500,0.237500
3,701,Strumento di installazione non incluso,ho scelto questo articolo perché la descrizion...,0,0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0,0,0,0.253521,0.169014,0.042254,-0.125000,0.375000
4,234,Riconsiderazione su Cuffie Bluetooth 5.1 Bassi...,salve.inizialmente sono andate molto bene. si ...,0,0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0,0,0,0.117647,0.196078,0.039216,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,274,Auricolare ottimo microfono mediocre,l'audio è molto buono cosi come l'isolamento d...,0,0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0,0,0,0.289474,0.105263,0.078947,0.000000,0.000000
96,511,Pessima confezione,confezione davvero scadente. daccordo che si t...,0,0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0,0,0,0.219512,0.146341,0.121951,0.000000,0.000000
97,11,Consigliato come regalo per i più piccoli,Semplice intuitivo e bello Un idea originale p...,1,1,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0,0,0,0.227723,0.099010,0.128713,-0.125000,0.375000
98,124,Pellicola specchio!!!,Ottima pellicola protettiva per proteggere lo ...,1,1,0.0,0.0,0.000000,0.0,0.0,...,1.0,0.0,0,0,0,0.181818,0.200000,0.054545,0.000000,0.000000


In [16]:
# Reload the feature file written for the computer-generated dataset to inspect it.
# NOTE(review): this cell's execution count is lower than the processing cell's,
# so its saved output may be stale -- re-run it after regenerating the file.
pd.read_csv("Test_GPT/Computer_test set contenente fake (generate) e real.csv", sep=";", encoding="utf-8")

Unnamed: 0,ID,Titolo,Contenuto,Label,Predicted,TFIDF_acquistato,TFIDF_al,TFIDF_batteria,TFIDF_bluetooth,TFIDF_casa,...,TFIDF_suo,TFIDF_tutti,TFIDF_usare,TFIDF_usb,TFIDF_utile,Num_Words,Avg_Word_Len,Unique_Words,All_Caps_Count,Subjective_Words
0,5,Secondo me ottima,Consegna come al solito impeccabile. La cover ...,1,1,0.000000,0.389464,0.000000,0.000000,0.000000,...,0.0,0.0,0.000000,0.000000,0.000000,59.0,5.627119,48.0,0.0,0.0
1,113,Difetti strutturali,Difetti strutturali e la casa in cui una scato...,1,1,0.178761,0.000000,0.268034,0.185231,0.172938,...,0.0,0.0,0.136926,0.000000,0.000000,57.0,4.894737,48.0,1.0,1.0
2,51,Durano una settimana,tenute in tasca come quelle originali sono di...,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.000000,0.000000,0.000000,35.0,4.857143,31.0,1.0,0.0
3,91,Dopo neanche un mese di utilizzo i due auricol...,ho comprato gli auricolari per fare un gradito...,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.000000,0.000000,0.000000,105.0,5.400000,73.0,2.0,0.0
4,99,Male male,molto male sono deluso e non mi capita spesso...,0,0,0.000000,0.000000,0.000000,0.248000,0.000000,...,0.0,0.0,0.000000,0.000000,0.000000,62.0,5.112903,52.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145,73,Strumento di installazione non incluso,ho scelto questo articolo perchÃ© la descrizio...,0,0,0.000000,0.214303,0.000000,0.000000,0.000000,...,0.0,0.0,0.000000,0.000000,0.000000,72.0,5.138889,60.0,2.0,0.0
146,9,Non ce ne sono molte cosÃ¬ buone!,Sorprendentemente buona webcam per il prezzo a...,1,1,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.000000,0.000000,0.000000,52.0,5.692308,48.0,0.0,0.0
147,54,Sempre rotti.,il prodotto non Ã¨ male ma dopo 2 settimane no...,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.000000,0.000000,0.000000,36.0,4.722222,30.0,2.0,0.0
148,23,Utile e stabile,Utilissimo tre piedi per mantenere cellulare e...,1,1,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.000000,0.000000,0.201763,57.0,5.000000,45.0,1.0,0.0
