#  PREPARAZIONE DEL DATASET

**IMPORT**

In [8]:
import os, time, re
import pandas as pd
from dotenv import load_dotenv
from elasticsearch import Elasticsearch, helpers # type: ignore

In [9]:
def safe_filename(name):
    """Return ``name`` with characters that are invalid in file names removed.

    Backslash, slash, asterisk, question mark, colon, double quote, angle
    brackets and the pipe character are deleted; every run of whitespace is
    then collapsed into a single space and the result is trimmed at both ends.
    """
    # Applied in order: first drop the forbidden characters, then squeeze
    # whitespace (dropping characters can leave adjacent spaces behind).
    substitutions = (
        (r'[\\/*?:"<>|]', ""),
        (r"\s+", " "),
    )
    cleaned = name
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

In [10]:
# Load the dataset, draw a reproducible sample (fixed seed, no replacement)
# and keep only the two columns used below.
df = (
    pd.read_csv("Dataset/Movies.csv")
    .sample(n=2976, random_state=42, replace=False)[['Title', 'Plot']]
    .copy()
    .reset_index(drop=True)
)

# Make sure the output folder for the .txt files exists
directory = "Files"
os.makedirs(directory, exist_ok=True)

# Write one text file per movie: the file name comes from the sanitized
# Title, the file content is the Plot.
for rawTitle, moviePlot in zip(df["Title"], df["Plot"]):
    movieTitle = safe_filename(rawTitle)
    filePath = os.path.join(directory, movieTitle + ".txt")

    with open(filePath, "w", encoding="utf-8") as f:
        f.write(str(moviePlot))

# ELASTICSEARCH

In [11]:
# Load environment variables from the .env file
load_dotenv(dotenv_path="EnvAndDocker/.env")

# Connection settings taken from the environment (None when a variable is unset)
URL = os.environ.get("URL")
PASSWORD = os.environ.get("PASSWORD")

# Elasticsearch client, authenticated as the "elastic" user
es = Elasticsearch(URL, basic_auth=("elastic", PASSWORD))

# Index mapping: "title" and "content" are both text fields that use the
# "english" analyzer at index time and at search time.
MAPPING = {
    "mappings": {
        "properties": {
            field_name: {
                "type": "text",
                "analyzer": "english",
                "search_analyzer": "english",
            }
            for field_name in ("title", "content")
        }
    }
}


def index(index_name="moviesindex", directory="Files"):
    """Recreate an Elasticsearch index and bulk-load the .txt files of a folder.

    If an index called ``index_name`` already exists it is deleted; a new one
    is then created with ``MAPPING``. Every file in ``directory`` whose name
    ends with ``.txt`` becomes one document: ``title`` is the file name
    without its extension and ``content`` is the file text read as UTF-8.

    Args:
        index_name: Name of the index to (re)create. Defaults to
            ``"moviesindex"``.
        directory: Folder scanned for ``.txt`` files. Defaults to ``"Files"``.

    Returns:
        The seconds spent in the bulk request, or ``None`` when the folder
        contains no ``.txt`` file (the index has still been recreated, empty).
    """

    # Drop the index if it already exists so each run starts from scratch
    if es.indices.exists(index=index_name):
        es.indices.delete(index=index_name)
        print(f"Indice esistente '{index_name}' eliminato.")

    # Create the new index with the mapping defined at module level
    es.indices.create(index=index_name, body=MAPPING)
    print(f"Indice '{index_name}' creato correttamente.")

    # Build the list of bulk actions, one per .txt file.
    # os.listdir returns entries in arbitrary order; sorting keeps the
    # indexing order reproducible across runs and machines.
    actions = []
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".txt"):
            continue
        path = os.path.join(directory, filename)
        with open(path, "r", encoding="utf-8") as f:
            content = f.read()

        actions.append({
            "_index": index_name,
            "_source": {
                "title": os.path.splitext(filename)[0],  # file name without extension
                "content": content
            }
        })

    if not actions:
        print("Nessun file .txt trovato nella cartella.")
        return None

    # Bulk indexing; only the bulk call itself is timed
    start = time.perf_counter()
    success, _ = helpers.bulk(es.options(request_timeout=120), actions)
    elapsed = time.perf_counter() - start

    print(f"Documenti indicizzati correttamente: {success}")
    print(f"Tempo totale: {elapsed:.2f} secondi")

    return elapsed


# Build the index only when this code is executed directly (notebook cell or
# script), not when it is imported as a module.
if __name__ == "__main__":
    index()


Indice 'moviesindex' creato correttamente.
Documenti indicizzati correttamente: 2976
Tempo totale: 3.87 secondi


**Interrogazione dell’indice Elasticsearch**


In [12]:
def search(query):
    """Run ``query`` against the "moviesindex" index and print the hits.

    ``query`` is passed as the request body. After a header line, each hit is
    printed as ``<position>:<title>(<score>)``, with positions starting at 0.
    Nothing is returned.
    """
    response = es.search(index="moviesindex", body=query)
    print('Query results:')
    for position, hit in enumerate(response["hits"]["hits"]):
        title = hit["_source"]["title"]
        score = hit["_score"]
        print(f'{position}:{title}({score})')

In [13]:
# Ask the user for a query written as <field>:<text>
query = input('Inserisci la tua ricerca (es. title: Inception oppure content: "sogno dentro un sogno"): ').strip()

if ":" in query:
    # Split on the first colon only, so the search text may itself contain colons
    field, text = map(str.strip, query.split(":", 1))

    if field in ("title", "content"):
        # Text wrapped in double quotes asks for a phrase search; the quotes
        # themselves are not part of the searched text.
        is_phrase = text.startswith('"') and text.endswith('"')
        if is_phrase:
            text = text.strip('"')

        query_body = {
            "query": {
                ("match_phrase" if is_phrase else "match"): {field: text}
            }
        }

        search(query_body)
    else:
        print("Campo non valido. Usa 'title' o 'content'.")
else:
    print("Formato non valido. Usa: title:<termine> oppure content:<termine>")


Query results:
0:Paprika(5.6171293)
1:Banshi(5.1042423)
2:Days of Our Own(5.03267)
3:The Tramp(4.9752154)
4:A Nightmare on Elm Street 3 Dream Warriors(4.9207525)
5:Forever(4.9190583)
6:Carefree(4.909159)
7:Naattiya Rani(4.8041472)
8:Usha Parinayam(4.8009953)
9:The 5,000 Fingers of Dr. T(4.734557)
