In [1]:
import os
import string
import textwrap

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from PIL import Image

In [2]:
# Relative folders: raw inputs are read from RAW_PATH, cleaned outputs are written to CLEANED_PATH.
RAW_PATH = "../data/raw/"
CLEANED_PATH = "../data/cleaned/"

# Load the captions file into the working DataFrame.
captions_file = os.path.join(RAW_PATH, 'captions.txt')
df = pd.read_csv(captions_file)


## Dataset Format


In [None]:
# Print a summary of the loaded frame (columns, non-null counts, dtypes).
df.info()

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

In [None]:
# Descriptive statistics for the columns of the loaded frame.
df.describe()

## Handling nulls and duplicates

In [6]:
# Drop every row that has a missing value in any column (mutates df in place).
df.dropna(inplace=True)

In [7]:
# Drop rows that exactly duplicate an earlier row (mutates df in place).
df.drop_duplicates(inplace=True)

## Caption cleaning

In [8]:
def clean_caption(text):
    """Normalise a caption: lowercase it and keep only purely alphabetic words.

    Every character in ``string.punctuation`` is replaced by a space (so
    punctuation also acts as a word separator), the text is lowercased, and
    any whitespace-delimited token that is not fully alphabetic (for example
    one containing a digit) is discarded.

    Args:
        text: Caption string to clean.

    Returns:
        The surviving words joined by single spaces; an empty string when no
        word survives.
    """
    # Translation table mapping each punctuation character (quotes included) to a space.
    punctuation_to_space = {ord(symbol): " " for symbol in string.punctuation}
    lowered = text.translate(punctuation_to_space).lower()

    # split() with no argument also swallows repeated / leading / trailing whitespace.
    kept_words = []
    for token in lowered.split():
        if token.isalpha():
            kept_words.append(token)
    return " ".join(kept_words)

In [None]:
# 2. Apply the cleaning function to every raw caption
df['caption_clean'] = df['caption'].apply(clean_caption)

# 3. Link captions to images (build the full path of each image file)
# Check whether your folder is named 'data' or 'flickr8k_dataset'
image_path = os.path.join(RAW_PATH, "Images")
df['image_path'] = df['image'].map(lambda file_name: os.path.join(image_path, file_name))

# 4. Clean the data (drop nulls and duplicates, as requested).
# Both calls mutate df in place.
df.dropna(subset=['caption_clean'], inplace=True)
df.drop_duplicates(inplace=True)

print("Estrutura atualizada com sucesso!")
preview_columns = ['image', 'caption_clean', 'image_path']
print(df[preview_columns].head())

## Stats


In [None]:
# Show the column labels currently present in df.
df.columns

In [None]:

def plot_stat_hist(data, ax, title, color):
    """Draw a histogram of ``data`` on ``ax`` with mean and +/- 1 std markers.

    Args:
        data: Values to plot; must provide ``mean()`` and ``std()``.
        ax: Matplotlib axes that receives the plot.
        title: First line of the axes title; the mean and standard deviation
            are appended on a second line.
        color: Colour passed to the histogram.
    """
    center = data.mean()
    spread = data.std()

    sns.histplot(data, ax=ax, bins=30, kde=True, alpha=0.6, color=color)

    # Dashed line at the mean, thinner dotted lines one std on either side.
    ax.axvline(center, lw=2, linestyle='--', color='black')
    for edge in (center + spread, center - spread):
        ax.axvline(edge, lw=1, linestyle=':', color='black')

    # Put the numeric values in the title and clear the x label.
    header = f"{title}\n mean={center:.2f} | std={spread:.2f}"
    ax.set_title(header, fontsize=12, pad=10)
    ax.set_xlabel("")

# --- Preparing the data ---
# Per-caption word counts and character lengths, before and after cleaning.
df_stats = {
    "words_old": df['caption'].str.split().str.len(),
    "words_new": df['caption_clean'].str.split().str.len(),
    "len_old": df['caption'].str.len(),
    "len_new": df['caption_clean'].str.len()
}

# 2x2 grid: top row = original captions, bottom row = cleaned captions;
# left column = word counts, right column = character lengths.
fig, axes = plt.subplots(2, 2, figsize=(14, 10), sharey='row')
fig.suptitle("Análise Comparativa: Antes vs Depois do Cleaning", fontsize=16, fontweight='bold')

# Original captions
plot_stat_hist(df_stats["words_old"], axes[0, 0], "Qtd words (Original)", "gray")
plot_stat_hist(df_stats["len_old"], axes[0, 1], "Len String (Original)", "gray")

# Cleaned captions
plot_stat_hist(df_stats["words_new"], axes[1, 0], "Qtd words (Clean)", "blue")
plot_stat_hist(df_stats["len_new"], axes[1, 1], "Len String (Clean)", "salmon")

# plot_stat_hist clears the x labels, so the shared labels are set afterwards.
axes[1, 0].set_xlabel("Number of Words")
axes[1, 1].set_xlabel("Characters Count")
axes[0, 0].set_ylabel("Samples")
axes[1, 0].set_ylabel("Samples")

# Reserve the top 5% of the figure for the suptitle.
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()

## Filtering out captions that are too short

In [None]:
# Range of word counts among the cleaned captions (informational only).
words_min = df_stats['words_new'].min()
words_max = df_stats['words_new'].max()

print(f"Number of words : [{words_min},{words_max}]")

# Captions with size 1 are "A", i.e. noise. The threshold is fixed at that
# documented size instead of being taken from the observed minimum, so the
# filter keeps the same meaning whatever the shortest caption in the data is
# (and does not creep upwards if the stats are recomputed after filtering).
threshold_size_caption = 1

# Preview the captions that sit exactly at the threshold.
df[df_stats['words_new'] == threshold_size_caption].head()


In [13]:
# Keep only the captions whose cleaned text has more than `threshold_size_caption` words.
# The word counts are recomputed from the current df (instead of reusing the
# earlier df_stats snapshot) so the boolean mask always lines up with df's
# rows, even when this cell is re-run after df has already been filtered.
df = df[df['caption_clean'].str.split().str.len() > threshold_size_caption]

## Most frequent words

In [None]:
# Count how often each word occurs across all cleaned captions.
words_df = df['caption_clean'].str.split()
words_and_instances_df = words_df.explode().value_counts()
# value_counts() sorts by descending count, so head(20) is the 20 most frequent words.
top_20 = words_and_instances_df.head(20)

plt.figure(figsize=(10, 6))
sns.barplot(x=top_20.values, y=top_20.index, palette='viridis')

plt.title('Top 20 most frequent words')
plt.xlabel('Frequency')
plt.ylabel('Words')
plt.show()

In [None]:
# words_and_instances_df has one entry per distinct word, so its length is the
# vocabulary size; the three counters below are the words seen exactly 1, 2 and 3 times.
number_of_single_instances_words = int((words_and_instances_df == 1).sum())
number_of_double_instances_words = int((words_and_instances_df == 2).sum())
number_of_triple_instances_words = int((words_and_instances_df == 3).sum())

# Build one newline-separated string: mixing implicit concatenation with
# comma-separated print arguments inserted a stray space before some line breaks.
print(f"TOTAL NUMBER OF WORDS: {len(words_and_instances_df)}"
      f"\nWORDS WITH A SINGLE INSTANCE: {number_of_single_instances_words}"
      f"\nWORDS WITH DOUBLE INSTANCES: {number_of_double_instances_words}"
      f"\nWORDS WITH TRIPLE INSTANCES: {number_of_triple_instances_words}")


## Test


In [None]:
# Pick up to 3 random rows. The fixed seed makes the preview reproducible;
# set SAMPLE_SEED to None to draw different images on every run.
SAMPLE_SEED = 42
samples = df.sample(min(3, len(df)), random_state=SAMPLE_SEED)

# squeeze=False keeps sample_axes two-dimensional so row 0 can always be iterated.
sample_fig, sample_axes = plt.subplots(1, 3, figsize=(15, 10), squeeze=False)

for ax, (_, row) in zip(sample_axes[0], samples.iterrows()):
    # Open the image through the stored path; the context manager closes the
    # file handle once the pixels have been handed to matplotlib.
    with Image.open(row['image_path']) as img:
        ax.imshow(img)

    # Wrap the cleaned caption at word boundaries (about 30 characters per
    # line) so the title is neither clipped nor split in the middle of a word.
    ax.set_title(textwrap.fill(row['caption_clean'], width=30), fontsize=12)

# Hide the axes decorations, including on panels left empty by a short sample.
for ax in sample_axes[0]:
    ax.axis('off')

plt.tight_layout()
plt.show()

## Saving


In [None]:
# Create the output folder first: to_csv does not create missing directories
# and would otherwise fail on a fresh checkout.
os.makedirs(CLEANED_PATH, exist_ok=True)

clean_file_path = os.path.join(CLEANED_PATH,'flickr8k_cleaned_data.csv')
# index=False: the row index is not written to the file.
df.to_csv(clean_file_path, index=False)

print("Arquivo 'flickr8k_cleaned_data.csv' salvo com sucesso!")

In [None]:
# Read the saved file back to confirm that it was written and can be parsed.
check_df = pd.read_csv(clean_file_path)
saved_row_count = check_df.shape[0]
print("O arquivo salvo tem", saved_row_count, "linhas.")
check_df.head(2)