In [None]:
"""Starter Pipeline"""

# =====================
# 1. Setup environment
# =====================
!pip install pandas numpy scikit-learn matplotlib seaborn wordcloud\
             nltk spacy sentence-transformers \
             lightgbm shap transformers datasets gradio -q
!pip install openpyxl

# =====================
# 2. Imports
# =====================
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, precision_recall_fscore_support

# =====================
# 3. Dummy dataset (replace with real reviews)
# =====================
# Small hand-written sample of review texts, kept in a single "review_text"
# column. It is only a placeholder for the notebook's early cells.
data = dict(
    review_text=[
        "Best pizza! Visit www.pizzapromo.com for discounts!",
        "I love my new phone, but this place is too noisy.",
        "Never been there, but I heard it’s terrible.",
        "Amazing service and friendly staff. Highly recommended!",
        "Cheap sunglasses!! Call +65-91234567 now!!",
    ],
)
df = pd.DataFrame.from_dict(data)

# Render the first rows in the notebook output.
display(df.head())

import pandas as pd

# =====================
# 4. Preprocessing
# =====================
# Load the review data from disk into `df`.
try:
    df = pd.read_csv('/content/archive/reviews.csv')
except FileNotFoundError:
    print("Error: 'reviews.csv' not found. Please ensure it is in the correct directory.")
    # Re-raise rather than calling exit(): exit() is a helper intended for the
    # interactive interpreter, not for aborting program flow. Propagating the
    # original exception stops execution at this point, so none of the
    # inspection/cleaning code below can run without the CSV being loaded.
    raise
else:
    print("Dataset loaded successfully.")

# Initial Data Inspection
print("\n--- Initial Data Inspection ---")
print("First 5 rows:")
print(df.head())
print("\nDataFrame Info:")
df.info()
print("\nDescriptive Statistics:")
print(df.describe(include='all'))

# Check for and remove fully duplicated rows (all columns equal).
num_duplicates = df.duplicated().sum()
if num_duplicates > 0:
    df.drop_duplicates(inplace=True)
    print(f"\nRemoved {num_duplicates} duplicate rows.")

# Handle missing values by dropping every row that has a missing value in
# any column, then report the per-column null counts (expected to be zero).
df.dropna(inplace=True)
print("\nMissing values after cleaning:")
print(df.isnull().sum())
print(f"Remaining rows after cleaning: {len(df)}")

import pandas as pd
import re

# =====================
# 5. Pseudo labels (simple for now, placeholder until LLM)
# =====================
def simple_label(text: str) -> str:
    """Assign a heuristic pseudo-label to a single review.

    The checks run in priority order and the first match wins:

    * ``"AD"``    - the text contains ``http``, ``www``, ``promo``, ``call``
      or ``+65``.
    * ``"RNV"``   - the text contains ``never been`` or ``heard``
      (rant without a visit).
    * ``"IRR"``   - the text contains ``phone``.
    * ``"CLEAN"`` - none of the above matched.

    All checks are plain substring matches on the lowercased text, so they
    are case-insensitive ("Call", "WWW" and "HTTP" count as ad markers too).

    Args:
        text: The review text. Non-string values are converted with ``str()``.

    Returns:
        One of ``"AD"``, ``"RNV"``, ``"IRR"`` or ``"CLEAN"``.
    """
    # Normalise once so every rule sees the same case-folded text.
    lowered = str(text).lower()

    if re.search(r"(http|www|promo|call|\+65)", lowered):
        return "AD"
    if "never been" in lowered or "heard" in lowered:
        return "RNV"  # Rant without visit
    if "phone" in lowered:
        return "IRR"
    return "CLEAN"

# Attach a heuristic pseudo-label to each review in the "text" column,
# then show the labelled frame in the notebook output.
df["label"] = df["text"].map(simple_label)
display(df)

# Authenticate with Hugging Face
from huggingface_hub import login
from google.colab import userdata

# The token is read from the Colab secret named 'TikTokTechJam2025'; create
# your own personal Hugging Face token and store it under that name in the
# Secrets tab on Colab.
huggingface_token = userdata.get('TikTokTechJam2025')

if not huggingface_token:
    # No usable token: tell the user how to provide one and skip the login.
    print("Hugging Face token not found. Please add it to Colab secrets with the name 'TikTokTechJam2025'.")
else:
    login(token=huggingface_token, new_session=False)
    print("Hugging Face login successful.")

import os
from PIL import Image

image_folder_path = "/content/archive/dataset/"

# Recursively collect every file under the folder whose name ends with one
# of these image suffixes (compared case-insensitively). Extend the tuple
# if other image formats are needed.
_IMAGE_SUFFIXES = ('.png', '.jpg', '.jpeg')
image_files = [
    os.path.join(dirpath, filename)
    for dirpath, _, filenames in os.walk(image_folder_path)
    for filename in filenames
    if filename.lower().endswith(_IMAGE_SUFFIXES)
]

print(f"Found {len(image_files)} image files.")

# Maps each image path to its generated caption, or to an "Error: ..."
# string when the image could not be processed.
image_descriptions = {}

# NOTE(review): `image_to_text_pipeline` is not defined in the visible part
# of this file - presumably created in another cell; confirm before running.
for image_path in image_files:
    try:
        rgb_image = Image.open(image_path).convert("RGB")
        caption = image_to_text_pipeline(rgb_image)[0]['generated_text']
        image_descriptions[image_path] = caption
        print(f"Processed {image_path}")
    except Exception as exc:
        # Best effort: record the failure text instead of aborting the run.
        print(f"Error processing {image_path}: {exc}")
        image_descriptions[image_path] = f"Error: {exc}"

print("\nImage descriptions generated.")

import os
from google.colab import userdata

# Read the personal Hugging Face token from the Colab secret named
# 'TikTokTechJam2025' (add it in the Secrets tab on Colab) and expose it to
# the process through the HF_TOKEN environment variable.
huggingface_token = userdata.get('TikTokTechJam2025')

if not huggingface_token:
    print("HuggingFace token not found. Please add it to Colab secrets with the name 'TikTokTechJam2025'.")
else:
    os.environ['HF_TOKEN'] = huggingface_token
    print("HuggingFace token successfully set in environment variable.")

import pprint

# Pretty-print the path -> description mapping so the captions can be
# inspected by eye.
print(pprint.pformat(image_descriptions))

# Keywords and phrases that mark an image description as irrelevant.
# Defined once: a second verbatim copy of this list would only risk the two
# definitions drifting apart.
#
# Criteria for flagging an image as "dirty":
# An image is considered dirty if its (lowercased) description contains any
# of these entries as a substring.
irrelevant_keywords = [
    "sign",
    "menu",
    "receipt",
    "menu board",
    "price list",
    "error",
    "loading",
    "unable to process",
    "a table with",
    "a man in a suit",  # Often indicates a generic or stock photo
    "a large screen",
    "a tv screen",
    "a book with",  # Can indicate generic menu or non-food items
    "a poster",
    "a person holding",  # Can be generic or non-food related
]

print("Defined criteria for identifying irrelevant images based on descriptions.")
print(f"Irrelevant keywords: {irrelevant_keywords}")

# Lowercase every description once (coercing to str first) so the keyword
# test below is case-insensitive.
lowered_captions = {
    path: str(caption).lower()
    for path, caption in image_descriptions.items()
}

# An image is selected for cleaning as soon as any irrelevant keyword occurs
# as a substring of its description; each path appears at most once.
images_to_clean = [
    path
    for path, lowered in lowered_captions.items()
    if any(term in lowered for term in irrelevant_keywords)
]

# Report how many images were selected, followed by the full list.
print(f"Identified {len(images_to_clean)} images for cleaning.")
print("List of images to clean:")
pprint.pprint(images_to_clean)

# A row is dropped only when BOTH conditions hold: its 'photo' value is in
# images_to_clean AND its label is 'IRR' or 'RNV'. Equivalently, a row is
# kept when the photo is not flagged OR the label is neither 'IRR' nor 'RNV'.
# NOTE(review): the match only works if df['photo'] stores paths in the same
# form as the entries of images_to_clean - verify against the CSV.
photo_is_flagged = df['photo'].isin(images_to_clean)
label_is_irr_or_rnv = df['label'].isin(['IRR', 'RNV'])
df_filtered = df[~photo_is_flagged | ~label_is_irr_or_rnv].copy()

# Show a preview and the resulting dimensions of the filtered frame.
print("Filtered DataFrame head:")
display(df_filtered.head())

print("\nFiltered DataFrame shape:")
print(df_filtered.shape)

# From here on `df` refers to the filtered data.
df = df_filtered

"""**Reasoning**:
Print the image_descriptions dictionary to examine the generated text descriptions and identify patterns for irrelevant images.


"""

# =====================
# 6. Baseline model: TF-IDF + Logistic Regression
# =====================
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    df["text"],
    df["label"],
    random_state=42,
    test_size=0.2,
)

# Unigram + bigram TF-IDF features, capped at 10,000 terms. The vocabulary
# is fitted on the training split only and then reused for the test split.
vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 2))
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Fit the logistic-regression baseline and predict labels for the test split.
model = LogisticRegression(max_iter=1000).fit(X_train_vec, y_train)
preds_test = model.predict(X_test_vec)

# =====================
# 7. Classification report + Confusion Matrix
# =====================
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
import numpy as np
import matplotlib.pyplot as plt

# Text summary of per-class metrics on the held-out split; zero_division=0
# reports 0 for undefined metrics.
print("Classification Report on Test Data:\n")
report = classification_report(y_test, preds_test, zero_division=0)
print(report)

# Sorted set of every label that occurs in either the true or the predicted
# values, used for both the matrix rows/columns and the axis tick labels.
labels_in_data = np.union1d(y_test, preds_test)

# Confusion matrix rendered as a blue heat map with integer cell counts.
cm = confusion_matrix(y_test, preds_test, labels=labels_in_data)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels_in_data)
disp.plot(values_format="d", cmap="Blues")
plt.title("Confusion Matrix")
plt.show()

# =====================
# 8. Per-class Precision and Recall Bar Chart
# =====================
# Use the sorted union of true and predicted labels and pass it explicitly,
# so the metric arrays and the bar labels always have the same length and
# order. Taking labels from y_test alone breaks the chart whenever the model
# predicts a class that does not occur in the test split.
labels = np.unique(np.concatenate([y_test, preds_test]))
precision, recall, f1, support = precision_recall_fscore_support(
    y_test, preds_test, labels=labels, zero_division=0
)

plt.figure(figsize=(10,4))

# Left panel: precision per class.
plt.subplot(1,2,1)
plt.bar(labels, precision, color='skyblue')
plt.ylim(0,1)
plt.title('Per-class Precision')
plt.xlabel('Class')
plt.ylabel('Precision')

# Right panel: recall per class.
plt.subplot(1,2,2)
plt.bar(labels, recall, color='salmon')
plt.ylim(0,1)
plt.title('Per-class Recall')
plt.xlabel('Class')
plt.ylabel('Recall')

plt.tight_layout()
plt.show()

# =====================
# 9. Try the model on new reviews
# =====================
# A few unseen reviews to sanity-check the fitted baseline.
examples = [
    "Get 50% discount at www.shopdeal.com",
    "We visited yesterday, the food was great!",
    "Never been there but my friend said it’s awful",
]

# Reuse the already-fitted vectorizer so the features match the training
# vocabulary, then predict a label for each example.
vec = vectorizer.transform(examples)
preds_examples = model.predict(vec)

print("\nPredictions on New Examples:")
for review, predicted_label in zip(examples, preds_examples):
    print(f"{review} -> {predicted_label}")



"""# Task
Integrate image cleaning into the preprocessing steps for the "reviews.csv" dataset by using the generated image descriptions to identify and filter out irrelevant images.

## Review image descriptions

### Subtask:
Examine the generated `image_descriptions` to understand the types of descriptions produced and identify patterns that might indicate irrelevant or problematic images.

## Define image cleaning criteria

### Subtask:
Based on the review of `image_descriptions`, define criteria for what constitutes a "dirty" or irrelevant image.

**Reasoning**:
Define the criteria for dirty images based on the review of image descriptions.

## Identify images for cleaning

### Subtask:
Write code to iterate through the `image_descriptions` and identify the image paths that meet the cleaning criteria.

**Reasoning**:
Iterate through the image descriptions and identify images to clean based on the defined irrelevant keywords.

## Integrate image filtering with dataframe

### Subtask:
Modify the DataFrame loaded from `reviews.csv` to filter out rows corresponding to the images identified for cleaning. This might involve using the 'photo' column to link reviews to images.

**Reasoning**:
Filter the DataFrame to exclude rows where the 'photo' column matches any path in the `images_to_clean` list and then display the head and shape of the filtered DataFrame before overwriting the original df.

## Update preprocessing

### Subtask:
Ensure the subsequent preprocessing steps (like the pseudo-labeling and model training) use the DataFrame with the filtered images.

## Summary:

### Data Analysis Key Findings

*   A list of keywords and phrases indicative of irrelevant images was defined, including terms related to signs, menus, receipts, technical errors, and generic settings.
*   Based on these criteria, 379 images were identified for cleaning.
*   Filtering the DataFrame based on the identified images resulted in the removal of 2 rows from the original DataFrame, changing its shape from (1102, 7) to (1100, 7).

### Insights or Next Steps

*   The image cleaning process effectively removed a small number of rows associated with potentially irrelevant images, which could improve the quality of the dataset for subsequent analysis or modeling.
*   Further refinement of the `irrelevant_keywords` list could be explored to potentially identify and remove more irrelevant images or avoid removing relevant ones.

## Review image descriptions

### Subtask:
Examine the generated `image_descriptions` to understand the types of descriptions produced and identify patterns that might indicate irrelevant or problematic images.

**Reasoning**:
Print the image_descriptions dictionary to examine the generated text descriptions and identify patterns for irrelevant images.

## Define image cleaning criteria

### Subtask:
Based on the review of `image_descriptions`, define criteria for what constitutes a "dirty" or irrelevant image.

**Reasoning**:
Define the criteria for dirty images based on the review of image descriptions.

## Identify images for cleaning

### Subtask:
Write code to iterate through the `image_descriptions` and identify the image paths that meet the cleaning criteria.

**Reasoning**:
Iterate through the image descriptions and identify images to clean based on the defined irrelevant keywords.

## Integrate image filtering with dataframe

### Subtask:
Modify the DataFrame loaded from `reviews.csv` to filter out rows corresponding to the images identified for cleaning. This might involve using the 'photo' column to link reviews to images.

**Reasoning**:
Filter the DataFrame to exclude rows where the 'photo' column matches any path in the `images_to_clean` list and then display the head and shape of the filtered DataFrame before overwriting the original df.

## Update preprocessing

### Subtask:
Ensure the subsequent preprocessing steps (like the pseudo-labeling and model training) use the DataFrame with the filtered images.

## Summary:

### Data Analysis Key Findings

* A list of keywords and phrases indicative of irrelevant images was defined, including terms related to signs, menus, receipts, technical errors, and generic settings.
* Based on these criteria, 379 images were identified for cleaning.
* Filtering the DataFrame based on the identified images resulted in the removal of 2 rows from the original DataFrame, changing its shape from (1102, 7) to (1100, 7).

### Insights or Next Steps

* The image cleaning process effectively removed a small number of rows associated with potentially irrelevant images, which could improve the quality of the dataset for subsequent analysis or modeling.
* Further refinement of the `irrelevant_keywords` list could be explored to potentially identify and remove more irrelevant images or avoid removing relevant ones.
"""