In [None]:
import pandas as pd
from transformers import pipeline
import pickle
import json

In [None]:
# Move to the project root so that `src` is importable and the relative
# `data/` and `models/` paths used in later cells resolve.
import os

# Guard the chdir: without it, every re-run of this cell climbs one more
# directory and breaks the import and the relative paths that follow.
if not os.path.isdir('src'):
    os.chdir('../')
from src.data_processing import preprocess

In [None]:
# Read the review table, keep the raw text for the classifier, then store
# the preprocessed text back on the frame.
# NOTE(review): `reviews` is captured BEFORE `preprocess` runs, so the model
# is fed the raw text while df['Text'] holds the cleaned text - confirm this
# ordering is intentional.
df = pd.read_csv('data/.csv')
raw_text = df['Text']
reviews = raw_text.tolist()
df['Text'] = raw_text.apply(preprocess)

In [None]:
# Candidate topic labels the zero-shot model will choose between.
# JSON files are conventionally UTF-8; the encoding is passed explicitly so
# the read does not depend on the platform's default locale encoding.
with open('data/topics.json', 'r', encoding='utf-8') as f:
    topics_data = json.load(f)
candidate_topics = topics_data['candidate_topics']

In [None]:
# Build the zero-shot classification pipeline from the
# facebook/bart-large-mnli checkpoint.
classifier = pipeline(
    task="zero-shot-classification",
    model="facebook/bart-large-mnli",
)

In [None]:
# Score every review against the candidate topics. With multi_label=False
# the topics compete with each other for each review (single-label mode).
results = classifier(
    reviews,
    candidate_labels=candidate_topics,
    multi_label=False,
)

In [None]:
# Attach the first-listed topic and its score for every review
# (the pipeline is documented to return labels ordered by score).
best_topics = []
best_scores = []
for outcome in results:
    best_topics.append(outcome['labels'][0])
    best_scores.append(outcome['scores'][0])
df['predicted_topic'] = best_topics
df['topic_confidence'] = best_scores

# Preview the first few rows with their predictions
preview_columns = ['Text', 'predicted_topic', 'topic_confidence']
print(df[preview_columns].head())

# Count how many reviews landed in each topic
print("\nTopic Distribution:")
topic_counts = df['predicted_topic'].value_counts()
print(topic_counts)

In [None]:
# Folder that will hold the saved zero-shot model artifacts.
model_dir = "./models/zero_shot_model"
# exist_ok=True keeps this cell safe to re-run once the folder exists.
os.makedirs(name=model_dir, exist_ok=True)

In [None]:
# Serialize the whole pipeline object, since it is what holds both the
# model and the tokenizer.
# NOTE(review): a pickle may fail to load under different library versions
# and must only ever be loaded from a trusted source.
model_path = os.path.join(model_dir, "zero_shot_pipeline.pkl")
with open(model_path, mode="wb") as pickle_file:
    pickle.dump(classifier, pickle_file)
print(f"Zero-shot classification model saved to {model_path}")