In [None]:
import pandas as pd
import numpy as np

In [None]:
file_paths = [
    "/Users/lakshitgupta/Library/CloudStorage/OneDrive-SeattleUniversity/Quater5/Capstone/Datasets/reddit_costco_alcohol_posts_and_comments.csv",  
    "/Users/lakshitgupta/Library/CloudStorage/OneDrive-SeattleUniversity/Quater5/Capstone/Datasets/reddit_costco_posts_and_comments_1000.csv",   
    "/Users/lakshitgupta/Library/CloudStorage/OneDrive-SeattleUniversity/Quater5/Capstone/Datasets/reddit_costco_wholesale_posts_and_comments_1000.csv"    
]  

In [None]:
def clean_data(file_path):
    df = pd.read_csv(file_path)
    
    # Remove duplicate rows
    df = df.drop_duplicates()

    # Handle missing values
    df = df.dropna(subset=["post_title", "comment_body"])  # Remove rows with missing text
    df["comment_author"] = df["comment_author"].fillna("Anonymous")  

    # Convert UNIX timestamps to readable datetime
    df["post_created"] = pd.to_datetime(df["post_created"], unit="s")
    df["comment_created"] = pd.to_datetime(df["comment_created"], unit="s")

    # Standardize text: convert to lowercase and strip extra whitespace
    df["post_title"] = df["post_title"].str.lower().str.strip()
    df["comment_body"] = df["comment_body"].str.lower().str.strip()

    # Remove special characters (optional)
    df["post_title"] = df["post_title"].str.replace(r"[^a-zA-Z0-9\s]", "", regex=True)
    df["comment_body"] = df["comment_body"].str.replace(r"[^a-zA-Z0-9\s]", "", regex=True)

    # Filter out deleted comments
    df = df[~df["comment_body"].isin(["[deleted]", "[removed]"])]

    return df

In [None]:
# Process all files
cleaned_dataframes = [clean_data(file) for file in file_paths]

In [None]:
# Combine all cleaned datasets
final_df = pd.concat(cleaned_dataframes, ignore_index=True)

In [None]:
# Save the final cleaned dataset
final_cleaned_file = "/Users/lakshitgupta/Library/CloudStorage/OneDrive-SeattleUniversity/Quater5/Capstone/Datasets/Cleaned Dataset/cleaned_combined_reddit_costco_data.csv"
final_df.to_csv(final_cleaned_file, index=False)

In [None]:
print(f"Data cleaning completed. Combined cleaned data saved to {final_cleaned_file}")

In [None]:
##pip install spacy
#!python -m spacy download en_core_web_md


In [None]:
import re
import spacy
import pandas as pd
import numpy as np

In [None]:
nlp = spacy.load("en_core_web_md")

# Define keywords for categorization

product_keywords = [
    "kirkland", "kirkland signature", "kirkland wine", "kirkland coffee", "kirkland batteries",
    "kirkland dog food", "kirkland vodka", "kirkland whiskey",
    
    "laptop", "tv", "computer", "printer", "headphones", "smartphone", "tablet", "camera",
    "sound system", "gaming console", "airpods", "apple", "samsung", "hp", "dell", "sony",

    "pizza", "wine", "groceries", "milk", "eggs", "bread", "cheese", "meat", "snacks",
    "beverages", "chicken", "sushi", "frozen food", "organic food", "costco bakery",
    "dairy", "costco food court", "hot dog", "beef", "seafood", "kirkland nuts",

   
    "furniture", "sofa", "table", "mattress", "bed", "chair", "desk", "office furniture",
    "home decor", "costco warehouse furniture",

   
    "clothing", "shoes", "jackets", "coats", "jeans", "t-shirts", "athletic wear",
    "underwear", "socks",

  
    "vitamins", "supplements", "protein powder", "costco pharmacy", "medicine",
    "eyeglasses", "contact lenses", "hearing aids", "first aid kits",

 
    "detergent", "toilet paper", "paper towels", "dishwasher pods", "cleaning supplies",
    "costco kirkland soap", "garbage bags", "air fresheners", "dish soap",

 
    "tires", "camping", "bbq", "grill", "patio furniture", "generators", "outdoor lighting",
    "batteries", "car accessories", "lawn mower"
]

service_keywords = [
    "membership", "costco executive membership", "renewal", "membership upgrade",
    "costco gold star", "return policy", "refund", "warranty", "extended warranty",
    "costco gas", "costco receipt lookup",

    "customer service", "help desk", "assistance", "complaint", "staff", "employees",
    "rude service", "checkout", "lines", "crowded", "experience", "store hours",
    
    "delivery", "shipping", "same-day delivery", "instacart", "costco.com", "costco app",
    "online order", "tracking", "shipping delay", "pickup service", "curbside pickup",
    "in-store pickup",

    "discounts", "offers", "promo", "coupons", "deals", "costco sales", "cashback",
    "executive rewards", "gift cards", "rebates", "costco credit card", "membership perks",

    "costco travel", "vacation", "rental cars", "travel discounts", "car insurance",
    "health insurance", "costco visa card", "capital one", "financing", "costco mortgage services",
    "travel"

    "gas", "fuel", "car wash", "costco gas station", "tire center", "battery replacement",
    "oil change", "auto repair", "tire"
]

#product_keywords = ["kirkland", "laptop", "pizza", "wine", "electronics", "furniture", "tvs", "groceries", "clothing"]
#service_keywords = ["membership", "customer service", "refund", "return policy", "delivery", "checkout", "discounts"]

# Path to the combined cleaned dataset
cleaned_file_path = "/Users/lakshitgupta/Library/CloudStorage/OneDrive-SeattleUniversity/Quater5/Capstone/Datasets/Cleaned Dataset/cleaned_combined_reddit_costco_data.csv"


In [None]:
def semantic_similarity(text, keywords):
    """
    Computes similarity score between a text and category keywords using SpaCy word embeddings.
    Handles empty vectors to avoid warnings.
    """
    if not text.strip():
        return 0  

    doc = nlp(text)

    similarities = []
    for word in keywords:
        word_doc = nlp(word)
        if doc.has_vector and word_doc.has_vector: 
            similarities.append(doc.similarity(word_doc))

    return max(similarities) if similarities else 0  


def categorize_post(post_title, comment_body):
    """
    Categorizes the post as 'Product', 'Service', or 'General Discussion' based on keywords & semantic similarity.
    """
    text = f"{post_title} {comment_body}".lower()

    if any(re.search(rf"\b{kw}\b", text) for kw in product_keywords):
        return "Product"
    if any(re.search(rf"\b{kw}\b", text) for kw in service_keywords):
        return "Service"

    # Step 2: Semantic Similarity (Threshold: 0.7)
    product_similarity = semantic_similarity(text, product_keywords)
    service_similarity = semantic_similarity(text, service_keywords)

    if product_similarity > 0.7:
        return "Product"
    elif service_similarity > 0.7:
        return "Service"

    return "General Discussion"


In [None]:
df = pd.read_csv(cleaned_file_path)

df["post_title"] = df["post_title"].fillna("")
df["comment_body"] = df["comment_body"].fillna("")

df["category"] = df.apply(lambda row: categorize_post(row["post_title"], row["comment_body"]), axis=1)



In [None]:
# Save categorized dataset
categorized_file_path = "/Users/lakshitgupta/Library/CloudStorage/OneDrive-SeattleUniversity/Quater5/Capstone/Datasets/Cleaned Dataset/categorized_reddit_costco_data.csv"
df.to_csv(categorized_file_path, index=False)

print(f"Data categorization completed. Categorized data saved to {categorized_file_path}")
print(df["category"].value_counts())

In [1]:
#pip install transformers
#!pip install datasets

# 1. Install cmake (needed for building pyarrow)
!pip install cmake

# 2. Downgrade pyarrow to a stable version (e.g., 12.0.0 or 6.0.1)
!pip install pyarrow==12.0.0 --no-build-isolation --only-binary=:all:

# 3. Uninstall the current version of datasets
!pip uninstall datasets -y

# 4. Reinstall the latest compatible version of datasets
!pip install datasets --upgrade

# 5. Verify installation of pyarrow and datasets
!pip show pyarrow
!pip show datasets



Collecting cmake
  Downloading cmake-3.31.4-py3-none-macosx_10_10_universal2.whl.metadata (6.5 kB)
Downloading cmake-3.31.4-py3-none-macosx_10_10_universal2.whl (47.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.2/47.2 MB[0m [31m36.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: cmake
Successfully installed cmake-3.31.4
[31mERROR: Could not find a version that satisfies the requirement pyarrow==12.0.0 (from versions: 14.0.0, 14.0.1, 14.0.2, 15.0.0, 15.0.1, 15.0.2, 16.0.0, 16.1.0, 17.0.0, 18.0.0, 18.1.0, 19.0.0)[0m[31m
[0m[31mERROR: No matching distribution found for pyarrow==12.0.0[0m[31m
[0mFound existing installation: datasets 3.2.0
Uninstalling datasets-3.2.0:
  Successfully uninstalled datasets-3.2.0
Collecting datasets
  Using cached datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Using cached datasets-3.2.0-py3-none-any.whl (480 kB)
Installing collected packages: datasets
Successfully installed datasets-3.2

In [3]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from datasets import Dataset

In [5]:


# Check for GPU
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Load the cleaned dataset
cleaned_file_path = "/Users/lakshitgupta/Library/CloudStorage/OneDrive-SeattleUniversity/Quater5/Capstone/Datasets/Cleaned Dataset/cleaned_combined_reddit_costco_data.csv"
df = pd.read_csv(cleaned_file_path)

# Combine post title and comment body into a single text column
df["post_title"] = df["post_title"].fillna("")
df["comment_body"] = df["comment_body"].fillna("")
df["text"] = df["post_title"] + " " + df["comment_body"]

# Map categories to numeric labels for BERT
label_mapping = {"Product": 0, "Service": 1, "General Discussion": 2}
df["label"] = df["category"].map(label_mapping)

# Split the dataset into training and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df["text"].tolist(), df["label"].tolist(), test_size=0.2, random_state=42
)




Using device: cpu


KeyError: 'category'

In [None]:
# Load the tokenizer and tokenize the dataset
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(texts):
    return tokenizer(texts, padding="max_length", truncation=True, max_length=128)

train_encodings = tokenize_function(train_texts)
val_encodings = tokenize_function(val_texts)

# Create a PyTorch Dataset class for BERT
class RedditDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

# Prepare the dataset
train_dataset = RedditDataset(train_encodings, train_labels)
val_dataset = RedditDataset(val_encodings, val_labels)

# Load pre-trained BERT model for sequence classification
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3).to(device)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=4,
    weight_decay=0.01,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=10,
)

# Define a compute metrics function


In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average="weighted")
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

# Save the fine-tuned model and tokenizer
model.save_pretrained("./fine_tuned_bert")
tokenizer.save_pretrained("./fine_tuned_bert")
print("Model fine-tuning completed and saved.")

# ================================
# Apply the Model to the Entire Dataset
# ================================
# Load the saved model and tokenizer


In [None]:
model = AutoModelForSequenceClassification.from_pretrained("./fine_tuned_bert").to(device)
tokenizer = AutoTokenizer.from_pretrained("./fine_tuned_bert")

def classify_text(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128).to(device)
    with torch.no_grad():
        logits = model(**inputs).logits
    predicted_class = torch.argmax(logits, dim=1).item()
    reverse_mapping = {0: "Product", 1: "Service", 2: "General Discussion"}
    return reverse_mapping[predicted_class]

# Classify the entire dataset
df["predicted_category"] = df["text"].apply(classify_text)

# Save the classified dataset
classified_file_path = "/Users/lakshitgupta/Library/CloudStorage/OneDrive-SeattleUniversity/Quater5/Capstone/Datasets/Cleaned Dataset/classified_reddit_costco_data.csv"
df.to_csv(classified_file_path, index=False)

print(f"Classification completed. Classified data saved to {classified_file_path}")
print(df["predicted_category"].value_counts())