In [35]:
import pandas as pd
import os
import re
import numpy as np
import time
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
# --- Try to import pycld2 as an alternative ---
try:
    import pycld2
except ImportError:
    print("Error: The 'pycld2' library is not installed.")
    print("Please install it using: pip install pycld2")
    # Define a dummy function to prevent further errors
    def detect_pycld2_language(text):
        return 'unknown'
    class PyCLD2Exception(Exception):
        pass


In [41]:
NUM_ITERATIONS = 5
BATCH_SIZE = 20000
CONFIDENCE_THRESHOLD = 0.95

# --- File Paths ---
MERGED_DATA_PATH = os.path.join('..', 'data', 'california_reviews_merged.csv')
OUTPUT_DATA_PATH = os.path.join('..', 'data', 'llm_labeled_subset_50k_self_trained.csv')
LOCAL_LABELS_PATH = os.path.join('..', 'data', 'llm_subset_labels_local.csv')

# --- Labeling Guidelines (Improved) ---
# This dictionary contains keywords and regex patterns for better initial labeling.
LOCAL_LABELS = {
    'spam': {
        'keywords': ['http', 'www', 'click here', 'sale', 'promo', 'discount', 'buy now', 'advertisement'],
        'regex': [r'https?://\S+', r'\b(www\.)\S+'] # Catches URLs
    },
    'irrelevant': {
        'keywords': ['movie', 'tv show', 'book', 'weather', 'news', 'nonsense', 'unrelated'],
        'regex': [r'']
    },
    'policy_violation': {
        'keywords': ['hate speech', 'fraud', 'illegal', 'threat', 'scam', 'ripoff'],
        'regex': [r'']
    },
    'rant': {
        'keywords': ['disgusting', 'terrible', 'horrible', 'trash', 'never again', 'worse', 'angry'],
        'regex': [r'!!!+', r'!!!', r'!!!+'] # Excessive punctuation
    }
}


In [37]:
def classify_review_locally(review_text):
    """
    Classifies a review based on improved keyword matching and regex patterns.
    Returns an integer label (0, 1, 2, 3, or 4).
    """
    text = str(review_text).lower()
    
    # Check for NaN values
    if pd.isna(review_text):
        return 0

    # Classify based on new, more robust rules
    if any(keyword in text for keyword in LOCAL_LABELS['spam']['keywords']) or \
       any(re.search(pattern, text) for pattern in LOCAL_LABELS['spam']['regex']):
        return 1
    elif any(keyword in text for keyword in LOCAL_LABELS['irrelevant']['keywords']):
        return 2
    elif any(keyword in text for keyword in LOCAL_LABELS['policy_violation']['keywords']):
        return 3
    elif any(keyword in text for keyword in LOCAL_LABELS['rant']['keywords']) or \
         any(re.search(pattern, text) for pattern in LOCAL_LABELS['rant']['regex']) or \
         (sum(1 for c in review_text if c.isupper()) / len(review_text) > 0.5 if len(review_text) > 0 else False): # High percentage of caps
        return 4
    else:
        return 0 # Valid Review

In [38]:
def is_english(text):
    """
    Detects if a given text is in English using the pycld2 library.
    Returns True if English with high confidence, False otherwise.
    """
    if not isinstance(text, str) or not text.strip():
        return False
    
    try:
        # pycld2 returns a tuple: is_reliable, bytes_consumed, languages
        # The languages list contains a tuple of (language_name, language_code, score)
        is_reliable, _, languages = pycld2.detect(text)
        language_code = languages[0][1] # Get the code of the most likely language
        
        # Check if the detected language is English and if the detection is reliable
        return language_code == 'en' and is_reliable
    except Exception:
        # Return False on any error during detection
        return False
    

In [42]:

if __name__ == "__main__":
    # --- 1. Load the data with performance optimizations ---
    try:
        column_dtypes = {
            'text': 'string',
            'user_id': 'string',
            'gmap_id': 'string',
        }
        df = pd.read_csv(
            MERGED_DATA_PATH,
            on_bad_lines='skip',
            engine='c'
        )
    except FileNotFoundError:
        print(f"Error: File not found at {MERGED_DATA_PATH}. Please ensure the data folder is in the correct location.")
        exit()
    
    # --- 1.5: Filter out non-English reviews ---
    print("\n--- Filtering non-English reviews ---")
    start_time_filter = time.time()
    initial_count = len(df)
    
    # Apply the language detection function to the 'text' column
    english_reviews_mask = [is_english(text) for text in tqdm(df['text'].tolist(), desc="Detecting Languages")]
    df = df[english_reviews_mask].copy()
    
    end_time_filter = time.time()
    print(f"Original dataset size: {initial_count} reviews.")
    print(f"Filtered dataset size: {len(df)} English reviews.")
    print(f"Language filtering took {end_time_filter - start_time_filter:.2f} seconds.")

    # --- 2. Take a random sample for initial local labeling ---
    SAMPLE_SIZE_LOCAL_LABELS = 500
    labeled_df = df.sample(n=SAMPLE_SIZE_LOCAL_LABELS, random_state=42).copy()
    unlabeled_df = df.drop(labeled_df.index)
    labeled_df['violation_type'] = -1

    print(f"\nStarting initial local labeling for {SAMPLE_SIZE_LOCAL_LABELS} reviews...")
    start_time = time.time()
    
    for index, row in tqdm(labeled_df.iterrows(), total=len(labeled_df), desc="Local Labeling"):
        labeled_df.at[index, 'violation_type'] = classify_review_locally(row['text'])
    
    end_time = time.time()
    print("\n--- Initial Local Labeling Complete! ---")
    print(f"Total time elapsed: {end_time - start_time:.2f} seconds.")

    # --- 3. Augment the initial labeled data ---
    print(f"\nProceeding with initial labeled data. Labeled dataset size is: {len(labeled_df)}")
    print("Distribution of initial labels:")
    print(labeled_df['violation_type'].value_counts())

    # --- 4. Initialize the models ---
    print("\nInitializing Sentence-BERT and Logistic Regression models...")
    model_emb = SentenceTransformer('all-MiniLM-L6-v2')
    clf = LogisticRegression(max_iter=2000)
    
    # --- 5. Begin the self-training loop ---
    for i in range(NUM_ITERATIONS):
        print(f"\n--- Self-Training Iteration {i+1}/{NUM_ITERATIONS} ---")
        
        # Split the current labeled data into train/test sets for evaluation
        X_labeled = labeled_df.copy()
        y_labeled = labeled_df['violation_type']
        
        if y_labeled.nunique() > 1 and all(y_labeled.value_counts() > 1):
            X_train, X_test, y_train, y_test = train_test_split(X_labeled, y_labeled, test_size=0.2, random_state=42, stratify=y_labeled)
        else:
            print("Cannot perform stratified split. Not enough samples in some classes.")
            X_train, X_test, y_train, y_test = train_test_split(X_labeled, y_labeled, test_size=0.2, random_state=42)
        
        # Encode the labeled text data
        X_train_emb_text = model_emb.encode(X_train['text'].tolist(), show_progress_bar=True)
        X_test_emb_text = model_emb.encode(X_test['text'].tolist(), show_progress_bar=True)
        
        clf.fit(X_train_emb_text, y_train)
        
        accuracy = clf.score(X_test_emb_text, y_test)
        print(f"Model trained. Accuracy on test set: {accuracy:.2f}")
        print(f"Current labeled dataset size: {len(labeled_df)}")
        
        if len(unlabeled_df) < BATCH_SIZE:
            print("Not enough unlabeled data for a full batch. Using remaining data.")
            batch_df = unlabeled_df.copy()
            unlabeled_df = pd.DataFrame()
        else:
            batch_df = unlabeled_df.sample(n=BATCH_SIZE, random_state=42).copy()
            unlabeled_df = unlabeled_df.drop(batch_df.index)

        batch_df.dropna(subset=['text'], inplace=True)
        
        print(f"\nPredicting on a new batch of {len(batch_df)} unlabeled reviews...")
        X_batch_emb_text = model_emb.encode(batch_df['text'].tolist(), show_progress_bar=True)
        
        batch_preds = clf.predict(X_batch_emb_text)
        batch_probs = clf.predict_proba(X_batch_emb_text)
        
        batch_max_probs = np.max(batch_probs, axis=1)
        
        confident_indices = np.where(batch_max_probs > CONFIDENCE_THRESHOLD)[0]
        
        confident_df = batch_df.iloc[confident_indices].copy()
        confident_df['violation_type'] = batch_preds[confident_indices]
        
        labeled_df = pd.concat([labeled_df, confident_df], ignore_index=True)
        
        print(f"Added {len(confident_df)} confident samples to the training data.")
        print(f"New labeled dataset size: {len(labeled_df)}")
        
    # --- 6. Final labeling and saving ---
    print("\n--- Self-Training Complete! ---")
    print(f"Final labeled dataset size: {len(labeled_df)}")
    
    labeled_df.to_csv(OUTPUT_DATA_PATH, index=False)
    print(f"\nFinal labeled dataset saved to: {OUTPUT_DATA_PATH}")
    print("\nDistribution of Model-assigned labels on the final dataset:")
    print(labeled_df['violation_type'].value_counts())
    print("\nThis labeled dataset is now ready for final model training or analysis.")



--- Filtering non-English reviews ---


Detecting Languages: 100%|██████████| 23258034/23258034 [03:47<00:00, 102403.46it/s]


Original dataset size: 23258034 reviews.
Filtered dataset size: 20938426 English reviews.
Language filtering took 238.13 seconds.

Starting initial local labeling for 500 reviews...


Local Labeling: 100%|██████████| 500/500 [00:00<00:00, 11723.86it/s]


--- Initial Local Labeling Complete! ---
Total time elapsed: 0.05 seconds.

Proceeding with initial labeled data. Labeled dataset size is: 500
Distribution of initial labels:
violation_type
0    466
4     22
1      6
2      6
Name: count, dtype: int64

Initializing Sentence-BERT and Logistic Regression models...






--- Self-Training Iteration 1/5 ---


Batches: 100%|██████████| 13/13 [00:01<00:00,  7.66it/s]
Batches: 100%|██████████| 4/4 [00:00<00:00,  4.96it/s]


Model trained. Accuracy on test set: 0.93
Current labeled dataset size: 500

Predicting on a new batch of 20000 unlabeled reviews...


Batches: 100%|██████████| 625/625 [01:25<00:00,  7.30it/s]
Detecting Languages:  80%|████████  | 18658347/23258034 [44:48<11:02, 6939.27it/s]  


Added 8118 confident samples to the training data.
New labeled dataset size: 8618

--- Self-Training Iteration 2/5 ---


Batches: 100%|██████████| 216/216 [00:40<00:00,  5.28it/s]
Batches: 100%|██████████| 54/54 [00:18<00:00,  2.89it/s]


Model trained. Accuracy on test set: 1.00
Current labeled dataset size: 8618

Predicting on a new batch of 20000 unlabeled reviews...


Batches: 100%|██████████| 625/625 [03:34<00:00,  2.91it/s]


Added 19102 confident samples to the training data.
New labeled dataset size: 27720

--- Self-Training Iteration 3/5 ---


Batches: 100%|██████████| 693/693 [03:24<00:00,  3.39it/s]
Batches: 100%|██████████| 174/174 [00:55<00:00,  3.12it/s]


Model trained. Accuracy on test set: 1.00
Current labeled dataset size: 27720

Predicting on a new batch of 20000 unlabeled reviews...


Batches: 100%|██████████| 625/625 [04:05<00:00,  2.55it/s]


Added 19999 confident samples to the training data.
New labeled dataset size: 47719

--- Self-Training Iteration 4/5 ---


Batches: 100%|██████████| 1193/1193 [06:03<00:00,  3.29it/s]
Batches: 100%|██████████| 299/299 [02:03<00:00,  2.42it/s]


Model trained. Accuracy on test set: 1.00
Current labeled dataset size: 47719

Predicting on a new batch of 20000 unlabeled reviews...


Batches: 100%|██████████| 625/625 [04:16<00:00,  2.44it/s]


Added 20000 confident samples to the training data.
New labeled dataset size: 67719

--- Self-Training Iteration 5/5 ---


Batches: 100%|██████████| 1693/1693 [08:29<00:00,  3.32it/s]
Batches: 100%|██████████| 424/424 [02:02<00:00,  3.45it/s]


Model trained. Accuracy on test set: 1.00
Current labeled dataset size: 67719

Predicting on a new batch of 20000 unlabeled reviews...


Batches: 100%|██████████| 625/625 [03:49<00:00,  2.72it/s]


Added 20000 confident samples to the training data.
New labeled dataset size: 87719

--- Self-Training Complete! ---
Final labeled dataset size: 87719

Final labeled dataset saved to: ..\data\llm_labeled_subset_50k_self_trained.csv

Distribution of Model-assigned labels on the final dataset:
violation_type
0    87685
4       22
1        6
2        6
Name: count, dtype: int64

This labeled dataset is now ready for final model training or analysis.
