In [31]:
import logging
import os
from datetime import datetime
from typing import List, Dict, Optional, Tuple

import pandas as pd
import spacy
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [26]:
class BERTCrimeProcessor:
    """Label text with crime categories using a sequence-classification model.

    The constructor loads the tokenizer and model identified by ``model_name``
    via ``from_pretrained``, moves the model to CUDA when available (CPU
    otherwise), and loads the spaCy ``en_core_web_lg`` pipeline, which is
    exposed as ``self.nlp`` so other components can reuse it.
    """

    DEFAULT_MODEL_NAME = "Luna-Skywalker/BERT-crime-analysis"

    def __init__(self, model_name: str = DEFAULT_MODEL_NAME, confidence_threshold: float = 0.7):
        """Load the tokenizer, classifier and spaCy pipeline.

        Args:
            model_name: Identifier passed to ``from_pretrained`` for both the
                tokenizer and the classifier.
            confidence_threshold: A label is reported only when its softmax
                probability is strictly greater than this value.
        """
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(self.model_name).to(self.device)
        self.nlp = spacy.load("en_core_web_lg")
        self.confidence_threshold = confidence_threshold

    def process_text(self, text: str) -> List[Tuple[str, float]]:
        """Return the labels whose probability exceeds the confidence threshold.

        Args:
            text: Text to classify. It is truncated to at most 512 tokens.

        Returns:
            A list of ``(label, probability)`` tuples sorted by descending
            probability. The list is empty when no label clears the threshold
            or when ``text`` is ``None``, empty or whitespace-only.
        """
        # Classifying an empty string would still yield a softmax distribution
        # and could attach a confident-looking label to a document with no text.
        if text is None or (isinstance(text, str) and not text.strip()):
            return []

        inputs = self.tokenizer(
            text,
            truncation=True,
            max_length=512,
            padding=True,
            return_tensors="pt"
        ).to(self.device)

        with torch.no_grad():
            outputs = self.model(**inputs)
            # Only one sequence is encoded, so row 0 holds its distribution.
            probs = torch.softmax(outputs.logits, dim=1)[0]

        id2label = self.model.config.id2label
        # tolist() converts the whole row to Python floats in one step instead
        # of comparing and converting one tensor element at a time.
        predictions = [
            (id2label[idx], prob)
            for idx, prob in enumerate(probs.tolist())
            if prob > self.confidence_threshold
        ]

        return sorted(predictions, key=lambda x: x[1], reverse=True)

In [27]:
class EnhancedTextPreprocessor:
    """Build one flat record per document: cleaned text, entities, crime labels, metadata.

    Crime labels come from ``self.crime_processor`` and named entities from the
    spaCy pipeline that the crime processor already loaded (``self.nlp``).
    """

    # spaCy entity label -> output column. Labels that are not listed here
    # (e.g. DATE) also end up in 'MISC' through the lookup default below.
    _LABEL_MAPPING = {
        'GPE': 'LOC', 'LOC': 'LOC', 'FAC': 'LOC',
        'PERSON': 'PER',
        'ORG': 'ORG',
        'PRODUCT': 'MISC', 'EVENT': 'MISC', 'WORK_OF_ART': 'MISC',
        'LAW': 'MISC', 'LANGUAGE': 'MISC', 'NORP': 'MISC'
    }
    _ENTITY_CATEGORIES = ('LOC', 'PER', 'ORG', 'MISC')

    def __init__(self):
        self.crime_processor = BERTCrimeProcessor()
        # Reuse the pipeline loaded by the crime processor instead of loading it again.
        self.nlp = self.crime_processor.nlp

    def process_document(self, text: str, doc_id: str) -> Dict:
        """Process one document into a result record.

        Args:
            text: Raw document text; may be a missing value (NaN/None).
            doc_id: Identifier stored under ``'filename'`` and in the metadata.

        Returns:
            A dict with keys ``filename``, ``text``, ``processed_text``,
            ``LOC``, ``PER``, ``ORG``, ``MISC``, ``CRIME_TYPES``,
            ``CRIME_CONFIDENCE`` (list-valued fields joined with ``'; '``) and
            ``metadata``. If any step raises, the error is logged and the
            record from ``create_empty_result`` is returned instead.
        """
        try:
            cleaned_text = self.clean_text(text)

            # Skip the classifier when there is nothing to classify, so that
            # missing documents do not receive a label.
            crime_predictions = (
                self.crime_processor.process_text(cleaned_text) if cleaned_text else []
            )

            # Run the spaCy pipeline once and share the parse between entity
            # and metadata extraction.
            doc = self.nlp(cleaned_text)
            entities = self.extract_entities(cleaned_text, doc=doc)

            return {
                'filename': doc_id,
                'text': text,
                'processed_text': cleaned_text,
                'LOC': '; '.join(entities.get('LOC', [])),
                'PER': '; '.join(entities.get('PER', [])),
                'ORG': '; '.join(entities.get('ORG', [])),
                'MISC': '; '.join(entities.get('MISC', [])),
                'CRIME_TYPES': '; '.join([crime for crime, _ in crime_predictions]),
                'CRIME_CONFIDENCE': '; '.join([f"{conf:.3f}" for _, conf in crime_predictions]),
                'metadata': self.extract_metadata(cleaned_text, doc_id, doc=doc)
            }

        except Exception as e:
            # Best effort: one bad document must not abort the whole run, but
            # keep the traceback in the log so the failure can be diagnosed.
            logging.error(f"Error processing document {doc_id}: {str(e)}", exc_info=True)
            return self.create_empty_result(doc_id, text)

    def extract_entities(self, text: str, doc: Optional["spacy.tokens.Doc"] = None) -> Dict[str, List[str]]:
        """Group the named entities of ``text`` into LOC / PER / ORG / MISC.

        Args:
            text: Text to analyse; only parsed when ``doc`` is not supplied.
            doc: Optional already-parsed spaCy document for ``text``.

        Returns:
            A dict mapping each of the four categories to a sorted list of
            unique entity strings.
        """
        if doc is None:
            doc = self.nlp(text)

        entities = {category: set() for category in self._ENTITY_CATEGORIES}
        for ent in doc.ents:
            category = self._LABEL_MAPPING.get(ent.label_, 'MISC')
            entities[category].add(ent.text)

        return {k: sorted(v) for k, v in entities.items()}

    def clean_text(self, text: str) -> str:
        """Return ``text`` as a stripped string with whitespace runs collapsed to one space.

        Missing values (as detected by ``pd.isna``) become the empty string.
        """
        if pd.isna(text):
            return ""
        text = str(text).strip()
        text = ' '.join(text.split())
        return text

    def create_empty_result(self, doc_id: str, text: str) -> Dict:
        """Return a record with the same keys as ``process_document`` but blank fields.

        ``metadata`` is ``{'error': 'Processing failed'}`` so failed documents
        can be identified in the output.
        """
        return {
            'filename': doc_id,
            'text': text,
            'processed_text': '',
            'LOC': '',
            'PER': '',
            'ORG': '',
            'MISC': '',
            'CRIME_TYPES': '',
            'CRIME_CONFIDENCE': '',
            'metadata': {'error': 'Processing failed'}
        }

    def extract_metadata(self, text: str, doc_id: str, doc: Optional["spacy.tokens.Doc"] = None) -> Dict:
        """Collect per-document metadata.

        Args:
            text: Text the metadata describes; only parsed when ``doc`` is not supplied.
            doc_id: Identifier copied into the result.
            doc: Optional already-parsed spaCy document for ``text``.

        Returns:
            A dict with ``doc_id``, ``timestamp`` (``datetime.now()`` in ISO
            format at the time of the call), ``length`` (number of characters
            in ``text``) and ``dates`` (texts of entities labelled ``DATE``).
        """
        if doc is None:
            doc = self.nlp(text)
        dates = [ent.text for ent in doc.ents if ent.label_ == 'DATE']

        return {
            'doc_id': doc_id,
            'timestamp': datetime.now().isoformat(),
            'length': len(text),
            'dates': dates
        }


In [28]:
def _first_present(row: pd.Series, columns: List[str], default):
    """Return the first non-missing value of ``row`` among ``columns``, else ``default``."""
    for column in columns:
        value = row.get(column)
        if value is None:
            continue
        # A column can exist yet hold NaN for this row (e.g. after concatenating
        # frames with different columns); treat that the same as an absent column.
        if pd.api.types.is_scalar(value) and pd.isna(value):
            continue
        return value
    return default


def process_dataset(df: pd.DataFrame, batch_size: int = 32,
                    preprocessor: Optional["EnhancedTextPreprocessor"] = None) -> pd.DataFrame:
    """Run every row of ``df`` through the text preprocessor.

    The text is taken from the ``'Extracted Text'`` column, falling back to
    ``'Text'``; the document id from ``'Filename'``, falling back to
    ``'PDF Path'`` and finally to ``doc_<n>`` where ``n`` is the number of rows
    processed so far. A fallback is used when the preferred column is absent
    or holds a missing value for that row.

    Args:
        df: Input documents, one per row.
        batch_size: Number of rows per progress-reporting slice; must be >= 1.
        preprocessor: Object providing ``process_document(text, doc_id)``.
            When omitted a new ``EnhancedTextPreprocessor`` is created; pass an
            existing one to avoid constructing it again.

    Returns:
        A DataFrame with one row per input row, built from the dicts returned
        by ``process_document``.

    Raises:
        ValueError: If ``batch_size`` is less than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")
    if preprocessor is None:
        preprocessor = EnhancedTextPreprocessor()

    results = []
    total = len(df)

    for start in range(0, total, batch_size):
        batch = df.iloc[start:start + batch_size]
        for _, row in batch.iterrows():
            text = _first_present(row, ['Extracted Text', 'Text'], '')
            filename = _first_present(row, ['Filename', 'PDF Path'], f'doc_{len(results)}')
            results.append(preprocessor.process_document(text, filename))

        # Report whenever this slice crossed a multiple of 100 rows, and once
        # at the end; never report more rows than the frame contains.
        processed = min(start + batch_size, total)
        if processed == total or processed // 100 > start // 100:
            logging.info(f"Processed {processed}/{total} documents")

    return pd.DataFrame(results)

In [30]:
# Directory that holds the input CSVs and receives the output CSV. The default
# is the original author's local folder; set the CRIME_DATA_DIR environment
# variable to run the notebook elsewhere without editing this cell.
DATA_DIR = os.environ.get(
    'CRIME_DATA_DIR',
    '/Users/damienfoo/Desktop/SMUBIA Datathon Lunar Logic/FINAL FINAL PLEASE/Data',
)
pdf_path = os.path.join(DATA_DIR, 'pdfs.csv')
news_path = os.path.join(DATA_DIR, 'news.csv')
output_path = os.path.join(DATA_DIR, 'process1_3.csv')

# Load both sources and stack them into one frame (news rows first).
pdf_df = pd.read_csv(pdf_path)
news_df = pd.read_csv(news_path)
input_df = pd.concat([news_df, pdf_df], ignore_index=True)

# Process every document and persist the result without the index column.
processed_df = process_dataset(input_df)
processed_df.to_csv(output_path, index=False)