In [7]:
import pandas as pd
import spacy
import networkx as nx
from typing import List, Dict, Tuple
import re
from collections import defaultdict


In [8]:
tokenizer = AutoTokenizer.from_pretrained("albert/albert-large-v2")
model = AutoModelForMaskedLM.from_pretrained("albert/albert-large-v2")

# Move model to CPU
device = torch.device("cpu")
model.to(device)


Some weights of the model checkpoint at albert/albert-large-v2 were not used when initializing AlbertForMaskedLM: ['albert.pooler.bias', 'albert.pooler.weight']
- This IS expected if you are initializing AlbertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


AlbertForMaskedLM(
  (albert): AlbertModel(
    (embeddings): AlbertEmbeddings(
      (word_embeddings): Embedding(30000, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0, inplace=False)
    )
    (encoder): AlbertTransformer(
      (embedding_hidden_mapping_in): Linear(in_features=128, out_features=1024, bias=True)
      (albert_layer_groups): ModuleList(
        (0): AlbertLayerGroup(
          (albert_layers): ModuleList(
            (0): AlbertLayer(
              (full_layer_layer_norm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
              (attention): AlbertSdpaAttention(
                (query): Linear(in_features=1024, out_features=1024, bias=True)
                (key): Linear(in_features=1024, out_features=1024, bias=True)
                (value): Linear(in_features=1024, out_features=1024, b

In [9]:
class OrganizationalCrimeExtractor:
    def __init__(self):
        self.nlp = spacy.load("en_core_web_lg")
        
        # Organization-Person patterns
        self.org_person_patterns = {
            'INSTRUCTED': r'(?i)(instruct|direct|order|command|ask)',
            'COLLABORATED': r'(?i)(collaborate|work with|partner|conspire)',
            'FUNDED': r'(?i)(fund|finance|pay|sponsor)',
            'EMPLOYED': r'(?i)(employ|hire|contract)',
            'FACILITATED': r'(?i)(facilitate|enable|help|assist)',
            'SUPERVISED': r'(?i)(supervise|oversee|manage)',
            'AUTHORIZED': r'(?i)(authorize|approve|permit|allow)'
        }
        
        # Organization-Organization patterns
        self.org_org_patterns = {
            'PARTNERSHIP': r'(?i)(partner|collaborate|alliance|joint venture)',
            'FUNDING': r'(?i)(fund|invest|finance|transfer money)',
            'CONTROL': r'(?i)(control|own|acquire|merge|takeover)',
            'SUPPLY': r'(?i)(supply|provide|deliver|distribute)',
            'CONSPIRACY': r'(?i)(conspire|scheme|plot|collude)',
            'FRONT': r'(?i)(front|shell|cover|facade)',
            'FACILITATION': r'(?i)(facilitate|enable|support|assist)'
        }
        
        # Criminal activity indicators
        self.crime_indicators = {
            'FRAUD': r'(?i)(fraud|scam|deceive|misrepresent)',
            'MONEY_LAUNDERING': r'(?i)(launder|clean money|illegal funds)',
            'CORRUPTION': r'(?i)(corrupt|bribe|kickback)',
            'TRAFFICKING': r'(?i)(traffic|smuggle|illegal trade)',
            'CYBERCRIME': r'(?i)(hack|cyber|digital theft|ransomware)',
            'CONSPIRACY': r'(?i)(conspire|plot|scheme|plan)',
            'TAX_EVASION': r'(?i)(tax evasion|tax fraud|undeclared)',
            'ILLEGAL_TRADE': r'(?i)(illegal trade|black market|contraband)'
        }

    def extract_relationships(self, row: pd.Series) -> List[Dict]:
        relationships = []
        text = row['processed_text']
        
        if pd.isna(text) or text.strip() == '':
            return relationships

        doc = self.nlp(text)
        entities = self._parse_entities(row)
        
        # Extract org-person relationships
        relationships.extend(self._extract_org_person_relations(doc, entities))
        # Extract org-org relationships
        relationships.extend(self._extract_org_org_relations(doc, entities))
        # Extract criminal activities
        relationships.extend(self._extract_criminal_activities(doc, entities))
        
        return relationships

    def _parse_entities(self, row: pd.Series) -> Dict[str, set]:
        entities = {}
        for field in ['PER', 'ORG', 'CRIME_TYPES']:
            if pd.notna(row[field]) and row[field].strip():
                entities[field] = {
                    self._clean_entity_name(e.strip()) 
                    for e in row[field].split(';') 
                    if e.strip()
                }
            else:
                entities[field] = set()
        return entities

    def _clean_entity_name(self, name: str) -> str:
        prefixes = ['mr.', 'mrs.', 'ms.', 'dr.', 'the']
        name = name.lower()
        for prefix in prefixes:
            if name.startswith(prefix + ' '):
                name = name[len(prefix)+1:]
        return name.strip().title()

    def _extract_org_person_relations(self, doc, entities: Dict[str, set]) -> List[Dict]:
        relationships = []
        
        for sent in doc.sents:
            sent_text = sent.text.lower()
            
            for org in entities['ORG']:
                for person in entities['PER']:
                    if org.lower() in sent_text and person.lower() in sent_text:
                        for rel_type, pattern in self.org_person_patterns.items():
                            if re.search(pattern, sent_text):
                                crime_types = [ct for ct in entities['CRIME_TYPES'] 
                                            if ct.lower() in sent_text]
                                crime_type = crime_types[0] if crime_types else "Unknown"
                                
                                relationships.append({
                                    'subject': org,
                                    'subject_type': 'ORG',
                                    'predicate': f'ORG_PER_{rel_type}',
                                    'object': person,
                                    'object_type': 'PER',
                                    'crime_type': crime_type,
                                    'sentence': sent.text,
                                    'evidence_strength': self._assess_evidence_strength(sent_text)
                                })
        
        return relationships

    def _extract_org_org_relations(self, doc, entities: Dict[str, set]) -> List[Dict]:
        """Extract relationships between organizations."""
        relationships = []
        
        for sent in doc.sents:
            sent_text = sent.text.lower()
            
            # Check pairs of organizations
            orgs = list(entities['ORG'])
            for i, org1 in enumerate(orgs):
                for org2 in orgs[i+1:]:  # Avoid self-relationships
                    if org1.lower() in sent_text and org2.lower() in sent_text:
                        for rel_type, pattern in self.org_org_patterns.items():
                            if re.search(pattern, sent_text):
                                crime_types = [ct for ct in entities['CRIME_TYPES'] 
                                            if ct.lower() in sent_text]
                                crime_type = crime_types[0] if crime_types else "Unknown"
                                
                                relationships.append({
                                    'subject': org1,
                                    'subject_type': 'ORG',
                                    'predicate': f'ORG_ORG_{rel_type}',
                                    'object': org2,
                                    'object_type': 'ORG',
                                    'crime_type': crime_type,
                                    'sentence': sent.text,
                                    'evidence_strength': self._assess_evidence_strength(sent_text)
                                })
        
        return relationships

    def _extract_criminal_activities(self, doc, entities: Dict[str, set]) -> List[Dict]:
        relationships = []
        
        for sent in doc.sents:
            sent_text = sent.text.lower()
            
            for crime_type, pattern in self.crime_indicators.items():
                if re.search(pattern, sent_text):
                    # Find involved organizations and persons
                    orgs = [org for org in entities['ORG'] 
                           if org.lower() in sent_text]
                    persons = [per for per in entities['PER'] 
                             if per.lower() in sent_text]
                    
                    # Create org-person relationships
                    for org in orgs:
                        for person in persons:
                            relationships.append({
                                'subject': org,
                                'subject_type': 'ORG',
                                'predicate': crime_type,
                                'object': person,
                                'object_type': 'PER',
                                'crime_type': crime_type,
                                'sentence': sent.text,
                                'evidence_strength': self._assess_evidence_strength(sent_text)
                            })
                    
                    # Create org-org relationships
                    for i, org1 in enumerate(orgs):
                        for org2 in orgs[i+1:]:
                            relationships.append({
                                'subject': org1,
                                'subject_type': 'ORG',
                                'predicate': crime_type,
                                'object': org2,
                                'object_type': 'ORG',
                                'crime_type': crime_type,
                                'sentence': sent.text,
                                'evidence_strength': self._assess_evidence_strength(sent_text)
                            })
        
        return relationships

    def _assess_evidence_strength(self, text: str) -> str:
        strong_indicators = r'(?i)(confirm|prove|evidence|document|record|witness)'
        moderate_indicators = r'(?i)(allege|suspect|believe|report)'
        weak_indicators = r'(?i)(rumor|might|maybe|possibly|could)'
        
        if re.search(strong_indicators, text):
            return 'strong'
        elif re.search(moderate_indicators, text):
            return 'moderate'
        elif re.search(weak_indicators, text):
            return 'weak'
        return 'unspecified'

In [10]:
def process_dataset(df: pd.DataFrame) -> pd.DataFrame:
    extractor = OrganizationalCrimeExtractor()
    relationships = []
    
    for idx, row in df.iterrows():
        row_relationships = extractor.extract_relationships(row)
        for rel in row_relationships:
            rel['source_idx'] = idx
        relationships.extend(row_relationships)
    
    rel_df = pd.DataFrame(relationships)
    
    if not rel_df.empty:
        print("\nRelationship Analysis:")
        print(f"Total relationships: {len(rel_df)}")
        print("\nRelationship types:")
        print(rel_df['predicate'].value_counts())
        print("\nEntity type pairs:")
        print(pd.crosstab(rel_df['subject_type'], rel_df['object_type']))
        print("\nEvidence strength distribution:")
        print(rel_df['evidence_strength'].value_counts())
        print("\nTop crime types:")
        print(rel_df['crime_type'].value_counts().head())
    
    return rel_df

In [11]:
if __name__ == "__main__":
    # Load and process data
    df = pd.read_csv('/Users/damienfoo/Desktop/SMUBIA Datathon Lunar Logic/FINAL FINAL PLEASE/Data/process2_cleaned.csv')
    relationships_df = process_dataset(df)
    
    # Save outputs
    relationships_df.to_csv('/Users/damienfoo/Desktop/SMUBIA Datathon Lunar Logic/FINAL FINAL PLEASE/Data/process3_crime_relationships_enhanced.csv', index=False)
    


Relationship Analysis:
Total relationships: 15319

Relationship types:
predicate
ORG_ORG_FUNDING         3579
ORG_PER_INSTRUCTED      1762
ORG_PER_FUNDED          1705
ORG_PER_SUPERVISED      1051
ORG_ORG_CONTROL         1038
ORG_PER_EMPLOYED        1029
ORG_ORG_SUPPLY           947
FRAUD                    763
CONSPIRACY               687
ORG_ORG_FACILITATION     665
ORG_PER_AUTHORIZED       407
CORRUPTION               372
ORG_ORG_PARTNERSHIP      363
ORG_PER_FACILITATED      247
ORG_ORG_FRONT            244
TRAFFICKING              177
ORG_PER_COLLABORATED     133
ORG_ORG_CONSPIRACY        88
MONEY_LAUNDERING          35
CYBERCRIME                27
Name: count, dtype: int64

Entity type pairs:
object_type    ORG   PER
subject_type            
ORG           8088  7231

Evidence strength distribution:
evidence_strength
unspecified    9415
strong         2844
moderate       2783
weak            277
Name: count, dtype: int64

Top crime types:
crime_type
Unknown        13255
FRAUD     