In [22]:
import pandas as pd 
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
import streamlit as st
from typing import List, Dict, Tuple
import json
import logging
from transformers import pipeline

In [23]:
class CrimeDataRAG:
    """Retrieval-augmented question answering over crime-network CSV data.

    Each row of the nodes table is rendered as a plain-text document that also
    lists the entity's relationships and crime patterns. The documents are
    embedded with a SentenceTransformer model and stored in a FAISS flat L2
    index. A GPT-2 text-generation pipeline, when it loads successfully,
    produces the final answer from the retrieved documents.
    """

    # Upper bound on *generated* tokens per answer; prompt tokens are excluded.
    MAX_NEW_TOKENS = 200

    def __init__(self):
        """Load the embedding model and, best-effort, the text generator."""
        # Sentence-transformer model used for both documents and queries.
        self.embed_model = SentenceTransformer('all-MiniLM-L6-v2')

        # The generator is optional: if it cannot be loaded the class falls
        # back to returning the retrieved context verbatim. No length limit is
        # fixed here; generate_response() passes max_new_tokens per call so the
        # prompt length does not eat into the answer budget.
        try:
            self.generator = pipeline('text-generation', model='gpt2')
        except Exception as e:
            logging.error(f"Failed to load text generation model: {e}")
            self.generator = None

        # FAISS index (None until documents have been embedded) and the
        # documents it indexes; position i in the index is self.documents[i].
        self.vector_store = None
        self.documents = []

    def load_and_process_data(self, nodes_path: str, edges_path: str, patterns_path: str) -> None:
        """Read the three CSVs, build one document per node and index them.

        Args:
            nodes_path: CSV with columns Entity, Type, NumCrimes, Crimes.
            edges_path: CSV with columns Source, Target, Relationship, CrimeType.
            patterns_path: CSV with columns Entity, CrimeType.

        Raises:
            Exception: any error from reading or processing is logged and
                re-raised unchanged.
        """
        try:
            nodes_df = pd.read_csv(nodes_path)
            edges_df = pd.read_csv(edges_path)
            patterns_df = pd.read_csv(patterns_path)

            # Build every document first and only then extend the corpus, so a
            # failure part-way through cannot leave self.documents out of step
            # with the existing index.
            new_documents = [
                self._create_entity_document(node, edges_df, patterns_df)
                for _, node in nodes_df.iterrows()
            ]
            self.documents.extend(new_documents)
            self._create_vector_store()

        except Exception as e:
            logging.error(f"Error loading data: {e}")
            raise

    def _create_entity_document(self, node: pd.Series, edges_df: pd.DataFrame,
                              patterns_df: pd.DataFrame) -> str:
        """Render one node row, its edges and its patterns as plain text.

        Args:
            node: Row of the nodes table (Entity, Type, NumCrimes, Crimes).
            edges_df: Full edges table; rows where the entity is Source or
                Target are listed under "Relationships:".
            patterns_df: Full patterns table; rows whose Entity matches are
                listed under "Crime Patterns:".

        Returns:
            The newline-joined document text.
        """
        entity = node['Entity']
        doc = [
            f"Entity: {entity}",
            f"Type: {node['Type']}",
            f"Number of Crimes: {node['NumCrimes']}",
            f"Crimes: {node['Crimes']}"
        ]

        # Relationships: every edge touching this entity, in either direction.
        entity_edges = edges_df[
            (edges_df['Source'] == entity) |
            (edges_df['Target'] == entity)
        ]
        if not entity_edges.empty:
            doc.append("\nRelationships:")
            for _, edge in entity_edges.iterrows():
                # Report whichever endpoint is not the entity itself.
                other_entity = edge['Target'] if edge['Source'] == entity else edge['Source']
                doc.append(f"- Connected to {other_entity} through {edge['Relationship']} "
                         f"(Crime: {edge['CrimeType']})")

        # Patterns recorded for this entity.
        entity_patterns = patterns_df[patterns_df['Entity'] == entity]
        if not entity_patterns.empty:
            doc.append("\nCrime Patterns:")
            for _, pattern in entity_patterns.iterrows():
                doc.append(f"- Involved in {pattern['CrimeType']}")

        return "\n".join(doc)

    def _create_vector_store(self) -> None:
        """(Re)build the FAISS index from the current ``self.documents``.

        With no documents there is nothing to embed and no embedding
        dimension to read, so the store is reset to ``None`` instead.
        """
        if not self.documents:
            self.vector_store = None
            return

        embeddings = np.ascontiguousarray(
            self.embed_model.encode(self.documents), dtype='float32'
        )
        index = faiss.IndexFlatL2(embeddings.shape[1])
        index.add(embeddings)
        self.vector_store = index

    def get_relevant_context(self, query: str, k: int = 3) -> List[str]:
        """Return up to ``k`` documents closest to ``query``.

        Args:
            query: Free-text question to embed and search with.
            k: Maximum number of documents to return.

        Returns:
            The matching documents, nearest first. Empty when no index has
            been built, the corpus is empty, or ``k`` is not positive.
        """
        if self.vector_store is None or not self.documents or k <= 0:
            return []

        # Never ask for more neighbours than there are documents.
        k = min(k, len(self.documents))
        query_embedding = np.ascontiguousarray(
            self.embed_model.encode([query]), dtype='float32'
        )
        _, indices = self.vector_store.search(query_embedding, k)

        # FAISS pads missing results with -1; used as a Python index that
        # would silently return the *last* document, so drop anything out of
        # range.
        total = len(self.documents)
        return [self.documents[i] for i in indices[0] if 0 <= i < total]

    def generate_response(self, query: str, context: List[str]) -> str:
        """Answer ``query`` from ``context`` using the text generator.

        Args:
            query: The user's question.
            context: Retrieved documents to ground the answer in.

        Returns:
            The text following the last "Answer:" marker in the generated
            output. If no generator is loaded, or generation fails, a
            message followed by the raw context is returned instead.
        """
        if not self.generator:
            return "Text generation model not available. Using retrieval only.\n\n" + \
                   "\n".join(context)

        prompt = f"Based on this crime network information:\n{' '.join(context)}\n\n" + \
                f"Question: {query}\n\nAnswer:"

        try:
            # max_new_tokens limits only the continuation. The previous
            # max_length limit also counted the prompt, which already holds
            # several documents and can consume the whole budget.
            outputs = self.generator(prompt, max_new_tokens=self.MAX_NEW_TOKENS)
            response = outputs[0]['generated_text']
            return response.split("Answer:")[-1].strip()
        except Exception as e:
            logging.error(f"Error generating response: {e}")
            return "Error generating response. Here is the relevant context:\n\n" + \
                   "\n".join(context)


In [24]:
def create_streamlit_app(data_dir=None):
    """Render the Streamlit chat UI for querying the crime-network RAG system.

    Args:
        data_dir: Directory containing ``crime_network_clean_nodes.csv``,
            ``crime_network_clean_edges.csv`` and
            ``crime_network_clean_patterns.csv``. When omitted, the original
            hard-coded location is used.
    """
    from pathlib import Path  # local import so this cell needs no extra top-level import

    if data_dir is None:
        data_dir = "/Users/damienfoo/Desktop/SMUBIA Datathon Lunar Logic/FINAL FINAL PLEASE/Data/Tablaeu Data"

    st.title("Crime Network Analysis Chatbot")

    @st.cache_resource
    def load_rag_system(source_dir: str):
        """Build the RAG system once per data directory; raises on failure."""
        rag = CrimeDataRAG()
        base = Path(source_dir)
        # load_and_process_data accepts exactly three paths (nodes, edges,
        # patterns). The call previously passed seven, which raised TypeError
        # on every run; the risk-score, feature-importance and process2/3
        # CSVs are not consumed by the loader, so they are no longer passed.
        rag.load_and_process_data(
            str(base / "crime_network_clean_nodes.csv"),
            str(base / "crime_network_clean_edges.csv"),
            str(base / "crime_network_clean_patterns.csv"),
        )
        return rag

    # Handle failures outside the cached function: a None return value would
    # be cached like any other result and pin the failure across reruns.
    try:
        rag = load_rag_system(str(data_dir))
    except Exception as e:
        st.error(f"Error initializing system: {e}")
        rag = None

    if rag is None:
        st.error("Failed to initialize the system. Please check the logs.")
        return

    query = st.text_input("Ask a question about the crime network:")

    if query:
        try:
            with st.spinner("Searching relevant information..."):
                context = rag.get_relevant_context(query)

            with st.spinner("Generating response..."):
                response = rag.generate_response(query, context)

            st.write("Response:", response)

            # Show the retrieved documents so the answer can be checked.
            with st.expander("View Source Context"):
                for i, doc in enumerate(context, 1):
                    st.text(f"Document {i}:\n{doc}\n")

        except Exception as e:
            st.error(f"Error processing query: {e}")

In [25]:
def _launch_app() -> None:
    """Set up INFO-level logging, then build the Streamlit interface."""
    logging.basicConfig(level=logging.INFO)
    create_streamlit_app()


# Start the app only when this code runs as the top-level module.
if __name__ == "__main__":
    _launch_app()

INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: mps
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2
Device set to use mps:0
