In [None]:
!pip install matplotlib

In [None]:
import pandas as pd
import networkx as nx
import re
import os
from datetime import datetime
import matplotlib.pyplot as plt
import pickle

class EventKnowledgeGraphExtractor:
    """Regex-based extractor that turns event-invitation text into small graphs.

    For each text the extractor pulls out five fields (event name, date, time,
    venue, host) with ordered lists of regular expressions, builds a
    ``networkx.DiGraph`` centred on the event, renders it to a PNG and, via
    :meth:`process_excel`, writes the collected results to an Excel sheet and
    a pickle file.

    The ``*_patterns`` attributes are plain lists of pattern strings. Every
    extractor tries its list in order and returns on the first pattern that
    matches anywhere in the text, so more specific patterns must be listed
    before the more general ones they overlap with.
    """

    def __init__(self):
        """Set up the ordered pattern lists used by the ``extract_*`` methods."""
        # Date patterns
        self.date_patterns = [
            # Full month names with optional ordinal suffixes
            r'\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2}(?:st|nd|rd|th)?,?\s+\d{4}\b',
            # Abbreviated month names with optional ordinal suffixes
            r'\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Sept|Oct|Nov|Dec)\.?\s+\d{1,2}(?:st|nd|rd|th)?,?\s+\d{4}\b',
            # Numeric formats
            r'\b\d{1,2}[-/]\d{1,2}[-/]\d{2,4}\b',
            r'\b\d{4}[-/]\d{1,2}[-/]\d{1,2}\b',
        ]

        # Time patterns.
        # The meridiem is followed by (?!\w) instead of \b: a trailing \b can
        # never match after the final dot of "a.m."/"p.m." when whitespace or
        # end-of-text follows, which made those alternatives dead.
        self.time_patterns = [
            # 12-hour time with timezone. Listed first: the plain 12-hour
            # pattern below matches the same text and would drop the zone.
            r'\b\d{1,2}:\d{2}\s*(?:AM|PM|am|pm|a\.m\.|p\.m\.)\s*(?:PST|EST|IST|CST|MST|PDT|EDT|CDT|MDT)\b',
            # Standard 12-hour format with AM/PM (with or without space)
            r'\b\d{1,2}:\d{2}\s*(?:AM|PM|am|pm|a\.m\.|p\.m\.)(?!\w)',
            # Just hour with AM/PM
            r'\b\d{1,2}\s*(?:AM|PM|am|pm|a\.m\.|p\.m\.)(?!\w)',
            # 24-hour format, only when followed by a cue word or end of text
            r'\b\d{1,2}:\d{2}\b(?=\s*(?:hours?|in|on|at|$))',
        ]

        # Online/Virtual venue patterns. Two-word forms come first; otherwise
        # the single-word "Online"/"Virtual" patterns always win.
        self.online_venue_patterns = [
            r'\b(Online\s+Event)\b',
            r'\b(Virtual\s+Event)\b',
            r'\b(ONLINE)\b',
            r'\b(ZOOM)\b',
            r'\b(Virtual)\b',
            r'\b(Webinar)\b',
        ]

        # Location patterns (including typos)
        self.location_patterns = [
            r'\b(San[\s-]Francisco)\b',
            r'\b(San[\s-]Daiago)\b',
            r'\b(Sans\s+Francisco)\b',
            r'\b(Bangalore)\b',
            r'\b(Riyadh)\b',
            r'\bVenue:\s*([A-Za-z\s,]+?)(?=\n|$)',
        ]

        # Host patterns. Each pattern exposes the host as group 1; the last
        # one previously had no group, so match.group(1) raised IndexError.
        self.host_patterns = [
            r'hosted by\s+([^!.\n]+)',
            r'[Ww]armly,?\s*\n\s*(.+?)(?:\n|$)',
            r'[Bb]est,?\s*\n\s*(.+?)(?:\n|$)',
            r'(Product School\s+\w+)',
        ]

    @staticmethod
    def _captured(match):
        """Return group 1 of *match* when the pattern defines it, else the whole match."""
        if match.re.groups and match.group(1) is not None:
            return match.group(1)
        return match.group(0)

    @staticmethod
    def _cell_text(value):
        """Return a cell value as text, or ``None`` for missing/blank cells."""
        # Test for NaN/None on the raw value: str(nan) is the non-empty
        # string "nan", which would otherwise be processed as an event.
        if pd.isna(value):
            return None
        text = str(value)
        return text if text.strip() else None

    def extract_event_name(self, text):
        """Extract the event name from a ``Subject:`` line or a quoted phrase.

        Returns the cleaned subject (without the "You're Invited!" prefix and
        the "in San Francisco..." / "with Product School..." suffixes), else
        the first double-quoted phrase, else ``"Unknown Event"``.
        """
        # Try to get from subject line
        subject_match = re.search(
            r'Subject:\s*(?:You\'re Invited!\s*)?(.+?)(?:\n|$)', text, re.IGNORECASE
        )
        if subject_match:
            event = subject_match.group(1).strip()
            # Clean up common suffixes
            event = re.sub(r'\s+in\s+San[\s-]Francisco.*$', '', event, flags=re.IGNORECASE)
            event = re.sub(r'\s+with\s+Product School.*$', '', event, flags=re.IGNORECASE)
            event = re.sub(r'^You\'re Invited!\s*', '', event, flags=re.IGNORECASE)
            return event.strip()

        # Fallback: look for quoted event names
        quoted = re.search(r'"([^"]+)"', text)
        if quoted:
            return quoted.group(1).strip()

        return "Unknown Event"

    def extract_date(self, text):
        """Return the first date matched by ``date_patterns`` or ``"Date Not Found"``."""
        for pattern in self.date_patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                return match.group(0).strip()
        return "Date Not Found"

    def extract_time(self, text):
        """Return the first time matched by ``time_patterns`` or ``"Time Not Found"``."""
        for pattern in self.time_patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                return match.group(0).strip()
        return "Time Not Found"

    def extract_venue(self, text):
        """Extract the venue, preferring a physical location when one is named.

        Returns ``"<location> (Online)"`` when both an online keyword and a
        location are present, the location or the online keyword alone when
        only one is present, and ``"Venue Not Found"`` otherwise.
        """
        online_venue = None
        for pattern in self.online_venue_patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                online_venue = self._captured(match).strip()
                break

        for pattern in self.location_patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                location = self._captured(match).strip()
                if online_venue is not None:
                    return f"{location} (Online)"
                return location

        if online_venue is not None:
            return online_venue
        return "Venue Not Found"

    def extract_host(self, text):
        """Extract host/organizer information.

        Tries ``host_patterns`` in order and returns the first candidate whose
        cleaned length is between 4 and 99 characters; falls back to
        ``"Product School Bangalore"`` when nothing suitable is found.
        """
        for pattern in self.host_patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if not match:
                continue
            host = self._captured(match)
            # Drop bracketed placeholders, collapse whitespace runs, and trim
            # whatever the removals left at the edges.
            host = re.sub(r'\[.*?\]', '', host)
            host = re.sub(r'\s+', ' ', host).strip()
            if 3 < len(host) < 100:
                return host

        return "Product School Bangalore"  # Default host from data

    def extract_entities(self, text):
        """Run every extractor on *text* and return the results as a dict.

        Keys: ``'event'``, ``'date'``, ``'time'``, ``'venue'``, ``'host'``.
        """
        return {
            'event': self.extract_event_name(text),
            'date': self.extract_date(text),
            'time': self.extract_time(text),
            'venue': self.extract_venue(text),
            'host': self.extract_host(text)
        }

    def create_knowledge_graph(self, entities, row_id):
        """Create a NetworkX directed graph for one set of extracted entities.

        The graph has a central ``Event_<row_id>`` node. Date, time and venue
        nodes are added only when the field was found (i.e. is not its
        "... Not Found" sentinel) and are linked event -> node. A host node,
        added when the host value is truthy, is linked host -> event.
        """
        G = nx.DiGraph()

        # Central event node
        event_node = f"Event_{row_id}"
        G.add_node(event_node, type='event', name=entities['event'])

        # (entity key, node-name prefix, "missing" sentinel, edge relation)
        optional_fields = (
            ('date', 'Date', "Date Not Found", 'occurs_on'),
            ('time', 'Time', "Time Not Found", 'starts_at'),
            ('venue', 'Venue', "Venue Not Found", 'located_at'),
        )
        for key, prefix, missing, relation in optional_fields:
            if entities[key] != missing:
                node = f"{prefix}_{row_id}"
                G.add_node(node, type=key, value=entities[key])
                G.add_edge(event_node, node, relation=relation)

        if entities['host']:
            host_node = f"Host_{row_id}"
            G.add_node(host_node, type='host', value=entities['host'])
            # Note the direction: the host points at the event.
            G.add_edge(host_node, event_node, relation='hosts')

        return G

    def visualize_graph(self, G, filename, output_dir):
        """Draw *G* and save it as ``<output_dir>/<filename>.png``.

        Nodes are coloured by their ``type`` attribute and labelled with the
        first 30 characters of their ``name``/``value`` attribute; edges are
        labelled with their ``relation`` attribute.
        """
        fig = plt.figure(figsize=(12, 8))
        try:
            pos = nx.spring_layout(G, k=2, iterations=50)

            # Color nodes by type
            color_map = {
                'event': '#FF6B6B',
                'date': '#4ECDC4',
                'time': '#45B7D1',
                'venue': '#FFA07A',
                'host': '#98D8C8'
            }

            # 'gray' (not '#gray', which is not a valid matplotlib colour)
            # is the fallback for node types missing from color_map.
            node_colors = [
                color_map.get(G.nodes[node].get('type', 'event'), 'gray')
                for node in G.nodes()
            ]

            # Draw graph
            nx.draw(G, pos, node_color=node_colors, node_size=3000,
                    with_labels=False, arrows=True, edge_color='gray',
                    arrowsize=20, arrowstyle='->', width=2)

            # Add labels
            labels = {}
            for node in G.nodes():
                node_data = G.nodes[node]
                if 'name' in node_data:
                    labels[node] = str(node_data['name'])[:30]
                elif 'value' in node_data:
                    labels[node] = str(node_data['value'])[:30]
                else:
                    labels[node] = str(node)[:15]

            nx.draw_networkx_labels(G, pos, labels, font_size=8)

            # Add edge labels
            edge_labels = nx.get_edge_attributes(G, 'relation')
            nx.draw_networkx_edge_labels(G, pos, edge_labels, font_size=7)

            plt.title(f"Knowledge Graph: {filename}", fontsize=14, fontweight='bold')
            plt.axis('off')
            plt.tight_layout()

            # Save
            output_path = os.path.join(output_dir, f"{filename}.png")
            plt.savefig(output_path, dpi=300, bbox_inches='tight')
        finally:
            # Always release the figure, even if drawing or saving failed.
            plt.close(fig)

    def process_excel(self, excel_path, column_name, output_dir='KG_Output'):
        """Extract entities from every non-empty cell of one Excel column.

        Args:
            excel_path: Path of the workbook to read.
            column_name: Column whose cells hold the text to analyse.
            output_dir: Directory for all outputs (created if needed).

        Returns:
            ``(output_df, all_graphs)``: a DataFrame with one row per
            processed cell and a dict mapping ``"event_<index>"`` to its graph.

        Raises:
            ValueError: If *column_name* is not a column of the workbook.

        Side effects: writes ``extracted_KG_entities.xlsx``,
        ``all_knowledge_graphs.pkl`` and ``graphs/KG_Event_<index>.png`` under
        *output_dir*, and prints progress and a summary.
        """
        # Create output directory
        os.makedirs(output_dir, exist_ok=True)
        graphs_dir = os.path.join(output_dir, 'graphs')
        os.makedirs(graphs_dir, exist_ok=True)

        # Read Excel
        print(f"Reading Excel file: {excel_path}")
        df = pd.read_excel(excel_path)

        if column_name not in df.columns:
            raise ValueError(f"Column '{column_name}' not found in Excel file!")

        # Storage for results
        all_graphs = {}
        extracted_data = []
        total_rows = len(df)

        # Process each row
        for idx, row in df.iterrows():
            text = self._cell_text(row[column_name])
            if text is None:
                # Missing (NaN) or whitespace-only cell: nothing to extract.
                continue

            print(f"Processing row {idx + 1}/{total_rows}...")

            # Extract entities
            entities = self.extract_entities(text)

            # Create knowledge graph
            G = self.create_knowledge_graph(entities, idx)
            all_graphs[f"event_{idx}"] = G

            # Visualize and save graph
            self.visualize_graph(G, f"KG_Event_{idx}", graphs_dir)

            # Store extracted data
            extracted_data.append({
                'Row_ID': idx,
                'Extracted_Event': entities['event'],
                'Extracted_Host': entities['host'],
                'Extracted_Venue': entities['venue'],
                'Extracted_Date': entities['date'],
                'Extracted_Time': entities['time']
            })

        # Save all graphs as pickle
        graphs_path = os.path.join(output_dir, 'all_knowledge_graphs.pkl')
        with open(graphs_path, 'wb') as f:
            pickle.dump(all_graphs, f)
        print(f"Saved all knowledge graphs to: {graphs_path}")

        # Create output DataFrame and Excel
        output_df = pd.DataFrame(extracted_data)
        output_excel_path = os.path.join(output_dir, 'extracted_KG_entities.xlsx')
        output_df.to_excel(output_excel_path, index=False)
        print(f"Saved extracted entities to: {output_excel_path}")

        # Print summary
        print("\n" + "="*60)
        print("EXTRACTION SUMMARY")
        print("="*60)
        print(f"Total events processed: {len(extracted_data)}")
        print(f"Knowledge graphs created: {len(all_graphs)}")
        print(f"Graphs visualized: {len(extracted_data)}")
        print("\nOutput files:")
        print(f"  - Entities Excel: {output_excel_path}")
        print(f"  - Graphs Pickle: {graphs_path}")
        print(f"  - Graph Images: {graphs_dir}/")
        print("="*60)

        return output_df, all_graphs


# Main execution
if __name__ == "__main__":
    # Run settings: the workbook to read, the column that holds the email
    # text, and the directory that receives every generated artifact.
    EXCEL_FILE = "generated_email_responses_1 user.xlsx"
    COLUMN_NAME = "response_Casual_Friendly_User"
    OUTPUT_DIR = "KG_Output"

    extractor = EventKnowledgeGraphExtractor()

    try:
        # Run the whole pipeline: extraction, graph building, rendering, export.
        extracted_df, knowledge_graphs = extractor.process_excel(
            EXCEL_FILE, COLUMN_NAME, OUTPUT_DIR
        )

        # Show the first ten extracted rows as a quick sanity check.
        print("\nSample Extracted Data:")
        print(extracted_df.head(10).to_string())

    except FileNotFoundError:
        # The workbook is missing: tell the user where it is expected.
        print(
            f"ERROR: Excel file '{EXCEL_FILE}' not found!",
            "Please ensure the file is in the same directory as this script.",
            sep="\n",
        )
    except Exception as exc:
        # Top-level boundary: report any other failure with its traceback.
        print(f"ERROR: {str(exc)}")
        import traceback
        traceback.print_exc()