# Getting the respective templates in an excel

In [14]:
import pandas as pd
import docx
import os
from openai import OpenAI
import json
import re
import dotenv

dotenv.load_dotenv()
openai_key = os.getenv('OPENAI_KEY')

In [19]:
class SportsTemplateExtractor:
    def __init__(self, openai_api_key, docx_folder=".", output_file="sports_templates.xlsx"):
        self.client = OpenAI(api_key=openai_api_key)
        self.docx_folder = docx_folder
        self.output_file = output_file
        self.templates_df = pd.DataFrame(columns=[
            'sport', 'event_category', 'gender', 'event_name', 
            'event_type', 'template', 'fields', 'sample_data'
        ])
    
    def read_docx(self, file_path):
        """Extract text from docx file"""
        try:
            doc = docx.Document(file_path)
            full_text = []
            for paragraph in doc.paragraphs:
                if paragraph.text.strip():
                    full_text.append(paragraph.text)
            return '\n'.join(full_text)
        except Exception as e:
            print(f"Error reading {file_path}: {e}")
            return ""
    
    def extract_sport_name(self, filename):
        """Extract sport name from filename"""
        sport_name = filename.replace('2023 ', '').replace(' Results', '')
        sport_name = sport_name.replace(' Whatsapp Template.docx', '')
        sport_name = sport_name.replace(' WhatsApp Template.docx', '')
        sport_name = sport_name.replace('...', ' ')
        sport_name = sport_name.replace(' - ', ' ')
        
        # Handle special cases
        special_cases = {
            'Volleyball (Indoor)': 'Volleyball Indoor',
            'Volleyball (Beach)': 'Volleyball Beach',
            'Hockey (Indoor)': 'Hockey Indoor',
            'Hockey (Field)': 'Hockey Field',
            'Traditional Boat Ra': 'Traditional Boat Racing',
            'Teqball (Demo Spo': 'Teqball',
            'Table Tennis - Res': 'Table Tennis',
            'Sepak Takraw Res': 'Sepak Takraw',
            'Pencak Silat Result': 'Pencak Silat',
            'Finswimming - Res': 'Finswimming',
            'Gymnastics - Resu': 'Gymnastics'
        }
        
        for key, value in special_cases.items():
            if key in filename:
                return value
        
        return sport_name.strip()
    
    def process_with_chatgpt(self, text, sport_name):
        """Use ChatGPT to extract templates from text"""
        prompt = f"""
        Analyze this sports results template for {sport_name} and extract all unique templates.
        
        Text: {text}
        
        For each unique template pattern found, identify:
        1. Event category (e.g., "WOMEN 200M HEAT", "MEN POLE VAULT FINAL")
        2. Gender (Men/Women/Mixed)
        3. Event name (e.g., "200M", "Pole Vault", "Marathon")
        4. Event type (Heat/Final/Qualification/Preliminary/etc.)
        5. Template structure with variables in {{}} format
        6. List of all variable fields
        7. Sample data from the text
        
        Look for patterns like:
        - Event headers with asterisks: *SPORT – GENDER EVENT TYPE*
        - Athlete names with country codes: NAME (SGP)
        - Performance metrics: Time:, Height:, Score:, etc.
        - Placement information: finished X out of Y
        - Additional comments about records, advancement, etc.
        
        Return as JSON with this structure:
        {{
            "templates": [
                {{
                    "event_category": "WOMEN 200M HEAT",
                    "gender": "Women",
                    "event_name": "200M",
                    "event_type": "Heat",
                    "template": "*{{SPORT}} – {{GENDER}} {{EVENT}} {{TYPE}}*\\n{{NAME}} (SGP)\\nTime: {{TIME}}. {{PRONOUN}} finished {{PLACEMENT}} out of {{TOTAL}} and {{ADVANCEMENT_STATUS}}. {{ADDITIONAL_COMMENTS}}",
                    "fields": ["SPORT", "GENDER", "EVENT", "TYPE", "NAME", "TIME", "PRONOUN", "PLACEMENT", "TOTAL", "ADVANCEMENT_STATUS", "ADDITIONAL_COMMENTS"],
                    "sample_data": "Actual example from the text"
                }}
            ]
        }}
        
        Extract ALL unique template patterns from the text.
        """
        
        try:
            response = self.client.chat.completions.create(
                model="gpt-4-turbo",
                messages=[
                    {"role": "system", "content": "You are an expert at analyzing sports result templates and extracting structured data. Always return valid JSON."},
                    {"role": "user", "content": prompt}
                ],
                temperature=0.1,
                max_tokens=4000
            )
            
            content = response.choices[0].message.content
            if content.startswith('```json'):
                content = content.replace('```json', '').replace('```', '')
            
            result = json.loads(content)
            return result
        except json.JSONDecodeError as e:
            print(f"JSON decode error for {sport_name}: {e}")
            return {"templates": []}
        except Exception as e:
            print(f"Error processing {sport_name} with ChatGPT: {e}")
            return {"templates": []}
    
    def process_single_file(self, file_path):
        """Process a single docx file"""
        filename = os.path.basename(file_path)
        sport_name = self.extract_sport_name(filename)
        
        print(f"Processing: {sport_name}")
        
        text = self.read_docx(file_path)
        if not text:
            print(f"No text found in {filename}")
            return
        
        result = self.process_with_chatgpt(text, sport_name)
        
        templates_added = 0
        for template_data in result.get('templates', []):
            new_row = {
                'sport': sport_name,
                'event_category': template_data.get('event_category', ''),
                'gender': template_data.get('gender', ''),
                'event_name': template_data.get('event_name', ''),
                'event_type': template_data.get('event_type', ''),
                'template': template_data.get('template', ''),
                'fields': json.dumps(template_data.get('fields', [])),
                'sample_data': template_data.get('sample_data', '')
            }
            self.templates_df = pd.concat([self.templates_df, pd.DataFrame([new_row])], ignore_index=True)
            templates_added += 1
        
        print(f"  Added {templates_added} templates for {sport_name}")
    
    def process_all_files(self):
        """Process all docx files in folder"""
        docx_files = [f for f in os.listdir(self.docx_folder) if f.endswith('.docx')]
        print(f"Found {len(docx_files)} DOCX files to process")
        
        for i, filename in enumerate(docx_files, 1):
            print(f"\n[{i}/{len(docx_files)}] Processing: {filename}")
            file_path = os.path.join(self.docx_folder, filename)
            self.process_single_file(file_path)
    
    def save_to_excel(self):
        """Save templates to Excel"""
        if len(self.templates_df) == 0:
            print("No templates found to save!")
            return
            
        with pd.ExcelWriter(self.output_file, engine='openpyxl') as writer:
            self.templates_df.to_excel(writer, sheet_name='All Templates', index=False)
            
            summary_df = self.templates_df.groupby('sport').size().reset_index(name='template_count')
            summary_df.to_excel(writer, sheet_name='Summary by Sport', index=False)
            
            for sport in self.templates_df['sport'].unique():
                sport_df = self.templates_df[self.templates_df['sport'] == sport]
                sheet_name = sport[:31]
                sport_df.to_excel(writer, sheet_name=sheet_name, index=False)
        
        print(f"\nTemplates saved to: {self.output_file}")
        print(f"Total templates extracted: {len(self.templates_df)}")
        print(f"Sports covered: {len(self.templates_df['sport'].unique())}")
    
    def run(self):
        """Main execution method"""
        print("Starting Sports Template Extraction...")
        self.process_all_files()
        self.save_to_excel()
        print("Extraction complete!")



OPENAI_API_KEY = openai_key 
DOCX_FOLDER = "Major-Games-Reporting/Whatsapp Templates/Whatsapp Templates" 
OUTPUT_FILE = "sports_templates.xlsx"


# Run extraction
extractor = SportsTemplateExtractor(
    openai_api_key=OPENAI_API_KEY,
    docx_folder=DOCX_FOLDER,
    output_file=OUTPUT_FILE
)

extractor.run()


Starting Sports Template Extraction...
Found 37 DOCX files to process

[1/37] Processing: 2023 Gymnastics - Results Whatsapp Template.docx
Processing: Gymnastics
  Added 3 templates for Gymnastics

[2/37] Processing: 2023 Sepak Takraw Results Whatsapp Template.docx
Processing: Sepak Takraw
  Added 6 templates for Sepak Takraw

[3/37] Processing: 2023 E-Sports Results Whatsapp Template.docx
Processing: E-Sports
  Added 5 templates for E-Sports

[4/37] Processing: 2023 Billiards Sports Results Whatsapp Template.docx
Processing: Billiards Sports
  Added 5 templates for Billiards Sports

[5/37] Processing: 2023 Volleyball (Beach) Results WhatsApp Template.docx
Processing: Volleyball Beach
  Added 6 templates for Volleyball Beach

[6/37] Processing: 2023 Traditional Boat Race Results WhatsApp Template.docx
Processing: Traditional Boat Racing
  Added 7 templates for Traditional Boat Racing

[7/37] Processing: 2023 Floorball - Results WhatsApp Template.docx
Processing: Floorball -
  Added 4 t