# 🔍 Analyseur de Structure des Logs Mozilla CI

Ce notebook analyse la structure des logs pour identifier les patterns et préparer le parsing.



---
## 📦 1: Installation des dépendances


In [1]:
# Install dependencies. The two `!` lines are IPython shell escapes, so this
# cell only runs inside a notebook, not as a plain Python script.
#   - rarfile: Python package imported by the analysis cell to open .rar archives
#   - unrar: system package; presumably the extraction backend rarfile relies on -- confirm
!pip install -q rarfile
!apt-get install -qq unrar

# NOTE(review): printed unconditionally -- the exit status of the two commands
# above is not checked before reporting success.
print("‚úÖ D√©pendances install√©es avec succ√®s!")

✅ Dépendances installées avec succès!


---
## 📤 2: Upload des fichiers .rar

**3 fichiers .rar:**
- log-2018-06-01.rar
- log-2018-06-19.rar
- log-2018-06-20.rar

In [3]:
# Colab-only module: this import fails outside a Google Colab runtime.
from google.colab import files

print("üì§ Veuillez s√©lectionner vos fichiers .rar...")
# Opens the upload widget. Presumably blocks until the user has picked the
# files and returns one entry per uploaded file -- confirm against Colab docs.
# Only len(uploaded) is used below.
uploaded = files.upload()

print(f"\n‚úÖ {len(uploaded)} fichier(s) upload√©(s) avec succ√®s!")

📤 Veuillez sélectionner vos fichiers .rar...


Saving log-2018-06-19.rar to log-2018-06-19.rar
Saving log-2018-06-20.rar to log-2018-06-20.rar
Saving log-2018-06-01.rar to log-2018-06-01.rar

✅ 3 fichier(s) uploadé(s) avec succès!


---
## 🚀 3: Analyse

**Cette cellule contient tout le code d'analyse.**



In [4]:
import os
import re
import json
import random
from collections import defaultdict, Counter
from datetime import datetime
import rarfile

# Configuration
SAMPLE_SIZE = 100  # Maximum number of .txt files sampled from each .rar archive
OUTPUT_DIR = "/content/extracted_logs"  # Root folder for extracted logs (one day_<day> subfolder per archive)
REPORT_FILE = "/content/structure_report.json"  # JSON report written by generate_reports()
SUMMARY_FILE = "/content/summary_report.txt"  # Plain-text summary written by generate_reports()

# Banner printed once when the cell starts.
print("=" * 80)
print("üîç ANALYSEUR DE STRUCTURE DES LOGS MOZILLA CI")
print("=" * 80)
print()

# Create the extraction root up front; exist_ok=True makes re-running the cell harmless.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Function: extract a sample of .txt logs from each .rar archive
def extract_rar_files(rar_paths):
    """Extract up to SAMPLE_SIZE ``.txt`` members from every archive in *rar_paths*.

    The "day" key is taken from the archive file name: the text after the
    last ``-`` with ``.rar`` removed (``log-2018-06-01.rar`` -> ``"01"``).
    Members are written under ``OUTPUT_DIR/day_<day>``. When an archive holds
    more than SAMPLE_SIZE ``.txt`` members, a random sample of that size is
    taken; otherwise all of them are extracted.

    Paths that do not exist are skipped without a message. Any error raised
    while handling one archive is printed and that archive is left out of the
    result; the remaining archives are still processed.

    Returns:
        dict mapping each day string to the list of extracted file paths that
        exist on disk after extraction.
    """
    print("üì¶ EXTRACTION DES FICHIERS .RAR")
    print("-" * 80)
    per_day = {}

    for archive in rar_paths:
        if not os.path.exists(archive):
            continue

        archive_name = os.path.basename(archive)
        day = archive_name.split('-')[-1].replace('.rar', '')
        print(f"\nüìÇ Extraction de: {archive_name}")

        try:
            target_dir = os.path.join(OUTPUT_DIR, f"day_{day}")
            os.makedirs(target_dir, exist_ok=True)

            with rarfile.RarFile(archive) as rf:
                candidates = [member for member in rf.namelist() if member.endswith('.txt')]

                print(f"   üìä Nombre total de fichiers .txt: {len(candidates)}")

                if len(candidates) <= SAMPLE_SIZE:
                    chosen = candidates
                else:
                    chosen = random.sample(candidates, SAMPLE_SIZE)
                    print(f"   üìå √âchantillon s√©lectionn√©: {len(chosen)} fichiers")

                for member in chosen:
                    rf.extract(member, target_dir)

                # Keep only the members that really landed on disk.
                expected = (os.path.join(target_dir, member) for member in chosen)
                on_disk = [path for path in expected if os.path.exists(path)]

                per_day[day] = on_disk
                print(f"   ‚úÖ {len(on_disk)} fichiers extraits")

        except Exception as exc:
            # Best effort: a broken archive must not stop the other ones.
            print(f"   ‚ùå Erreur: {exc}")

    total = sum(map(len, per_day.values()))
    print("\n" + "=" * 80)
    print(f"‚úÖ EXTRACTION TERMIN√âE - Total: {total} fichiers")
    print("=" * 80 + "\n")
    return per_day

# Function: analyse a single log file
# The patterns are compiled once here instead of being rebuilt for every line.
_SECTION_RE = re.compile(r"Started\s+(.+?)\s+\(results")
_ERROR_RE = re.compile(r'\b(ERROR|FAIL|FAILURE|Exception|error)\b', re.IGNORECASE)
_METRICS_RE = re.compile(r'(CPU|RAM|Memory|I/O|bytes|utilization)', re.IGNORECASE)
_LOG_LEVEL_RE = re.compile(r'\s+(INFO|DEBUG|WARNING|ERROR|CRITICAL)\s+')
# (pattern source, compiled pattern): the source string is what gets reported.
_TIMESTAMP_RES = tuple(
    (source, re.compile(source))
    for source in (
        r'\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2}',
        r'\d{2}:\d{2}:\d{2}',
        r'\d{10}\.\d+',
    )
)
_MARKERS = ('TinderboxPrint', 'blobupload', 'master_lag')
_HEADER_LINE_LIMIT = 20  # only the first lines are searched for "key: value" pairs


def analyze_log_structure(file_path):
    """Scan one log file and describe its structure.

    The file is read as UTF-8 with undecodable bytes ignored and is streamed
    line by line, so it is never held in memory as a whole.

    Args:
        file_path: path of the log file to analyse.

    Returns:
        dict with the keys:
          - ``file_name``: base name of *file_path*
          - ``file_size``: size in bytes (``os.path.getsize``)
          - ``line_count``: number of lines read
          - ``header``: ``key: value`` pairs found in the first 20 lines
            (lines starting with ``=`` are skipped; split on the first ``:``)
          - ``sections``: names captured from ``Started <name> (results`` on
            lines that also contain ``=========``, in file order
          - ``has_errors`` / ``error_count``: whether / how many lines match
            the error keywords (case-insensitive, whole words)
          - ``has_performance_metrics``: True if any line mentions a metric
            keyword (case-insensitive substring match)
          - ``timestamp_formats``: sorted list of the timestamp regex sources
            that matched at least one line
          - ``log_levels``: dict mapping log level to number of lines
          - ``unique_patterns``: sorted list of the marker strings seen
          - ``parse_error``: only present if reading failed; holds the error
            text. Values gathered before the failure are kept.
    """
    analysis = {
        'file_name': os.path.basename(file_path),
        'file_size': 0,
        'line_count': 0,
        'header': {},
        'sections': [],
        'has_errors': False,
        'error_count': 0,
        'has_performance_metrics': False,
        'timestamp_formats': [],
        'log_levels': {},
        'unique_patterns': [],
    }
    timestamp_formats = set()
    log_levels = Counter()
    unique_patterns = set()

    try:
        analysis['file_size'] = os.path.getsize(file_path)

        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
            for line_number, line in enumerate(f, start=1):
                analysis['line_count'] = line_number

                # Header: "key: value" pairs in the first lines only.
                if (
                    line_number <= _HEADER_LINE_LIMIT
                    and ':' in line
                    and not line.startswith('=')
                ):
                    key, _, value = line.partition(':')
                    analysis['header'][key.strip()] = value.strip()

                # Sections
                if 'Started' in line and '=========' in line:
                    section_match = _SECTION_RE.search(line)
                    if section_match:
                        analysis['sections'].append(section_match.group(1))

                # Errors
                if _ERROR_RE.search(line):
                    analysis['has_errors'] = True
                    analysis['error_count'] += 1

                # Metrics: a boolean flag, so stop searching once it is set.
                if not analysis['has_performance_metrics'] and _METRICS_RE.search(line):
                    analysis['has_performance_metrics'] = True

                # Timestamps: only try the formats not seen yet.
                for source, compiled in _TIMESTAMP_RES:
                    if source not in timestamp_formats and compiled.search(line):
                        timestamp_formats.add(source)

                # Log levels
                log_level_match = _LOG_LEVEL_RE.search(line)
                if log_level_match:
                    log_levels[log_level_match.group(1)] += 1

                # Known marker strings
                for marker in _MARKERS:
                    if marker in line:
                        unique_patterns.add(marker)

    except Exception as e:
        # Best effort: one unreadable file must not abort the whole analysis.
        analysis['parse_error'] = str(e)

    # Sets become sorted lists: JSON-serialisable and identical from run to
    # run (plain list(set) order changes with string hash randomization).
    analysis['timestamp_formats'] = sorted(timestamp_formats)
    analysis['unique_patterns'] = sorted(unique_patterns)
    analysis['log_levels'] = dict(log_levels)

    return analysis

# Function: aggregate the per-file analyses into one global report
def analyze_all_logs(extracted_files):
    """Analyse every extracted log and merge the results into one report.

    Args:
        extracted_files: dict mapping a day label to the list of log file
            paths extracted for that day (the value returned by
            ``extract_rar_files``).

    Returns:
        dict that is JSON-serialisable as returned: counts per builder and
        per result (taken from the ``builder`` / ``results`` header fields),
        file sizes and line counts, section frequencies, error statistics,
        log-level totals, the sorted timestamp formats and marker patterns
        seen, and the full analyses of the first 10 files. The ``avg_*``,
        ``min_*`` and ``max_*`` keys are only present when at least one file
        was analysed.
    """
    print("üîç ANALYSE DE LA STRUCTURE DES LOGS")
    print("-" * 80)

    global_report = {
        'total_files_analyzed': 0,
        'analysis_timestamp': datetime.now().isoformat(),
        'days_analyzed': [],
        'builder_types': Counter(),
        'result_types': Counter(),
        'file_sizes': [],
        'line_counts': [],
        'common_sections': Counter(),
        'error_statistics': {'files_with_errors': 0, 'total_errors': 0},
        'performance_metrics_present': 0,
        'timestamp_formats_found': set(),
        'log_levels_distribution': Counter(),
        'unique_patterns_found': set(),
        'detailed_analyses': []
    }

    # `day_files` rather than `files`, so the notebook-level
    # `google.colab.files` name is not shadowed inside this function.
    for day, day_files in extracted_files.items():
        print(f"\nüìÖ Analyse du jour {day}...")
        global_report['days_analyzed'].append(day)

        for i, file_path in enumerate(day_files):
            if i % 20 == 0:
                print(f"   Progression: {i}/{len(day_files)} fichiers analys√©s")

            analysis = analyze_log_structure(file_path)
            global_report['total_files_analyzed'] += 1

            header = analysis['header']
            if 'builder' in header:
                global_report['builder_types'][header['builder']] += 1
            if 'results' in header:
                global_report['result_types'][header['results']] += 1

            global_report['file_sizes'].append(analysis['file_size'])
            global_report['line_counts'].append(analysis['line_count'])

            # Counter.update adds one per section name / adds the per-level counts.
            global_report['common_sections'].update(analysis['sections'])
            global_report['log_levels_distribution'].update(analysis['log_levels'])

            if analysis['has_errors']:
                global_report['error_statistics']['files_with_errors'] += 1
                global_report['error_statistics']['total_errors'] += analysis['error_count']

            if analysis['has_performance_metrics']:
                global_report['performance_metrics_present'] += 1

            global_report['timestamp_formats_found'].update(analysis['timestamp_formats'])
            global_report['unique_patterns_found'].update(analysis['unique_patterns'])

            # Keep the full per-file detail for the first 10 files only.
            if len(global_report['detailed_analyses']) < 10:
                global_report['detailed_analyses'].append(analysis)

        print(f"   ‚úÖ {len(day_files)} fichiers analys√©s pour le jour {day}")

    # Make the report JSON-serialisable. Sets are sorted so the report is
    # identical from run to run (list(set) order varies with hash randomization).
    global_report['timestamp_formats_found'] = sorted(global_report['timestamp_formats_found'])
    global_report['unique_patterns_found'] = sorted(global_report['unique_patterns_found'])
    for counter_key in ('builder_types', 'result_types', 'common_sections', 'log_levels_distribution'):
        global_report[counter_key] = dict(global_report[counter_key])

    # Statistics (skipped when nothing was analysed, to avoid dividing by zero)
    file_sizes = global_report['file_sizes']
    if file_sizes:
        global_report['avg_file_size'] = sum(file_sizes) / len(file_sizes)
        global_report['min_file_size'] = min(file_sizes)
        global_report['max_file_size'] = max(file_sizes)

    line_counts = global_report['line_counts']
    if line_counts:
        global_report['avg_line_count'] = sum(line_counts) / len(line_counts)
        global_report['min_line_count'] = min(line_counts)
        global_report['max_line_count'] = max(line_counts)

    print("\n" + "=" * 80)
    print("‚úÖ ANALYSE TERMIN√âE")
    print("=" * 80 + "\n")

    return global_report

# Function: write the JSON report and the plain-text summary
def generate_reports(global_report):
    """Write *global_report* to disk as JSON and as a text summary.

    The whole report is dumped to REPORT_FILE as indented JSON. A shorter
    summary (global counts, average file size, the 10 most frequent builders
    and the share of each build result) is written to SUMMARY_FILE and also
    printed.

    Args:
        global_report: the dict returned by ``analyze_all_logs``.
    """
    heavy_rule = "=" * 80
    light_rule = "-" * 80

    def by_count_desc(mapping):
        # Most frequent first; ties keep the mapping's own order.
        return sorted(mapping.items(), key=lambda pair: pair[1], reverse=True)

    print("üìù G√âN√âRATION DES RAPPORTS")
    print(light_rule)

    # JSON
    with open(REPORT_FILE, 'w', encoding='utf-8') as json_out:
        json.dump(global_report, json_out, indent=2, ensure_ascii=False)
    print(f"‚úÖ Rapport JSON: {REPORT_FILE}")

    # Text
    total = global_report['total_files_analyzed']
    days = ', '.join(global_report['days_analyzed'])
    avg_size = global_report.get('avg_file_size', 0)

    summary = [
        heavy_rule,
        "üìä RAPPORT D'ANALYSE DES LOGS MOZILLA CI",
        heavy_rule,
        "",
        "üî¢ STATISTIQUES GLOBALES",
        light_rule,
        f"Fichiers analys√©s: {total}",
        f"Jours: {days}",
        "",
        "üìè TAILLE DES FICHIERS",
        light_rule,
        f"Taille moyenne: {avg_size:,.0f} octets ({avg_size/1024/1024:.2f} MB)",
        "",
        "üèóÔ∏è  TYPES DE BUILDERS (Top 10)",
        light_rule,
    ]
    summary.extend(
        f"  ‚Ä¢ {builder}: {count}"
        for builder, count in by_count_desc(global_report['builder_types'])[:10]
    )

    summary += ["", "‚úÖ R√âSULTATS DES BUILDS", light_rule]
    for result, count in by_count_desc(global_report['result_types']):
        percentage = (count / total) * 100
        summary.append(f"  ‚Ä¢ {result}: {count} ({percentage:.1f}%)")

    summary += ["", heavy_rule]
    summary_text = '\n'.join(summary)

    with open(SUMMARY_FILE, 'w', encoding='utf-8') as text_out:
        text_out.write(summary_text)
    print(f"‚úÖ R√©sum√© texte: {SUMMARY_FILE}")
    print()
    print(summary_text)

# MAIN EXECUTION
print("üöÄ D√âMARRAGE...\n")

# Locate the .rar archives in /content. os.listdir() returns entries in
# arbitrary order, so the list is sorted: the order in which days are
# processed (and therefore which files end up in the report's first-10
# 'detailed_analyses') no longer depends on the filesystem.
rar_files = sorted(
    os.path.join('/content', name)
    for name in os.listdir('/content')
    if name.endswith('.rar')
)

if not rar_files:
    print("‚ùå Aucun fichier .rar trouv√©!")
else:
    print(f"‚úÖ {len(rar_files)} fichier(s) .rar trouv√©(s)\n")
    extracted_files = extract_rar_files(rar_files)

    # Empty when every archive failed to extract; the per-archive errors
    # have already been printed by extract_rar_files in that case.
    if extracted_files:
        global_report = analyze_all_logs(extracted_files)
        generate_reports(global_report)

        print("\n" + "=" * 80)
        print("üéâ ANALYSE TERMIN√âE AVEC SUCC√àS!")
        print("=" * 80)
        print("\nüìÅ Fichiers g√©n√©r√©s:")
        print(f"   ‚Ä¢ {REPORT_FILE}")
        print(f"   ‚Ä¢ {SUMMARY_FILE}")


🔍 ANALYSEUR DE STRUCTURE DES LOGS MOZILLA CI

🚀 DÉMARRAGE...

✅ 3 fichier(s) .rar trouvé(s)

📦 EXTRACTION DES FICHIERS .RAR
--------------------------------------------------------------------------------

📂 Extraction de: log-2018-06-01.rar
   📊 Nombre total de fichiers .txt: 1452
   📌 Échantillon sélectionné: 100 fichiers
   ✅ 100 fichiers extraits

📂 Extraction de: log-2018-06-19.rar
   📊 Nombre total de fichiers .txt: 2179
   📌 Échantillon sélectionné: 100 fichiers
   ✅ 100 fichiers extraits

📂 Extraction de: log-2018-06-20.rar
   📊 Nombre total de fichiers .txt: 861
   📌 Échantillon sélectionné: 100 fichiers
   ✅ 100 fichiers extraits

✅ EXTRACTION TERMINÉE - Total: 300 fichiers

🔍 ANALYSE DE LA STRUCTURE DES LOGS
--------------------------------------------------------------------------------

📅 Analyse du jour 01...
   Progression: 0/100 fichiers analysés
   Progression: 20/100 fichiers analysés
   Progression: 40