## Instalar requisitos

In [None]:
# Install the third-party packages this notebook uses. %pip (rather than !pip)
# targets the environment of the running kernel.
# NOTE(review): no versions are pinned, so a fresh install may resolve to
# different releases than the ones the recorded outputs were produced with.
%pip install pandas
%pip install "betterproto[compiler]" protobuf
%pip install matplotlib
%pip install scipy
%pip install pyQt5
%pip install scikit-learn
%pip install numpy
# Select the Qt GUI backend for matplotlib (needs the pyQt5 package installed above).
%matplotlib qt      
# NOTE(review): a bare %matplotlib follows the explicit backend selection;
# presumably it is only here to print/confirm the active backend - confirm it is needed.
%matplotlib

## Pyproto generation

In [3]:
import sys, os
from pathlib import Path

# Directory (under the current working directory) that holds small launcher
# scripts through which protoc can start the betterproto plugin.
plugin_dir = Path.cwd() / ".protoc_plugins"
plugin_dir.mkdir(exist_ok=True)

# Windows batch wrapper that forwards all arguments to the betterproto plugin
# of the interpreter running this kernel. It is written under both spellings
# (underscore and hyphen) so either plugin name resolves.
bat = f'@echo off\r\n"{sys.executable}" -m betterproto.plugin %*\r\n'
# Written as bytes: the text already contains explicit CRLF line endings, and
# text-mode writing on Windows would expand each "\n" again into "\r\n".
bat_bytes = bat.encode("utf-8")
(plugin_dir / "protoc-gen-python_betterproto.cmd").write_bytes(bat_bytes)
(plugin_dir / "protoc-gen-python-betterproto.cmd").write_bytes(bat_bytes)

# Make the wrapper directory visible on the PATH of the kernel process.
# It is only prepended when missing, so re-running this cell does not keep
# growing PATH; .get() avoids a KeyError when PATH is not set at all.
current_path = os.environ.get("PATH", "")
if str(plugin_dir) not in current_path.split(os.pathsep):
    os.environ["PATH"] = (
        str(plugin_dir) + os.pathsep + current_path if current_path else str(plugin_dir)
    )

print("Wrappers em:", plugin_dir)

# Imported only after PATH has been adjusted, keeping the original ordering.
from utils.auto_generate_proto import generate_proto_classes

generate_proto_classes()

Wrappers em: C:\Users\victo\Documents\in1144-data-mining-2025-2\.protoc_plugins
CMD: C:\ProgramData\chocolatey\bin\protoc.EXE --plugin=protoc-gen-python_betterproto=C:\Users\victo\Documents\in1144-data-mining-2025-2\.protoc_plugins\protoc-gen-python_betterproto.cmd -I C:\Users\victo\Documents\in1144-data-mining-2025-2\libs\protobufs\include\protobufs\pb\proto --python_betterproto_out=C:\Users\victo\Documents\in1144-data-mining-2025-2\proto\generated rc_log.proto
returncode: 0
STDOUT:
 
STDERR:
 Writing RCLog.py
Writing RoboCupSSL.py
Writing __init__.py

‚úÖ Sucesso!


## Base includes

In [8]:
from data_select.data_filter_operator import *
import pandas as pd
import matplotlib.pyplot as plt
from utils.utils import OUTPUT_DIR, data_frame_to_csv, LogFields, gel_2d_length_in_column
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error
import scipy.stats as stats
import numpy as np
import math

## Path to Log File

In [9]:
# Directory that contains the log files; being relative, it is resolved
# against the kernel's current working directory.
PREFIX_PATH = '../logs/'
# File name of the log to analyse; it is concatenated onto PREFIX_PATH when loading.
LOG_FILE = 'group_phase_tigers_robocin.log.gz'

## Load Log Data

In [None]:
# Log sections to load, identified by LogFields members.
selects = [LogFields.PROCESSED_FRAME, LogFields.REFEREE, LogFields.TELEMETRY, LogFields.ROBOTS_COMMAND]

# Load the selected sections from the configured log file.
# NOTE(review): load_select_modules is not imported by name in this notebook;
# presumably it comes from the star import of data_select.data_filter_operator - confirm.
data_list = load_select_modules(PREFIX_PATH+LOG_FILE, selects)

## Data Analysis

### Robot Analysis
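Plots `position_w` over time for the robot data extracted from the processed frames and from the raw frames of the log, so the two series can be compared on a common time axis.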

In [None]:
# Extract the per-robot tables from the loaded log data.
# NOTE(review): the raw extraction is called with 2 and the processed one with 6
# as last argument; if that argument is a robot id, the two curves describe
# different robots - confirm this is intended.
raw_robot = raw_frame_extract_robot_data_frame(data_list, True, 2)
robot = processed_frame_extract_robot_data_frame(data_list, True, 6)

print(len(robot), len(raw_robot))

# Convert timestamps to float on new frames instead of mutating the extracted ones.
robot = robot.assign(timestamp=robot['timestamp'].astype(float))
raw_robot = raw_robot.assign(timestamp=raw_robot['timestamp'].astype(float))

# Keep only processed samples whose timestamp is greater than 1. The explicit
# copy makes the later column assignment operate on an independent frame
# (no SettingWithCopyWarning).
# NOTE(review): the same filter is not applied to raw_robot - confirm.
robot = robot[robot['timestamp'] > 1].copy()

if robot.empty or raw_robot.empty:
    # Without this guard the first-element lookup below fails with a bare IndexError.
    raise ValueError("No robot samples available to plot (empty processed or raw frame).")

# Common time origin: the earlier of the two first timestamps.
timeref = min(robot['timestamp'].iloc[0], raw_robot['timestamp'].iloc[0])

# Shift to the common origin, wrap at 1e13 and scale by 1e-9
# (presumably nanoseconds to seconds - TODO confirm the unit).
robot['timestamp'] = (robot['timestamp'] - timeref) % 1e13 / 1e9
raw_robot['timestamp'] = (raw_robot['timestamp'] - timeref) % 1e13 / 1e9

fig, ax = plt.subplots()
ax.plot(robot['timestamp'].values, robot['position_w'].values, label='processed')
ax.plot(raw_robot['timestamp'].values, raw_robot['position_w'].values, label='raw')
ax.set_xlabel('timestamp (relative)')
ax.set_ylabel('position_w')
ax.legend()
ax.grid()
plt.show()

# Inspecionar `.log.gz` e mostrar colunas

Este notebook tenta **detectar automaticamente** o formato do seu arquivo comprimido (`.log.gz`) e exibir as **colunas** que ele contém.

**O que ele faz:**
1. Lê algumas linhas de amostra descompactando o arquivo.
2. Detecta se o formato parece **JSON Lines (NDJSON)** ou **CSV/TSV/pipe**.
3. Se for JSON Lines, usa `pandas.read_json(..., lines=True)`; se for CSV-like, tenta inferir o delimitador.
4. Exibe: colunas, tipos (`dtypes`) e primeiras linhas (`head()`).

**Requisitos**: `pandas`.


In [1]:

# 1) Set the path to your .log.gz file here.
# NOTE(review): this is an absolute, machine-specific path; it will not resolve
# on another machine and should be adjusted before running.
file_path = r"C:\Users\victo\Documents\in1144-data-mining-2025-2\logs\group_phase_tigers_robocin.log.gz"  # Windows example
# file_path = "/caminho/para/seu_arquivo.log.gz"     # Linux/Mac example

print("Arquivo configurado:", file_path)


Arquivo configurado: C:\Users\victo\Documents\in1144-data-mining-2025-2\logs\group_phase_tigers_robocin.log.gz


In [2]:

import gzip, io, csv, json, re
import pandas as pd
from typing import List, Optional, Tuple

def read_sample_lines(path: str, max_lines: int = 30) -> List[str]:
    """Return the non-blank lines among the first ``max_lines`` lines of a gzip file.

    The file is decompressed as UTF-8 text; undecodable bytes are replaced
    instead of raising. Whitespace-only lines are skipped, but they still
    count towards ``max_lines``.

    Args:
        path: Path to the gzip-compressed text file.
        max_lines: Maximum number of physical lines read from the start of the file.

    Returns:
        The kept lines with their line terminator (LF, CRLF or CR) removed.

    Raises:
        OSError: If the file cannot be opened or is not valid gzip data
            (``gzip.BadGzipFile`` is a subclass of ``OSError``).
    """
    lines = []
    with gzip.open(path, mode="rt", encoding="utf-8", errors="replace", newline="") as f:
        for i, line in enumerate(f):
            if i >= max_lines:
                break
            # keep only non-blank lines (short lines are kept)
            if line.strip() != "":
                # newline="" disables newline translation, so lines of a CRLF
                # file end in "\r\n"; strip both characters, not just "\n".
                lines.append(line.rstrip("\r\n"))
    return lines

def looks_like_json_line(s: str) -> bool:
    """Cheap check for whether a line plausibly holds a JSON object or array.

    After trimming surrounding whitespace, the line qualifies when it starts
    with ``{`` and contains a ``}`` somewhere, or starts with ``[`` and
    contains a ``]`` somewhere. The content is not parsed or validated.
    """
    trimmed = s.strip()
    for opener, closer in (("{", "}"), ("[", "]")):
        if trimmed.startswith(opener) and closer in trimmed:
            return True
    return False

def detect_delimiter(sample_lines: List[str]) -> Optional[str]:
    """Guess the field delimiter used by a sample of text lines.

    ``csv.Sniffer`` is tried first on the first 20 lines, restricted to the
    candidate delimiters. When it cannot decide, every candidate is scored
    over the lines that contain it as
    ``median(fields per line) - 0.1 * population variance(fields per line)``
    and the highest score wins (earlier candidates win ties).

    Args:
        sample_lines: Lines of text to inspect.

    Returns:
        One of ``","``, ``";"``, tab, ``"|"`` or ``":"``; ``None`` when no
        candidate scores above -1 (in particular when none of them occurs
        in any line).
    """
    candidates = [',', ';', '\t', '|', ':']

    # 1) Ask the standard-library sniffer; the candidates are passed as a
    #    string of delimiter characters, as the csv API documents.
    try:
        dialect = csv.Sniffer().sniff("\n".join(sample_lines[:20]), delimiters="".join(candidates))
        return dialect.delimiter
    except csv.Error:
        # The sniffer could not determine a delimiter: use the heuristic below.
        pass

    # 2) Fallback: prefer the delimiter that yields many, consistently sized fields.
    import statistics  # imported once here instead of on every loop iteration

    best = None
    best_score = -1
    for delim in candidates:
        counts = [len(line.split(delim)) for line in sample_lines if delim in line]
        if not counts:
            continue
        # A single observation has no spread; treat its variance as 0.
        spread = statistics.pvariance(counts) if len(counts) > 1 else 0
        score = statistics.median(counts) - 0.1 * spread
        if score > best_score:
            best_score = score
            best = delim
    return best

def quick_preview(path: str, nrows: int = 1000):
    """Load the beginning of a gzip-compressed log as a DataFrame, guessing its format.

    Up to 50 sample lines are inspected. If any of them looks like JSON the
    file is read as JSON Lines, otherwise as delimited text, trying in order:
    pandas' own separator sniffing, the delimiter found by
    ``detect_delimiter``, and finally whitespace-separated columns without a
    header.

    Args:
        path: Path to the gzip-compressed file.
        nrows: Maximum number of rows/lines to load, for both formats.

    Returns:
        A ``(df, info)`` tuple. ``info`` is a dict with the keys ``"format"``
        (``"jsonl"`` or ``"csv-like"``), ``"delimiter"`` (``repr`` of the
        detected delimiter, or ``None``) and ``"header_inferido"``.

    Raises:
        RuntimeError: If no sample line could be read, or if the chosen
            reader fails.
    """
    sample = read_sample_lines(path, max_lines=50)
    if not sample:
        raise RuntimeError("Não foi possível ler amostras do arquivo (vazio?).")

    # Decide the format: one JSON-looking sample line is enough.
    is_jsonl = any(looks_like_json_line(s) for s in sample)

    info = {"format": "jsonl" if is_jsonl else "csv-like", "delimiter": None, "header_inferido": None}

    if is_jsonl:
        try:
            # nrows is honoured here too, so the preview does not load the whole file.
            df = pd.read_json(path, lines=True, compression="gzip", dtype=False, nrows=nrows)
            # If any cell holds a dict/list, flatten one level. Checked column by
            # column with Series.map, which is available across pandas versions
            # (DataFrame.applymap is deprecated).
            has_nested = any(
                column.map(lambda x: isinstance(x, (dict, list))).any()
                for _, column in df.items()
            )
            if has_nested:
                records = df.to_dict(orient="records")
                df = pd.json_normalize(records, max_level=1)
            info["header_inferido"] = True  # JSON keys already serve as column names
            return df, info
        except Exception as e:
            raise RuntimeError(f"Falha ao ler como JSON Lines: {e}") from e

    # CSV-like
    delim = detect_delimiter(sample)
    info["delimiter"] = repr(delim) if delim is not None else None

    # First attempt: let pandas sniff the separator (and header) on its own.
    try:
        df_try = pd.read_csv(path, compression="gzip", sep=None, engine="python", nrows=nrows)
    except Exception:
        pass  # best effort: fall through to the explicit strategies below
    else:
        info["header_inferido"] = True
        return df_try, info

    # Second attempt: use the delimiter detected from the sample.
    if delim is not None:
        try:
            df_try = pd.read_csv(path, compression="gzip", sep=delim, engine="python", nrows=nrows)
        except Exception as e2:
            raise RuntimeError(f"Falha ao ler como CSV com delimitador {delim!r}: {e2}") from e2
        # Heuristic: integer column labels (0..N-1) suggest there was no header row.
        # NOTE(review): with pandas' default header inference the labels come from
        # the first row, so this check may never be true - confirm.
        info["header_inferido"] = not all(isinstance(c, int) for c in df_try.columns)
        return df_try, info

    # Last resort: runs of whitespace as separator, no header row.
    # sep=r"\s+" replaces the deprecated delim_whitespace=True.
    try:
        df_try = pd.read_csv(path, compression="gzip", sep=r"\s+", engine="python", nrows=nrows, header=None)
    except Exception as e3:
        raise RuntimeError(f"Falha ao ler como texto delimitado: {e3}") from e3
    info["header_inferido"] = False
    return df_try, info

# Build the preview for the configured file (requesting nrows=2000) and
# report what was detected about its format.
df, meta = quick_preview(file_path, nrows=2000)
print("Formato detectado:", meta["format"])
print("Delimitador detectado:", meta["delimiter"])
print("Cabe√ßalho inferido:", meta["header_inferido"])
print("\nTotal de colunas detectadas:", len(df.columns))
# Column labels are converted to str so non-string labels print uniformly.
print("Colunas:", list(map(str, df.columns)))

# Rich display of the first 10 rows.
display(df.head(10))


BadGzipFile: Not a gzipped file (b'\n#')

In [None]:

# (Optional) If there is NO header row and you know the column names,
# define them below and reload a slice to confirm:
# Example:
# known_cols = ["ts","level","msg","thread","extra"]
# df_named = pd.read_csv(file_path, compression="gzip", sep=",", names=known_cols, header=None, nrows=1000)
# display(df_named.head())
known_cols = None
# Only reports the names when some were provided; nothing is reloaded here.
if known_cols:
    print("Usando nomes fornecidos:", known_cols)


In [None]:

# (Optional) Dtype dictionary to save memory or to correct type inference.
# Example: dtypes = {"ts":"float64", "level":"category"}
# NOTE(review): dtypes is assigned here but not used anywhere in this cell.
dtypes = {}

print("Tipos inferidos pelo pandas:")
display(df.dtypes.to_frame("dtype"))

# Show a random sample (at most 10 rows, fixed random_state) to check visual consistency.
print("\nAmostra aleat√≥ria:")
display(df.sample(min(len(df), 10), random_state=42))


## Dicas
- Se o seu log for JSON Lines (um JSON por linha), o método acima já mostra as chaves como colunas.
- Se o arquivo for CSV/TSV, mas **sem cabeçalho**, você verá colunas numéricas (0, 1, 2, ...). Use a célula de *nomes conhecidos* para definir os nomes.
- Se houver campos complexos (listas/dicionários), considere usar `pd.json_normalize` para *flatten* adicional.
- Para carregar o arquivo completo, remova o parâmetro `nrows` (cuidado com a RAM).
