In [None]:
import pandas as pd
import os
from pathlib import Path
from datetime import datetime

# Directory configuration: folder that contains the CSV files to inspect.
base_dir = Path('database_50k')

# Keep only the entries whose name ends in ".csv", in alphabetical order.
csv_files = sorted(entry for entry in os.listdir(base_dir) if entry.endswith('.csv'))

print(f"Encontrados {len(csv_files)} arquivos CSV:")
for i, file in enumerate(csv_files, start=1):
    print(f"{i}. {file}")


Encontrados 9 arquivos CSV:
1. authorships.csv
2. citations.csv
3. concepts.csv
4. keywords.csv
5. related_works.csv
6. topics.csv
7. unique_authors_metadata.csv
8. unique_institutions_metadata.csv
9. works.csv


In [2]:
# Função para analisar cada arquivo CSV
def analyze_csv_file(filepath):
    """Read a CSV file and summarise its size, columns and edge rows.

    Args:
        filepath: Location of the CSV file; handed directly to
            ``pd.read_csv``.

    Returns:
        dict: When the file is read successfully, a dict with
        ``total_rows``, ``total_cols``, ``columns`` (list of column
        labels), ``first_2_rows`` / ``last_2_rows`` (the ``head(2)`` and
        ``tail(2)`` DataFrames) and ``success`` set to ``True``.
        When any exception is raised, a dict with ``error`` (the
        exception message) and ``success`` set to ``False``.
    """
    try:
        # The full file is loaded; low_memory=False turns off pandas'
        # chunked parsing so column dtypes are inferred from the whole file.
        frame = pd.read_csv(filepath, low_memory=False)
        row_count, col_count = frame.shape
        summary = {
            'total_rows': row_count,
            'total_cols': col_count,
            'columns': list(frame.columns),
            'first_2_rows': frame.head(2),
            'last_2_rows': frame.tail(2),
            'success': True,
        }
    except Exception as exc:
        # Best effort: hand the failure back to the caller instead of raising.
        summary = {'error': str(exc), 'success': False}
    return summary

# Dictionary that stores the analysis results.
results = dict()


In [3]:
# Analyse every CSV file and print a per-file report.
# The printed labels use the intended Unicode characters (check marks,
# emoji, accented letters) instead of their mis-decoded byte sequences.
for csv_file in csv_files:
    filepath = base_dir / csv_file
    print(f"\n{'='*80}")
    print(f"Analisando: {csv_file}")
    print(f"{'='*80}")

    result = analyze_csv_file(filepath)
    # Store the outcome (success or failure) under the file name.
    results[csv_file] = result

    if result['success']:
        print(f"\n✓ Total de linhas: {result['total_rows']:,}")
        print(f"✓ Total de colunas: {result['total_cols']}")
        print(f"\n📋 Colunas ({result['total_cols']}):")
        for i, col in enumerate(result['columns'], 1):
            print(f"   {i:2d}. {col}")

        # to_string() prints every column without pandas' width truncation.
        print("\n📄 Primeiras 2 linhas:")
        print(result['first_2_rows'].to_string())

        print("\n📄 Últimas 2 linhas:")
        print(result['last_2_rows'].to_string())
    else:
        print(f"\n✗ Erro ao processar: {result['error']}")

    print(f"\n{'-'*80}\n")



Analisando: authorships.csv

✓ Total de linhas: 189,283
✓ Total de colunas: 7

📋 Colunas (7):
    1. work_id
    2. author_id
    3. author_name
    4. author_position
    5. is_corresponding
    6. countries
    7. institution_ids

📄 Primeiras 2 linhas:
                            work_id                         author_id      author_name author_position  is_corresponding countries                   institution_ids
0  https://openalex.org/W4399913519  https://openalex.org/A5103275826  Gabrielle Alves           first              True        BR    https://openalex.org/I17974374
1  https://openalex.org/W4399913519  https://openalex.org/A5005545405  Dietmar Jannach          middle             False        AT  https://openalex.org/I4210166741

üìÑ √öltimas 2 linhas:
                                 work_id                         author_id    author_name author_position  is_corresponding countries institution_ids
189281  https://openalex.org/W4417299774  https://openalex.org/A

In [4]:
# Build a formatted, plain-text log of the analysis for documentation.
# Accented words in the log ("ANÁLISE", "ÚLTIMAS") are written with the
# intended characters instead of their mis-decoded byte sequences.
separator = "=" * 80
log_lines = [
    separator,
    "ANÁLISE DOS ARQUIVOS CSV - database_50k",
    f"Data/Hora: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
    separator,
    "",
]

for csv_file in csv_files:
    result = results[csv_file]

    # Every section opens with the same header, whether or not the file
    # could be read.
    log_lines.append(f"\n{separator}")
    log_lines.append(f"ARQUIVO: {csv_file}")

    if result['success']:
        log_lines.append(separator)
        log_lines.append(f"Total de linhas: {result['total_rows']:,}")
        log_lines.append(f"Total de colunas: {result['total_cols']}")
        log_lines.append(f"\nCOLUNAS ({result['total_cols']}):")
        for i, col in enumerate(result['columns'], 1):
            log_lines.append(f"  {i:2d}. {col}")

        log_lines.append("\nPRIMEIRAS 2 LINHAS:")
        log_lines.append(result['first_2_rows'].to_string())

        log_lines.append("\nÚLTIMAS 2 LINHAS:")
        log_lines.append(result['last_2_rows'].to_string())
    else:
        log_lines.append(f"ERRO: {result['error']}")

    # A blank line closes each section.
    log_lines.append("")

# Display the log
log_text = "\n".join(log_lines)
print(log_text)


ANÁLISE DOS ARQUIVOS CSV - database_50k
Data/Hora: 2026-02-19 10:00:19


ARQUIVO: authorships.csv
Total de linhas: 189,283
Total de colunas: 7

COLUNAS (7):
   1. work_id
   2. author_id
   3. author_name
   4. author_position
   5. is_corresponding
   6. countries
   7. institution_ids

PRIMEIRAS 2 LINHAS:
                            work_id                         author_id      author_name author_position  is_corresponding countries                   institution_ids
0  https://openalex.org/W4399913519  https://openalex.org/A5103275826  Gabrielle Alves           first              True        BR    https://openalex.org/I17974374
1  https://openalex.org/W4399913519  https://openalex.org/A5005545405  Dietmar Jannach          middle             False        AT  https://openalex.org/I4210166741

√öLTIMAS 2 LINHAS:
                                 work_id                         author_id    author_name author_position  is_corresponding countries institution_ids
189281  https://openalex.

In [5]:
# Save the log to a text file. UTF-8 is requested explicitly so the
# accented characters in the log do not depend on the platform default.
log_file = Path('database_50k_analysis_log.txt')
log_file.write_text(log_text, encoding='utf-8')

# Report the absolute location so the file is easy to find afterwards.
print(f"\n✓ Log salvo em: {log_file.absolute()}")



✓ Log salvo em: c:\Users\BSBCo\dev\link-prediction\database_50k_analysis_log.txt


In [6]:
# Statistical summary: one table row per successfully analysed file,
# followed by the grand total of rows.
print("\n" + "="*80)
print("RESUMO ESTATÍSTICO")
print("="*80)
print(f"{'Arquivo':<40} {'Linhas':>12} {'Colunas':>8}")
print("-"*80)

# Accumulate the total over the same files that are listed in the table, so
# an entry left in `results` that is not in `csv_files` cannot inflate it.
total_rows_all = 0
for csv_file in csv_files:
    result = results[csv_file]
    if result['success']:
        print(f"{csv_file:<40} {result['total_rows']:>12,} {result['total_cols']:>8}")
        total_rows_all += result['total_rows']

print("-"*80)
print(f"{'TOTAL':<40} {total_rows_all:>12,}")
print("="*80)



RESUMO ESTATÍSTICO
Arquivo                                        Linhas  Colunas
--------------------------------------------------------------------------------
authorships.csv                               189,283        7
citations.csv                                 871,345        2
concepts.csv                                  505,255        6
keywords.csv                                  396,817        4
related_works.csv                             458,180        2
topics.csv                                    104,879        4
unique_authors_metadata.csv                    49,945        8
unique_institutions_metadata.csv                6,063        6
works.csv                                      40,302       17
--------------------------------------------------------------------------------
TOTAL                                       2,622,069
