## Carregando pacotes necessários e criando um arquivo para o banco de dados SQLite

In [12]:
import os
import polars as pl
import sqlite3
import pandas as pd
import gc

In [13]:
conn = sqlite3.connect("IMDB.db")

## Adicionando as tabelas "basics", "ratings", "principals" à base de dados

### Basics (11144943 linhas - verificado via terminal)

In [10]:
# Configurações
chunk_size = 200000
db_file = "IMDB.db"
table_name = "basics"
tsv_file = "title.basics0.tsv"

conn = sqlite3.connect(db_file)

def process_large_tsv_with_polars(file_path, chunk_size, conn, table_name):
    # Lê o schema tratando \N como valores nulos
    schema = pl.read_csv(
        file_path, 
        separator='\t', 
        n_rows=1000,
        null_values=['\\N'],
        infer_schema_length=1000
    ).schema
    
    total_rows = 0
    skip_rows = 1
    
    while True:
        try:
            # Lê um chunk tratando \N como nulos
            chunk = pl.read_csv(
                file_path,
                separator='\t',
                skip_rows=skip_rows,
                n_rows=chunk_size,
                has_header=False,
                null_values=['\\N'],
                ignore_errors=True,
                new_columns=list(schema.keys()),
                dtypes=schema
            )
            
            if chunk.height == 0:
                break
                
            # Converte e salva no banco
            chunk.to_pandas().to_sql(
                table_name,
                conn,
                if_exists='append' if skip_rows > 1 else 'replace',
                index=False
            )
            
            total_rows += chunk.height
            skip_rows += chunk_size
            
            import gc
            gc.collect()
            
        except Exception as e:
            print(f"Erro no chunk: {e}")
            break
    
    return total_rows

total = process_large_tsv_with_polars(tsv_file, chunk_size, conn, table_name)

# Verificação final
count_result = pl.read_database(f"SELECT COUNT(*) as count FROM {table_name}", conn)
print(f"Registros na tabela: {count_result['count'][0]}")

  chunk = pl.read_csv(


Erro no chunk: empty CSV
Registros na tabela: 11144942


In [14]:
teste = pd.read_sql_query("SELECT COUNT(*) FROM basics", conn)
print(teste)

   COUNT(*)
0  11144942


### Ratings (1484616 linhas)

In [16]:
conn.execute("DROP TABLE IF EXISTS ratings")

chunk_size = 200000  # 200k

def process_ratings_chunk(file_path, chunk_size, conn, table_name):
    # Lê o schema tratando \N como valores nulos
    schema = pl.read_csv(
        file_path, 
        separator='\t', 
        n_rows=1000,
        null_values=['\\N'],
        infer_schema_length=1000
    ).schema
    
    total_rows = 0
    skip_rows = 1
    
    while True:
        try:
            # Lê um chunk tratando \N como nulos
            chunk = pl.read_csv(
                file_path,
                separator='\t',
                skip_rows=skip_rows,
                n_rows=chunk_size,
                has_header=False,
                null_values=['\\N'],
                ignore_errors=True,
                new_columns=list(schema.keys()),
                schema_overrides=schema
            )
            
            if chunk.height == 0:
                break
                
            # Converte e salva no banco
            chunk.to_pandas().to_sql(
                table_name,
                conn,
                if_exists='append' if skip_rows > 1 else 'replace',
                index=False
            )
            
            total_rows += chunk.height
            skip_rows += chunk_size
            
            import gc
            gc.collect()
            
        except Exception as e:
            print(f"Erro no chunk: {e}")
            break
    
    return total_rows

# Processa o arquivo de ratings
total = process_ratings_chunk("title.ratings.tsv", chunk_size, conn, "ratings")

# Verifica a contagem
count_result = pl.read_database("SELECT COUNT(*) as count FROM ratings", conn)
print(f"Registros na tabela ratings: {count_result['count'][0]}")

# Lista as colunas da tabela
columns_result = pl.read_database("PRAGMA table_info(ratings)", conn)
print("Colunas da tabela ratings:")
print(columns_result)

Erro no chunk: empty CSV
Registros na tabela ratings: 1484615
Colunas da tabela ratings:
shape: (3, 6)
┌─────┬───────────────┬─────────┬─────────┬────────────┬─────┐
│ cid ┆ name          ┆ type    ┆ notnull ┆ dflt_value ┆ pk  │
│ --- ┆ ---           ┆ ---     ┆ ---     ┆ ---        ┆ --- │
│ i64 ┆ str           ┆ str     ┆ i64     ┆ null       ┆ i64 │
╞═════╪═══════════════╪═════════╪═════════╪════════════╪═════╡
│ 0   ┆ tconst        ┆ TEXT    ┆ 0       ┆ null       ┆ 0   │
│ 1   ┆ averageRating ┆ REAL    ┆ 0       ┆ null       ┆ 0   │
│ 2   ┆ numVotes      ┆ INTEGER ┆ 0       ┆ null       ┆ 0   │
└─────┴───────────────┴─────────┴─────────┴────────────┴─────┘


Logo, processamento realizado com sucesso.

### Principals (88.359.623 linhas)

In [17]:
chunk_size = 200000  

def process_principals_chunk(file_path, chunk_size, conn, table_name):
    # Lê o schema tratando \N como valores nulos
    schema = pl.read_csv(
        file_path, 
        separator='\t', 
        n_rows=1000,
        null_values=['\\N'],
        infer_schema_length=1000
    ).schema
    
    total_rows = 0
    skip_rows = 1
    
    while True:
        try:
            # Lê um chunk tratando \N como nulos
            chunk = pl.read_csv(
                file_path,
                separator='\t',
                skip_rows=skip_rows,
                n_rows=chunk_size,
                has_header=False,
                null_values=['\\N'],
                ignore_errors=True,
                new_columns=list(schema.keys()),
                schema_overrides=schema
            )
            
            if chunk.height == 0:
                break
                
            # Converte e salva no banco
            chunk.to_pandas().to_sql(
                table_name,
                conn,
                if_exists='append' if skip_rows > 1 else 'replace',
                index=False
            )
            
            total_rows += chunk.height
            skip_rows += chunk_size
            
            import gc
            gc.collect()
            
        except Exception as e:
            print(f"Erro no chunk: {e}")
            break
    
    return total_rows

# Processa o arquivo de principals
total = process_principals_chunk("title.principals0.tsv", chunk_size, conn, "principals")

# Verifica a contagem
count_result = pl.read_database("SELECT COUNT(*) as count FROM principals", conn)
print(f"Registros na tabela principals: {count_result['count'][0]}")

Erro no chunk: empty CSV
Registros na tabela principals: 88359622


In [18]:
conn.close()