# Desafio Light House

## 1 - Bibliotecas e dados

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as  plt
import sklearn
import wikipedia
import wikipediaapi
import requests
import csv
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
# Load the challenge dataset; the relative path means the CSV must sit in
# the notebook's working directory.
df = pd.read_csv('desafio_indicium_imdb.csv')

In [37]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0.1,Unnamed: 0,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross,Released_year_num
0,1,The Godfather,1972,A,175 min,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411,1972.0
1,2,The Dark Knight,2008,UA,152 min,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444,2008.0
2,3,The Godfather: Part II,1974,A,202 min,"Crime, Drama",9.0,The early life and career of Vito Corleone in ...,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000,1974.0
3,4,12 Angry Men,1957,U,96 min,"Crime, Drama",9.0,A jury holdout attempts to prevent a miscarria...,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000,1957.0
4,5,The Lord of the Rings: The Return of the King,2003,U,201 min,"Action, Adventure, Drama",8.9,Gandalf and Aragorn lead the World of Men agai...,94.0,Peter Jackson,Elijah Wood,Viggo Mortensen,Ian McKellen,Orlando Bloom,1642758,377845905,2003.0


## 2 - Adicionando Dados Externos

Alguns dados que não foram fornecidos nessa base inicial me parecem importantes:

- Custo da produção (Budget)
- Data de lançamento (mês e dia)
O primeiro vai ajudar a entender como o ROI se comporta, e o segundo pode mostrar alguma sazonalidade (filmes de romance no Dia dos Namorados, filmes infantis nas férias escolares, filmes de terror no Halloween etc.).

A lista parece ser o top 1000 do IMDB, sem o top 1, que foi retirado como exemplo para o tópico 5 do desafio. Infelizmente, a API do IMDB é paga desde 2023, então vou recorrer ao TMDB, que tem uma API pública.
Vou usar essa API para coletar o máximo de dados possível e ver o que posso extrair para incrementar a base fornecida.

In [None]:
import os

# --- TMDB scraping configuration -------------------------------------------
# SECURITY: the API key used to be hard-coded only. Prefer exporting
# TMDB_API_KEY in the environment; the literal below is kept solely as a
# backward-compatible fallback and should be rotated and removed, since it
# is visible to anyone who can read this notebook.
api_key = os.environ.get('TMDB_API_KEY', 'ed7763b268797ae60379d1098090246a')
base_url = 'https://api.themoviedb.org/3/search/movie'
output_file = 'tmdb_filmes_encontrados_parallel.csv'
max_workers = 5  # size of the thread pool that issues the lookups
timeout = 5      # per-request timeout handed to requests.get (seconds)

# Normalise the column labels (strip stray whitespace) before selecting the
# title column that drives the searches.
df.columns = df.columns.str.strip()
titles = df['Series_Title'].tolist()

# Shared state for save_to_csv: whether the CSV header has been written yet
# and which column order was chosen for the file.
header_written = False
fieldnames = None

def fetch_movie(title, max_retries=5, initial_wait=1):
    """Search TMDB for *title* and return the first result.

    Transient failures (network errors, timeouts, an unparseable body,
    HTTP 429 and HTTP 5xx) are retried with exponential backoff. Any other
    response is parsed once; if it carries no results the title is treated
    as not found.

    Args:
        title: Movie title used as the search query.
        max_retries: Maximum number of attempts before giving up.
        initial_wait: Seconds to sleep after the first failed attempt; the
            delay doubles after every subsequent failure.

    Returns:
        The first entry of the response's ``results`` list (a dict), or
        ``None`` when nothing matched or every attempt failed.
    """
    # The request parameters never change between attempts, so build once.
    # NOTE(review): 'with_original_language' is kept from the original code;
    # confirm that the /search/movie endpoint actually honours it.
    params = {
        'api_key': api_key,
        'query': title,
        'with_original_language': 'en',
    }
    wait = initial_wait
    for attempt in range(1, max_retries + 1):
        try:
            response = requests.get(base_url, params=params, timeout=timeout)
            # Rate limiting and server-side errors return a body without
            # 'results'; retry them instead of reporting "not found".
            if response.status_code == 429 or response.status_code >= 500:
                raise requests.exceptions.HTTPError(
                    f"HTTP {response.status_code}", response=response
                )
            data = response.json()
        except (requests.exceptions.RequestException, ValueError):
            # ValueError covers JSON decoding failures on requests versions
            # whose decode error is not a RequestException subclass.
            if attempt < max_retries:
                time.sleep(wait)
                wait *= 2  # exponential backoff to avoid hammering the API
            continue

        results = data.get('results') if isinstance(data, dict) else None
        return results[0] if results else None
    return None

def save_to_csv(movie):
    """Write one TMDB result as a row of ``output_file``.

    The first non-empty *movie* (re)creates the file, fixes the column order
    from that movie's keys and writes the header; later calls append rows
    using the same columns. Keys absent from a later movie are written as
    empty cells, and keys that were not in the first movie are ignored so a
    single unexpected field cannot abort the whole collection run.

    Args:
        movie: Mapping of field name to value, or a falsy value (``None`` /
            empty dict), in which case nothing is written.

    Side effects:
        Updates the module-level ``header_written`` and ``fieldnames``.
    """
    global header_written, fieldnames
    if not movie:
        return

    if header_written:
        mode = 'a'
    else:
        # Snapshot the keys as a list rather than keeping a live view tied
        # to the first movie dict.
        fieldnames = list(movie.keys())
        mode = 'w'

    with open(output_file, mode, newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction='ignore')
        if not header_written:
            writer.writeheader()
        writer.writerow(movie)
    header_written = True

# Fan the title lookups out over a small thread pool and persist each result
# as soon as its request finishes (completion order, not submission order).
with ThreadPoolExecutor(max_workers=max_workers) as executor:
    future_to_title = dict(
        (executor.submit(fetch_movie, title), title) for title in titles
    )
    for future in as_completed(future_to_title):
        movie = future.result()
        save_to_csv(movie)
        # Progress line: the matched TMDB title, or a placeholder on a miss.
        if movie:
            print(f"Salvo: {movie['title']}")
        else:
            print("Salvo: Não encontrado")


Salvo: The Godfather
Salvo: The Lord of the Rings: The Return of the King
Salvo: The Godfather Part II
Salvo: The Dark Knight
Salvo: 12 Angry Men
Salvo: Pulp Fiction
Salvo: Inception
Salvo: Schindler's List
Salvo: The Lord of the Rings: The Fellowship of the Ring
Salvo: Fight Club
Salvo: Forrest Gump
Salvo: The Good, the Bad and the Ugly
Salvo: The Lord of the Rings: The Two Towers
Salvo: GoodFellas
Salvo: The Matrix
Salvo: The Empire Strikes Back
Salvo: One Flew Over the Cuckoo's Nest
Salvo: Parasite
Salvo: Hamilton
Salvo: Soorarai Pottru
Salvo: Interstellar
Salvo: Spirited Away
Salvo: City of God
Salvo: Saving Private Ryan
Salvo: The Green Mile
Salvo: Life Is Beautiful
Salvo: Se7en
Salvo: The Silence of the Lambs
Salvo: Star Wars
Salvo: Seppuku
Salvo: It's a Wonderful Life
Salvo: Joker
Salvo: Seven Samurai
Salvo: Whiplash
Salvo: The Intouchables
Salvo: The Departed
Salvo: The Prestige
Salvo: The Pianist
Salvo: Gladiator
Salvo: American History X
Salvo: Léon: The Professional
Salvo: T