In [None]:
import sys
import os
import pandas as pd

# --- Environment setup and sanity checks ------------------------------------
# Every path below is resolved relative to the current working directory, so
# print it first to make path problems easy to diagnose.
print("Current working directory:", os.getcwd())


# Make the project modules in ../src importable from this notebook.
src_path = os.path.abspath(os.path.join(os.getcwd(), '..', 'src'))
# Re-running the cell must not keep appending the same entry to sys.path.
if src_path not in sys.path:
    sys.path.append(src_path)


print(f"Source path: {src_path}")
# The listing is purely diagnostic: a missing directory should be reported,
# not abort the cell with an unrelated traceback from os.listdir.
if os.path.isdir(src_path):
    print("Files in src directory:", os.listdir(src_path))
else:
    print(f"Warning: source directory {src_path} does not exist.")
print("Current sys.path:", sys.path)

data_path = os.path.abspath(os.path.join(os.getcwd(), '..', 'data', 'movies_metadata.csv'))
print(f"Movies CSV path: {data_path}")

# Fail fast with an explicit message when the dataset is missing or empty.
if not os.path.exists(data_path):
    raise FileNotFoundError(f"The file {data_path} does not exist.")
if os.path.getsize(data_path) == 0:
    raise ValueError(f"The file {data_path} is empty.")

# Best-effort direct read of the CSV as a smoke test: a failure here is
# reported but does not stop the cell.
try:
    # on_bad_lines='skip' tells pandas to drop malformed rows instead of raising.
    test_df = pd.read_csv(data_path, on_bad_lines='skip', low_memory=False)
except Exception as e:
    print(f"Error reading CSV file directly: {e}")
else:
    print("CSV file loaded successfully. Preview:")
    print(test_df.head())

# --- Analysis pipeline: load -> clean -> analyze -> plot ---------------------
# Local import: used only by the error handlers below to show where a failure
# happened.
import traceback

# Errors are reported rather than re-raised so the cell always finishes.
# NOTE: the summary cell further down reads `df`, `genres`,
# `ratings_over_time` and the plot_* helpers defined here; if this block
# fails they stay undefined and that cell will raise NameError.
try:
    # Project-local modules (expected to be importable via the src directory
    # added to sys.path earlier in this cell).
    from data_loading import load_data
    print("Load data function imported successfully.")

    from data_cleaning import clean_data
    from data_analysis import analyze_genres, analyze_ratings_over_time, analyze_budget_revenue_correlation
    from visualization import plot_genres, plot_ratings_over_time, plot_budget_revenue

    df = load_data(data_path)
    print("Data loaded using load_data function. Dataframe columns:")
    print(df.columns)
    print("First few rows of the dataframe:")
    print(df.head())

    # Check for the 'release_date' column explicitly before cleaning so a
    # missing column is reported with a clear message.
    if 'release_date' not in df.columns:
        raise KeyError("The column 'release_date' is not present in the dataframe")

    df = clean_data(df)
    print("Data cleaned. First few rows of the cleaned dataframe:")
    print(df.head())

    genres = analyze_genres(df)
    print("Top genres:")
    print(genres)
    plot_genres(genres)

    ratings_over_time = analyze_ratings_over_time(df)
    print("Ratings over time:")
    print(ratings_over_time)
    plot_ratings_over_time(ratings_over_time)

    correlation = analyze_budget_revenue_correlation(df)
    print("Correlation between budget and revenue:")
    print(correlation)
    plot_budget_revenue(df)

except ModuleNotFoundError as e:
    print(f"ModuleNotFoundError: {e}")
    # Show which import failed and from where.
    traceback.print_exc()
except Exception as e:
    print(f"An error occurred: {e}")
    # The message alone does not say which pipeline step failed; the
    # traceback does.
    traceback.print_exc()


In [None]:
# --- Summary cell: recap text followed by the three charts -------------------
# Relies on `genres`, `ratings_over_time`, `df` and the plot_* helpers that
# the previous cell defines.
summary_lines = (
    "Resumimos os seguintes aspectos do dataset de filmes:",
    "- Gêneros mais populares",
    "- Evolução das notas ao longo do tempo",
    "- Correlação entre orçamento e receita",
)
for summary_line in summary_lines:
    print(summary_line)

import matplotlib.pyplot as plt
import seaborn as sns

# Bar chart of the genre counts; seaborn returns the Axes it drew on, so the
# title is set directly on that Axes.
genre_axes = sns.barplot(x=genres.index, y=genres.values)
genre_axes.set_title('Gêneros Mais Populares')
plt.xticks(rotation=90)  # rotate the x tick labels by 90 degrees
plt.show()

# Ratings trend over time.
ratings_over_time.plot(title='Evolução das Notas ao Longo do Tempo')
plt.show()

# Budget vs. revenue scatter plot.
scatter_axes = sns.scatterplot(x='budget', y='revenue', data=df)
scatter_axes.set_title('Correlação entre Orçamento e Receita')
plt.show()

# Finally draw the same three views with the project's own plotting helpers.
for draw_chart, chart_data in (
    (plot_genres, genres),
    (plot_ratings_over_time, ratings_over_time),
    (plot_budget_revenue, df),
):
    draw_chart(chart_data)
