In [None]:
import os
import pandas as pd
from sqlalchemy import create_engine,MetaData, Table, Column, Integer, String, ForeignKey , Numeric, Date ,TEXT
from dotenv import load_dotenv




In [35]:
load_dotenv()
username=os.getenv("user")
password=os.getenv("psw")
db= os.getenv("db")
host=os.getenv("host")
port=os.getenv("port")
engine=create_engine(f"mysql+pymysql://{username}:{password}@{host}:{port}/{db}")

In [36]:
df=pd.read_csv("tmdb_movies_2021_2025.csv")
summary = pd.DataFrame({
    'dtype': df.dtypes,
    'null_count': df.isnull().sum(),
    'null_percent': (df.isnull().sum() / len(df)) * 100
})
summary

Unnamed: 0,dtype,null_count,null_percent
tmdb_id,int64,0,0.0
title,str,3,0.00129
original_title,str,3,0.00129
release_date,str,0,0.0
genres,str,48549,20.873569
vote_average,float64,0,0.0
vote_count,int64,0,0.0
popularity,float64,0,0.0
original_language,str,0,0.0
overview,str,34941,15.02283


In [37]:
df.dropna(subset=["title","tmdb_id"],inplace =True)
df=df.drop("poster_url",axis=1)
df['genres'] = df['genres'].fillna('Unknown')
df['overview'] = df['overview'].fillna('No overview available')

In [38]:
df["genres_list"] = df["genres"].str.split('|')
df_exploded=df.explode("genres_list").rename(columns={"genres_list":"genre_name"})
unique_genres = df_exploded["genre_name"].dropna().unique()
df_dim_genre = pd.DataFrame({
    'genre_id': range(1, len(unique_genres)+1),
    'genre': unique_genres
})
df_exploded = df_exploded.merge(df_dim_genre, left_on="genre_name", right_on="genre", how="left")

In [39]:
df_dim_movie = df[["tmdb_id","title","original_title","release_date","overview","original_language"]]
df_bridge_movie_genre=df_exploded[["tmdb_id","genre_id"]].drop_duplicates()
df_fact_movies=df[["tmdb_id","vote_average","vote_count","popularity"]]

In [None]:
metadata = MetaData()
dim_movie = Table(
    "dim_movie", metadata,
    Column("tmdb_id", Integer, primary_key=True),
    Column("title", String(250)),
    Column("original_title", String(250)),
    Column("release_date", Date ),
    Column("overview", TEXT),
    Column("original_language", String(5))
)
fact_movie_stats = Table(
    "fact_movie_stats", metadata,
    Column("tmdb_id", Integer, ForeignKey("dim_movie.tmdb_id")),
    Column("vote_average", Numeric(8,4) ),
    Column("vote_count",Integer ),
    Column("popularity", Numeric(8,4) )
)
dim_genre = Table(
    "dim_genre", metadata , 
    Column("genre_id" , Integer , primary_key=True),
    Column("genre" , String(30) )
)
bridge_movie_genre = Table(
    "bridge_movie_genre", metadata , 
    Column("tmdb_id" , Integer, ForeignKey("dim_movie.tmdb_id")),
    Column("genre_id" , Integer , ForeignKey("dim_genre.genre_id") )
)
metadata.create_all(engine)

In [44]:

df_dim_movie.to_sql('dim_movie', con=engine, if_exists='append', index=False)
df_fact_movies.to_sql('fact_movie_stats', con=engine, if_exists='append', index=False)
df_dim_genre.to_sql('dim_genre', con=engine, if_exists='append', index=False)
df_bridge_movie_genre.to_sql('bridge_movie_genre', con=engine, if_exists='append', index=False)

DatabaseError: Execution failed on sql 'INSERT INTO dim_movie (tmdb_id, title, original_title, release_date, overview, original_language) VALUES (:tmdb_id, :title, :original_title, :release_date, :overview, :original_language)': (pymysql.err.IntegrityError) (1062, "Duplicate entry '776797' for key 'dim_movie.PRIMARY'")
[SQL: INSERT INTO dim_movie (tmdb_id, title, original_title, release_date, overview, original_language) VALUES (%(tmdb_id)s, %(title)s, %(original_title)s, %(release_date)s, %(overview)s, %(original_language)s)]
[parameters: [{'tmdb_id': 776797, 'title': 'The Sadness', 'original_title': '哭悲', 'release_date': '2021-01-22', 'overview': 'A young couple is pushed to the limits of sanity as they attempt to be reunited amid the chaos of a pandemic outbreak. The streets erupt into violence and depravity, as those infected are driven to enact the most cruel and ghastly things imaginable.', 'original_language': 'zh'}, {'tmdb_id': 630586, 'title': 'Wrong Turn', 'original_title': 'Wrong Turn', 'release_date': '2021-01-26', 'overview': 'Jen and a group of friends set out to hike the Appalachian Trail. Despite warnings to stick to the trail, the hikers stray off course—and cross into land inhabited by The Foundation, a hidden community of mountain dwellers who use deadly means to protect their way of life.', 'original_language': 'en'}, {'tmdb_id': 602269, 'title': 'The Little Things', 'original_title': 'The Little Things', 'release_date': '2021-01-28', 'overview': 'Deputy Sheriff Joe "Deke" Deacon joins forces with Sgt. Jim Baxter to search for a serial killer who\'s terrorizing Los Angeles. As they track the cu ... (7 characters truncated) ... Baxter is unaware that the investigation is dredging up echoes of Deke\'s past, uncovering disturbing secrets that could threaten more than his case.', 'original_language': 'en'}, {'tmdb_id': 581734, 'title': 'Nomadland', 'original_title': 'Nomadland', 'release_date': '2021-01-29', 'overview': 'A woman in her sixties embarks on a journey through the western United States after losing everything in the Great Recession, living as a van-dwelling modern-day nomad.', 'original_language': 'en'}, {'tmdb_id': 634528, 'title': 'The Marksman', 'original_title': 'The Marksman', 'release_date': '2021-01-15', 'overview': 'Jim Hanson’s quiet life is suddenly disturbed by two people crossing the US/Mexico border – a woman and her young son – desperate to flee a Mexican c ... (83 characters truncated) ... ender. He embraces his role as Miguel’s protector and will stop at nothing to get him to safety, as they go on the run from the relentless assassins.', 'original_language': 'en'}, {'tmdb_id': 550205, 'title': 'Wish Dragon', 'original_title': 'Wish Dragon', 'release_date': '2021-01-15', 'overview': 'Determined teen Din is longing to reconnect with his childhood best friend when he meets a wish-granting dragon who shows him the magic of possibilities.', 'original_language': 'en'}, {'tmdb_id': 587996, 'title': 'Below Zero', 'original_title': 'Bajocero', 'release_date': '2021-01-29', 'overview': 'When a prisoner transfer van is attacked, the cop in charge must fight those inside and outside while dealing with a silent foe: the icy temperatures.', 'original_language': 'es'}, {'tmdb_id': 586101, 'title': 'Music', 'original_title': 'Music', 'release_date': '2021-01-14', 'overview': 'Zu, a free spirit estranged from her family, suddenly finds herself the sole guardian of her half-sister, Music, a teenager on the autism spectrum wh ... (92 characters truncated) ... allenges whether it is Zu or Music who has a better view of the world, and that love, trust, and being able to be there for each other is everything.', 'original_language': 'en'}  ... displaying 10 of 232583 total bound parameter sets ...  {'tmdb_id': 1366585, 'title': 'The "Set-Up" Boxer', 'original_title': 'الملاكم المُعدّ', 'release_date': '2025-12-15', 'overview': "Tired of being the punching bag of Alexandria's worst boxers, A young boxer decides to confront a money collector that works in the corrupted betting business in the underground boxing championships.", 'original_language': 'ar'}, {'tmdb_id': 1330029, 'title': 'Shared Bodies', 'original_title': 'Shared Bodies', 'release_date': '2025-12-01', 'overview': 'No overview available', 'original_language': 'en'}]]
(Background on this error at: https://sqlalche.me/e/20/gkpj)