In [1]:
import pandas as pd
import langchain
import faiss
import openai
import streamlit as st
import numpy as np
import logging

In [2]:
names = pd.read_table("/Users/jhajsidhu/Downloads/imdb_data/name.basics.tsv", low_memory = False, encoding_errors = "ignore")
basics = pd.read_table("/Users/jhajsidhu/Downloads/imdb_data/title.basics.tsv", low_memory = False, encoding_errors = "ignore")
crews = pd.read_table("/Users/jhajsidhu/Downloads/imdb_data/title.crew.tsv", low_memory = False, encoding_errors = "ignore")
principals = pd.read_table("/Users/jhajsidhu/Downloads/imdb_data/title.principals.tsv", low_memory = False, encoding_errors = "ignore")
ratings = pd.read_table("/Users/jhajsidhu/Downloads/imdb_data/title.ratings.tsv", low_memory = False, encoding_errors = "ignore")

In [3]:
names = names.drop_duplicates(subset = "nconst")
basics = basics.drop_duplicates(subset = "tconst")
crews = crews.drop_duplicates(subset = "tconst")
principals = principals.drop_duplicates(subset = "tconst")
ratings = ratings.drop_duplicates(subset = "tconst")

In [4]:
names = names.replace("\\N", np.nan)
names_columns = ["nconst", "primaryName", "birthYear", "deathYear", "primaryProfession"]
names = names[names_columns]
names["birthYear"] = pd.to_numeric(names["birthYear"], errors = "coerce").astype("Int64")
names["deathYear"] = pd.to_numeric(names["deathYear"], errors = "coerce").astype("Int64")
names = names.dropna(subset = ["nconst", "primaryName", "birthYear", "deathYear"])
print(names.head())

      nconst     primaryName  birthYear  deathYear  \
0  nm0000001    Fred Astaire       1899       1987   
1  nm0000002   Lauren Bacall       1924       2014   
3  nm0000004    John Belushi       1949       1982   
4  nm0000005  Ingmar Bergman       1918       2007   
5  nm0000006  Ingrid Bergman       1915       1982   

                    primaryProfession  
0        actor,miscellaneous,producer  
1  actress,soundtrack,archive_footage  
3       actor,writer,music_department  
4               writer,director,actor  
5         actress,producer,soundtrack  


In [5]:
basics = basics.replace("\\N", np.nan)
basics_columns = ["tconst", "primaryTitle", "startYear", "genres"]
basics = basics[basics_columns]
basics["startYear"] = pd.to_numeric(basics["startYear"], errors = "coerce").astype("Int64")
basics = basics.dropna(subset = ["tconst", "primaryTitle", "startYear"])
print(basics.head())

      tconst            primaryTitle  startYear                    genres
0  tt0000001              Carmencita       1894         Documentary,Short
1  tt0000002  Le clown et ses chiens       1892           Animation,Short
2  tt0000003            Poor Pierrot       1892  Animation,Comedy,Romance
3  tt0000004             Un bon bock       1892           Animation,Short
4  tt0000005        Blacksmith Scene       1893                     Short


In [6]:
crews = crews.replace("\\N", np.nan)
crews_columns = ["tconst", "directors", "writers"]
crews = crews[crews_columns]
crews = crews.dropna(subset = ["directors", "writers"], how = "all")
crews = crews.dropna(subset = ["tconst"])
print(crews.head())

      tconst  directors    writers
0  tt0000001  nm0005690        NaN
1  tt0000002  nm0721526        NaN
2  tt0000003  nm0721526  nm0721526
3  tt0000004  nm0721526        NaN
4  tt0000005  nm0005690        NaN


In [7]:
principals = principals.replace("\\N", np.nan)
principals_columns = ["tconst", "ordering", "nconst", "category", "job", "characters"]
principals = principals[principals_columns]
principals = principals.dropna(subset = ["tconst", "category"])
print(principals.head())

       tconst  ordering     nconst  category  job      characters
0   tt0000001         1  nm1588970      self  NaN        ["Self"]
4   tt0000002         1  nm0721526  director  NaN             NaN
6   tt0000003         1  nm0721526  director  NaN             NaN
12  tt0000004         1  nm0721526  director  NaN             NaN
14  tt0000005         1  nm0443482     actor  NaN  ["Blacksmith"]


In [8]:
ratings = ratings.replace("\\N", np.nan)
ratings_columns = ["tconst", "averageRating", "numVotes"]
ratings = ratings[ratings_columns]
ratings["numVotes"] = pd.to_numeric(ratings["numVotes"], errors = "coerce").astype("Int64")
ratings = ratings.dropna(subset = ["tconst", "averageRating", "numVotes"])
print(ratings.head())

      tconst  averageRating  numVotes
0  tt0000001            5.7      2163
1  tt0000002            5.5       296
2  tt0000003            6.5      2217
3  tt0000004            5.3       189
4  tt0000005            6.2      2955


In [12]:
movies = pd.merge(basics, ratings, on = "tconst", how = "left")
movies = pd.merge(movies, crews, on = "tconst", how = "left")
get_name = dict(zip(names["nconst"], names["primaryName"]))
movies["directorNames"] = movies["directors"].map(get_name)
movies["writerNames"] = movies["writers"].map(get_name)
movies_columns = ["tconst", "primaryTitle", "startYear", "genres", "averageRating", "numVotes", "directorNames", "writerNames"]
movies = movies[movies_columns]
cast = principals[principals["category"].isin(["actor", "actress"])]
cast = cast.merge(names[["nconst", "primaryName"]], on = "nconst", how = "left")
cast["ordering"] = cast["ordering"].astype(int)
top_cast = (cast.sort_values(["tconst", "ordering"]).groupby("tconst").head(5).groupby("tconst")["primaryName"].apply(lambda names_list: ", ".join(str(name) for name in names_list if isinstance(name, str) and name.strip())).reset_index().rename(columns = {"primaryName": "cast"}))
movies = movies.merge(top_cast, on = "tconst", how = "left")
print(movies.head())

      tconst            primaryTitle  startYear                    genres  \
0  tt0000001              Carmencita       1894         Documentary,Short   
1  tt0000002  Le clown et ses chiens       1892           Animation,Short   
2  tt0000003            Poor Pierrot       1892  Animation,Comedy,Romance   
3  tt0000004             Un bon bock       1892           Animation,Short   
4  tt0000005        Blacksmith Scene       1893                     Short   

   averageRating  numVotes         directorNames    writerNames  \
0            5.7      2163  William K.L. Dickson            NaN   
1            5.5       296         Émile Reynaud            NaN   
2            6.5      2217         Émile Reynaud  Émile Reynaud   
3            5.3       189         Émile Reynaud            NaN   
4            6.2      2955  William K.L. Dickson            NaN   

             cast  
0             NaN  
1             NaN  
2             NaN  
3             NaN  
4  Charles Kayser  


In [14]:
movies["genre_list"] = movies["genres"].str.split(",")
movies_sorted = movies.explode("genre_list").rename(columns = {"genre_list": "genre"})
movies_sorted["genre"] = movies_sorted["genre"].str.strip()
movies_sorted = movies_sorted[movies_sorted["genre"] != ""]
print(movies_sorted[["primaryTitle", "genre"]].head())

             primaryTitle        genre
0              Carmencita  Documentary
0              Carmencita        Short
1  Le clown et ses chiens    Animation
1  Le clown et ses chiens        Short
2            Poor Pierrot    Animation
