In [1]:
import pandas as pd

# Read the IMDb TV-series CSV into a DataFrame, then preview its first rows
data = pd.read_csv(filepath_or_buffer="imdb_tv_series.csv")
data.head(n=5)

Unnamed: 0,Title,Year,Runtime_minutes,Genres,Rating,Votes,IMDB_Link
0,Sugarland,(2019– ),,"Animation, Comedy, Family",9.9,33276,https://www.imdb.com/title/tt4057372/
1,The Chosen,(2017– ),54.0,"Drama, History",9.5,23988,https://www.imdb.com/title/tt9471404/
2,Planet Earth II,(2016),298.0,Documentary,9.5,110279,https://www.imdb.com/title/tt5491994/
3,Chernobyl,(2019),330.0,"Drama, History, Thriller",9.4,653836,https://www.imdb.com/title/tt7366338/
4,The Filthy Frank Show,(2011–2017),12.0,"Comedy, Fantasy, Music",9.4,32908,https://www.imdb.com/title/tt4202274/


In [2]:
# List the column names available in the loaded dataset
data.columns

Index(['Title', 'Year', 'Runtime_minutes', 'Genres', 'Rating', 'Votes',
       'IMDB_Link'],
      dtype='object')

In [3]:
# Working frame: only the four columns used downstream. The .copy() makes
# `df` independent of `data`, so later changes to one do not touch the other.
df = data.loc[:, ["Genres", "Title", "Rating", "IMDB_Link"]].copy()
df.head(n=5)

Unnamed: 0,Genres,Title,Rating,IMDB_Link
0,"Animation, Comedy, Family",Sugarland,9.9,https://www.imdb.com/title/tt4057372/
1,"Drama, History",The Chosen,9.5,https://www.imdb.com/title/tt9471404/
2,Documentary,Planet Earth II,9.5,https://www.imdb.com/title/tt5491994/
3,"Drama, History, Thriller",Chernobyl,9.4,https://www.imdb.com/title/tt7366338/
4,"Comedy, Fantasy, Music",The Filthy Frank Show,9.4,https://www.imdb.com/title/tt4202274/


In [4]:
# Build the alphabetically sorted list of distinct genre names.
#
# Each "Genres" cell is a comma-separated string such as "Drama, History".
# Rows with a missing Genres value are dropped first: splitting them yields
# NaN (a float), which cannot be iterated and would raise a TypeError below.
all_genres = df["Genres"].dropna().str.split(",").tolist()

# Strip whitespace *before* de-duplicating so that "Drama" and " Drama" count
# as the same genre, and ignore empty fragments left by stray commas. A single
# set comprehension plus one sort replaces the earlier sort/strip/re-sort passes.
genres = sorted(
    {
        name.strip()
        for parts in all_genres
        for name in parts
        if name.strip()
    }
)
genres

['Action',
 'Adventure',
 'Animation',
 'Biography',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Family',
 'Fantasy',
 'Game-Show',
 'History',
 'Horror',
 'Music',
 'Musical',
 'Mystery',
 'News',
 'Reality-TV',
 'Romance',
 'Sci-Fi',
 'Short',
 'Sport',
 'Talk-Show',
 'Thriller',
 'War',
 'Western']

In [5]:
# Number of distinct genres; the encoding loop emits one 0/1 flag per genre for every show
len(genres)

26

In [6]:
# Re-display the first rows of the working frame before it is encoded
df.head()

Unnamed: 0,Genres,Title,Rating,IMDB_Link
0,"Animation, Comedy, Family",Sugarland,9.9,https://www.imdb.com/title/tt4057372/
1,"Drama, History",The Chosen,9.5,https://www.imdb.com/title/tt9471404/
2,Documentary,Planet Earth II,9.5,https://www.imdb.com/title/tt5491994/
3,"Drama, History, Thriller",Chernobyl,9.4,https://www.imdb.com/title/tt7366338/
4,"Comedy, Fantasy, Music",The Filthy Frank Show,9.4,https://www.imdb.com/title/tt4202274/


In [7]:
# Empty accumulators filled by the encoding loop:
#   final_data - one feature row per show
#   tv_titles  - one (title, index, link) tuple per show
final_data, tv_titles = [], []

In [8]:
# Show the frame through the notebook's rich display rather than print():
# a bare last expression is rendered as a table, and pandas still truncates
# long frames to their first and last rows.
df

                                     Genres                  Title  Rating  \
0     Animation, Comedy, Family                          Sugarland     9.9   
1                Drama, History                         The Chosen     9.5   
2                   Documentary                    Planet Earth II     9.5   
3      Drama, History, Thriller                          Chernobyl     9.4   
4        Comedy, Fantasy, Music              The Filthy Frank Show     9.4   
...                                     ...                    ...     ...   
1414   Action, Adventure, Drama                    Y: The Last Man     6.0   
1415   Action, Adventure, Crime                        Magnum P.I.     6.0   
1416      Comedy, Drama, Family                      Austin & Ally     6.0   
1417      Crime, Drama, Fantasy                     The Irregulars     6.0   
1418            Comedy, Romance                               Joey     6.0   

                                   IMDB_Link  
0      https://w

In [9]:
# For every show build:
#   * a (title, index, link) tuple appended to tv_titles, and
#   * a feature row appended to final_data: one 0/1 flag per entry of `genres`
#     (1 if the show has that genre, 0 otherwise) followed by its IMDb rating.
#
# The lists are reset here so that re-running this cell rebuilds the data
# instead of appending a second copy of every row.
final_data = []
tv_titles = []

# Iterate the needed columns in lockstep rather than calling df.loc[i][col]
# four times per row, which materialises a full row Series on every lookup.
for i, title, link, genre_cell, rating in zip(
    df.index, df["Title"], df["IMDB_Link"], df["Genres"], df["Rating"]
):
    tv_titles.append((title.strip(), i, link.strip()))

    # A missing Genres value is not a string; treat it as "no genres".
    names = genre_cell.split(",") if isinstance(genre_cell, str) else []
    # Set membership keeps the per-genre test below constant-time.
    show_genres = {g.strip() for g in names}

    # One-hot encode in the order of `genres`, then add the IMDb score last.
    genres_list = [1 if g in show_genres else 0 for g in genres]
    genres_list.append(rating)
    final_data.append(genres_list)

In [12]:
# Peek at the first entry of tv_titles: a (title, row index, IMDb link) tuple
print(tv_titles[0])

('Sugarland', 0, 'https://www.imdb.com/title/tt4057372/')


In [13]:
# Peek at the first feature row: one 0/1 flag per genre, in the order of
# `genres` (1 = the show is in that genre, 0 = it is not), followed by the
# IMDb score as the last element.
print(final_data[0])

[0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9.9]


In [14]:
# Persist the encoded feature rows and the title lookup as JSON files.
import json
import os

data_dump = "./Data/tv_data.json"
titles_dump = "./Data/tv_titles.json"

# open() does not create missing parent directories, so make sure the output
# folder exists; otherwise a fresh checkout fails with FileNotFoundError.
os.makedirs(os.path.dirname(data_dump), exist_ok=True)
os.makedirs(os.path.dirname(titles_dump), exist_ok=True)

# Plain "w" is sufficient: the files are only written here, never read back.
with open(data_dump, "w", encoding="utf-8") as f:
    json.dump(final_data, f)
# The tuples in tv_titles are serialised as JSON arrays: [title, index, link].
with open(titles_dump, "w", encoding="utf-8") as f:
    json.dump(tv_titles, f)