In [1]:
import pandas as pd
import os
from IPython.display import display, HTML
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Build the dataset path from the current working directory, then load the
# whole CSV into the notebook-wide DataFrame `df`.
working_dir = os.getcwd()
TMDB_filename = os.path.join(working_dir, "TMDB_tv_dataset_v3.csv")
df = pd.read_csv(filepath_or_buffer=TMDB_filename)

Keywords searched for in `cleaned_overview`: family, life, love, young, school, man, girl, father

In [3]:
# Keep every show whose cleaned overview contains "family" as a standalone,
# case-insensitive word; a missing overview counts as no match (na=False).
family_mask = df['cleaned_overview'].str.contains(r'\bfamily\b', case=False, na=False)
family_shows = df.loc[family_mask]

# Render name/popularity as an HTML table inside a box capped at 300px that
# scrolls vertically.
html_output = family_shows.loc[:, ['name', 'popularity']].to_html()
scroll_box = f'<div style="max-height: 300px; overflow-y: scroll;">{html_output}</div>'
display(HTML(scroll_box))

Unnamed: 0,name,popularity
0,Game of Thrones,1083.917
3,The Walking Dead,489.746
7,Breaking Bad,247.632
15,The Simpsons,439.488
17,The Umbrella Academy,51.859
18,Peaky Blinders,344.477
21,Rick and Morty,429.047
26,Wednesday,164.275
30,Vikings,294.196
31,Dark,86.861


In [7]:
# Binary feature: 1 when "family" occurs as a whole word (any letter case) in
# the overview, 0 otherwise. na=False already maps missing overviews to False,
# so no separate fillna pass is needed before matching.
family_flag = df['cleaned_overview'].str.contains(r'\bfamily\b', case=False, na=False)
df['has_family'] = family_flag.astype(int)

# Spot-check the new column next to the first five overviews.
print(df.loc[:, ['cleaned_overview', 'has_family']].head())

                                    cleaned_overview  has_family
0  seven noble family fight control mythical land...           1
1  carry biggest heist history mysterious man cal...           0
2  young boy vanishes small town uncovers mystery...           0
3  sheriff deputy rick grime awakens coma find po...           1
4  bored unhappy lord hell lucifer morningstar ab...           0


In [4]:
# Keep every show whose cleaned overview contains "life" as a standalone,
# case-insensitive word; a missing overview counts as no match (na=False).
life_mask = df['cleaned_overview'].str.contains(r'\blife\b', case=False, na=False)
life_shows = df.loc[life_mask]

# Render name/popularity as an HTML table inside a box capped at 300px that
# scrolls vertically.
html_output = life_shows.loc[:, ['name', 'popularity']].to_html()
scroll_box = f'<div style="max-height: 300px; overflow-y: scroll;">{html_output}</div>'
display(HTML(scroll_box))

Unnamed: 0,name,popularity
5,Riverdale,143.75
8,The Good Doctor,681.614
10,The Flash,319.767
13,Grey's Anatomy,1647.218
15,The Simpsons,439.488
21,Rick and Morty,429.047
27,Friends,214.333
28,Supernatural,439.934
33,Cobra Kai,118.256
39,Naruto,135.573


In [10]:
# Binary feature: 1 when "life" occurs as a whole word (any letter case) in
# the overview, 0 otherwise. na=False already maps missing overviews to False,
# so no separate fillna pass is needed before matching.
life_flag = df['cleaned_overview'].str.contains(r'\blife\b', case=False, na=False)
df['has_life'] = life_flag.astype(int)

# Spot-check the new column next to the first five overviews.
print(df.loc[:, ['cleaned_overview', 'has_life']].head())

                                    cleaned_overview  has_life
0  seven noble family fight control mythical land...         0
1  carry biggest heist history mysterious man cal...         0
2  young boy vanishes small town uncovers mystery...         0
3  sheriff deputy rick grime awakens coma find po...         0
4  bored unhappy lord hell lucifer morningstar ab...         0


In [5]:
# Keep every show whose cleaned overview contains "love" as a standalone,
# case-insensitive word; a missing overview counts as no match (na=False).
love_mask = df['cleaned_overview'].str.contains(r'\blove\b', case=False, na=False)
love_shows = df.loc[love_mask]

# Render name/popularity as an HTML table inside a box capped at 300px that
# scrolls vertically.
html_output = love_shows.loc[:, ['name', 'popularity']].to_html()
scroll_box = f'<div style="max-height: 300px; overflow-y: scroll;">{html_output}</div>'
display(HTML(scroll_box))

Unnamed: 0,name,popularity
16,Euphoria,197.456
51,Anne with an E,64.727
68,Teen Wolf,162.181
96,Love Is In The Air,261.128
114,SpongeBob SquarePants,47.867
145,Spartacus,163.932
169,The Kardashians,86.223
183,Love Alarm,28.929
189,The Crown,52.673
210,Marvel's Agent Carter,30.075


In [12]:
# Binary feature: 1 when "love" occurs as a whole word (any letter case) in
# the overview, 0 otherwise. na=False already maps missing overviews to False,
# so no separate fillna pass is needed before matching.
love_flag = df['cleaned_overview'].str.contains(r'\blove\b', case=False, na=False)
df['has_love'] = love_flag.astype(int)

# Spot-check the new column next to the first five overviews.
print(df.loc[:, ['cleaned_overview', 'has_love']].head())

                                    cleaned_overview  has_love
0  seven noble family fight control mythical land...         0
1  carry biggest heist history mysterious man cal...         0
2  young boy vanishes small town uncovers mystery...         0
3  sheriff deputy rick grime awakens coma find po...         0
4  bored unhappy lord hell lucifer morningstar ab...         0


In [14]:
# Keep every show whose cleaned overview contains "young" as a standalone,
# case-insensitive word; a missing overview counts as no match (na=False).
young_mask = df['cleaned_overview'].str.contains(r'\byoung\b', case=False, na=False)
young_shows = df.loc[young_mask]

# Render name/popularity as an HTML table inside a box capped at 300px that
# scrolls vertically.
html_output = young_shows.loc[:, ['name', 'popularity']].to_html()
scroll_box = f'<div style="max-height: 300px; overflow-y: scroll;">{html_output}</div>'
display(HTML(scroll_box))

Unnamed: 0,name,popularity
2,Stranger Things,185.711
8,The Good Doctor,681.614
21,Rick and Morty,429.047
27,Friends,214.333
50,Suits,745.23
51,Anne with an E,64.727
57,Mr. Robot,61.986
61,Marvel's Daredevil,53.036
63,Malcolm in the Middle,188.836
67,The Queen's Gambit,49.644


In [15]:
# Binary feature: 1 when "young" occurs as a whole word (any letter case) in
# the overview, 0 otherwise. na=False already maps missing overviews to False,
# so no separate fillna pass is needed before matching.
young_flag = df['cleaned_overview'].str.contains(r'\byoung\b', case=False, na=False)
df['has_young'] = young_flag.astype(int)

# Spot-check the new column next to the first five overviews.
print(df.loc[:, ['cleaned_overview', 'has_young']].head())

                                    cleaned_overview  has_young
0  seven noble family fight control mythical land...          0
1  carry biggest heist history mysterious man cal...          0
2  young boy vanishes small town uncovers mystery...          1
3  sheriff deputy rick grime awakens coma find po...          0
4  bored unhappy lord hell lucifer morningstar ab...          0


In [17]:
# Keep every show whose cleaned overview contains "school" as a standalone,
# case-insensitive word; a missing overview counts as no match (na=False).
school_mask = df['cleaned_overview'].str.contains(r'\bschool\b', case=False, na=False)
school_shows = df.loc[school_mask]

# Render name/popularity as an HTML table inside a box capped at 300px that
# scrolls vertically.
html_output = school_shows.loc[:, ['name', 'popularity']].to_html()
scroll_box = f'<div style="max-height: 300px; overflow-y: scroll;">{html_output}</div>'
display(HTML(scroll_box))

Unnamed: 0,name,popularity
16,Euphoria,197.456
19,Elite,100.711
21,Rick and Morty,429.047
26,Wednesday,164.275
29,Sex Education,1008.977
41,Rebelde,92.914
49,My Hero Academia,31.82
50,Suits,745.23
55,Miraculous: Tales of Ladybug & Cat Noir,358.394
68,Teen Wolf,162.181


In [18]:
# Binary feature: 1 when "school" occurs as a whole word (any letter case) in
# the overview, 0 otherwise. na=False already maps missing overviews to False,
# so no separate fillna pass is needed before matching.
school_flag = df['cleaned_overview'].str.contains(r'\bschool\b', case=False, na=False)
df['has_school'] = school_flag.astype(int)

# Spot-check the new column next to the first five overviews.
print(df.loc[:, ['cleaned_overview', 'has_school']].head())

                                    cleaned_overview  has_school
0  seven noble family fight control mythical land...           0
1  carry biggest heist history mysterious man cal...           0
2  young boy vanishes small town uncovers mystery...           0
3  sheriff deputy rick grime awakens coma find po...           0
4  bored unhappy lord hell lucifer morningstar ab...           0


In [20]:
# Keep every show whose cleaned overview contains "man" as a standalone,
# case-insensitive word; a missing overview counts as no match (na=False).
man_mask = df['cleaned_overview'].str.contains(r'\bman\b', case=False, na=False)
man_shows = df.loc[man_mask]

# Render name/popularity as an HTML table inside a box capped at 300px that
# scrolls vertically.
html_output = man_shows.loc[:, ['name', 'popularity']].to_html()
scroll_box = f'<div style="max-height: 300px; overflow-y: scroll;">{html_output}</div>'
display(HTML(scroll_box))

Unnamed: 0,name,popularity
1,Money Heist,96.354
10,The Flash,319.767
21,Rick and Morty,429.047
35,Arrow,174.419
48,Prison Break,213.262
54,Better Call Saul,104.469
69,Smallville,243.557
84,Pablo Escobar: The Drug Lord,359.573
91,Siren,49.087
92,Gotham,100.464


In [21]:
# Binary feature: 1 when "man" occurs as a whole word (any letter case) in
# the overview, 0 otherwise. na=False already maps missing overviews to False,
# so no separate fillna pass is needed before matching.
man_flag = df['cleaned_overview'].str.contains(r'\bman\b', case=False, na=False)
df['has_man'] = man_flag.astype(int)

# Spot-check the new column next to the first five overviews.
print(df.loc[:, ['cleaned_overview', 'has_man']].head())

                                    cleaned_overview  has_man
0  seven noble family fight control mythical land...        0
1  carry biggest heist history mysterious man cal...        1
2  young boy vanishes small town uncovers mystery...        0
3  sheriff deputy rick grime awakens coma find po...        0
4  bored unhappy lord hell lucifer morningstar ab...        0


In [23]:
# Keep every show whose cleaned overview contains "girl" as a standalone,
# case-insensitive word; a missing overview counts as no match (na=False).
girl_mask = df['cleaned_overview'].str.contains(r'\bgirl\b', case=False, na=False)
girl_shows = df.loc[girl_mask]

# Render name/popularity as an HTML table inside a box capped at 300px that
# scrolls vertically.
html_output = girl_shows.loc[:, ['name', 'popularity']].to_html()
scroll_box = f'<div style="max-height: 300px; overflow-y: scroll;">{html_output}</div>'
display(HTML(scroll_box))

Unnamed: 0,name,popularity
2,Stranger Things,185.711
22,The Vampire Diaries,633.313
43,The Act,20.258
51,Anne with an E,64.727
59,The Last of Us,145.629
64,Family Guy,1073.325
65,13 Reasons Why,131.916
67,The Queen's Gambit,49.644
91,Siren,49.087
108,Dragon Ball,7.884


In [24]:
# Binary feature: 1 when "girl" occurs as a whole word (any letter case) in
# the overview, 0 otherwise. na=False already maps missing overviews to False,
# so no separate fillna pass is needed before matching.
girl_flag = df['cleaned_overview'].str.contains(r'\bgirl\b', case=False, na=False)
df['has_girl'] = girl_flag.astype(int)

# Spot-check the new column next to the first five overviews.
print(df.loc[:, ['cleaned_overview', 'has_girl']].head())

                                    cleaned_overview  has_girl
0  seven noble family fight control mythical land...         0
1  carry biggest heist history mysterious man cal...         0
2  young boy vanishes small town uncovers mystery...         1
3  sheriff deputy rick grime awakens coma find po...         0
4  bored unhappy lord hell lucifer morningstar ab...         0


In [26]:
# Keep every show whose cleaned overview contains "father" as a standalone,
# case-insensitive word; a missing overview counts as no match (na=False).
father_mask = df['cleaned_overview'].str.contains(r'\bfather\b', case=False, na=False)
father_shows = df.loc[father_mask]

# Render name/popularity as an HTML table inside a box capped at 300px that
# scrolls vertically.
html_output = father_shows.loc[:, ['name', 'popularity']].to_html()
scroll_box = f'<div style="max-height: 300px; overflow-y: scroll;">{html_output}</div>'
display(HTML(scroll_box))

Unnamed: 0,name,popularity
17,The Umbrella Academy,51.859
28,Supernatural,439.934
53,How I Met Your Mother,319.144
71,Invincible,74.551
80,House of the Dragon,182.641
94,The Originals,99.452
111,The End of the F***ing World,31.754
150,See,197.199
179,American Dad!,196.512
182,InuYasha,89.589


In [27]:
# Binary feature: 1 when "father" occurs as a whole word (any letter case) in
# the overview, 0 otherwise. na=False already maps missing overviews to False,
# so no separate fillna pass is needed before matching.
father_flag = df['cleaned_overview'].str.contains(r'\bfather\b', case=False, na=False)
df['has_father'] = father_flag.astype(int)

# Spot-check the new column next to the first five overviews.
print(df.loc[:, ['cleaned_overview', 'has_father']].head())

                                    cleaned_overview  has_father
0  seven noble family fight control mythical land...           0
1  carry biggest heist history mysterious man cal...           0
2  young boy vanishes small town uncovers mystery...           0
3  sheriff deputy rick grime awakens coma find po...           0
4  bored unhappy lord hell lucifer morningstar ab...           0


In [30]:
# Print, for each keyword flag, how many shows have it set to 0 versus 1.
# The order matches the original sequence of individual print calls.
for flag_column in (
    'has_father', 'has_girl', 'has_man', 'has_school',
    'has_young', 'has_family', 'has_life', 'has_love',
):
    print(df[flag_column].value_counts())


has_father
0    165738
1      2901
Name: count, dtype: int64
has_girl
0    163896
1      4743
Name: count, dtype: int64
has_man
0    164446
1      4193
Name: count, dtype: int64
has_school
0    164457
1      4182
Name: count, dtype: int64
has_young
0    162654
1      5985
Name: count, dtype: int64
has_family
0    160275
1      8364
Name: count, dtype: int64
has_life
0    153500
1     15139
Name: count, dtype: int64
has_love
0    160945
1      7694
Name: count, dtype: int64


In [31]:
# For each keyword flag, report the mean popularity of the shows that carry it.
popularity_column = 'popularity'
features = [
    'has_father', 'has_girl', 'has_man', 'has_school',
    'has_young', 'has_family', 'has_life', 'has_love',
]

for feature in features:
    # Select the popularity values of flagged rows in one .loc step.
    flagged_rows = df[feature] == 1
    avg_popularity = df.loc[flagged_rows, popularity_column].mean()
    print(f"Average popularity for shows with '{feature}': {avg_popularity:.2f}")


Average popularity for shows with 'has_father': 9.98
Average popularity for shows with 'has_girl': 8.61
Average popularity for shows with 'has_man': 9.98
Average popularity for shows with 'has_school': 10.74
Average popularity for shows with 'has_young': 11.16
Average popularity for shows with 'has_family': 11.93
Average popularity for shows with 'has_life': 10.26
Average popularity for shows with 'has_love': 9.15


In [None]:
# Persist the frame, including the has_* keyword columns added above, back to
# the CSV it was loaded from. NOTE: this overwrites the input dataset in place.
# Reuse TMDB_filename (the path given to read_csv) instead of repeating the
# file-name literal, so the read and write locations cannot drift apart.
df.to_csv(TMDB_filename, index=False)