# **Data Understanding**
Mahsa Ileslamlou 8/31/2024

In [10]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns

### **1. Build DataFrame**

In [11]:
# Build the dataset path from the current working directory, then read the
# CSV into the DataFrame used by every cell below.
TMDB_filename = os.path.join(os.getcwd(), "TMDB_tv_dataset_v3.csv")
df = pd.read_csv(filepath_or_buffer=TMDB_filename)

In [12]:
# Preview the first ten rows to see what the columns and values look like.
df.head(n=10)

Unnamed: 0,id,name,number_of_seasons,number_of_episodes,original_language,vote_count,vote_average,overview,adult,backdrop_path,...,tagline,genres,created_by,languages,networks,origin_country,spoken_languages,production_companies,production_countries,episode_run_time
0,1399,Game of Thrones,8,73,en,21857,8.442,Seven noble families fight for control of the ...,False,/2OMB0ynKlyIenMJWI2Dy9IWT4c.jpg,...,Winter Is Coming,"Sci-Fi & Fantasy, Drama, Action & Adventure","David Benioff, D.B. Weiss",en,HBO,US,English,"Revolution Sun Studios, Television 360, Genera...","United Kingdom, United States of America",0
1,71446,Money Heist,3,41,es,17836,8.257,"To carry out the biggest heist in history, a m...",False,/gFZriCkpJYsApPZEF3jhxL4yLzG.jpg,...,The perfect robbery.,"Crime, Drama",Álex Pina,es,"Netflix, Antena 3",ES,Español,Vancouver Media,Spain,70
2,66732,Stranger Things,4,34,en,16161,8.624,"When a young boy vanishes, a small town uncove...",False,/2MaumbgBlW1NoPo3ZJO38A6v7OS.jpg,...,Every ending has a beginning.,"Drama, Sci-Fi & Fantasy, Mystery","Matt Duffer, Ross Duffer",en,Netflix,US,English,"21 Laps Entertainment, Monkey Massacre Product...",United States of America,0
3,1402,The Walking Dead,11,177,en,15432,8.121,Sheriff's deputy Rick Grimes awakens from a co...,False,/x4salpjB11umlUOltfNvSSrjSXm.jpg,...,Fight the dead. Fear the living.,"Action & Adventure, Drama, Sci-Fi & Fantasy",Frank Darabont,en,AMC,US,English,"AMC Studios, Circle of Confusion, Valhalla Mot...",United States of America,42
4,63174,Lucifer,6,93,en,13870,8.486,"Bored and unhappy as the Lord of Hell, Lucifer...",False,/aDBRtunw49UF4XmqfyNuD9nlYIu.jpg,...,It's good to be bad.,"Crime, Sci-Fi & Fantasy",Tom Kapinos,en,"FOX, Netflix",US,English,"Warner Bros. Television, DC Entertainment, Jer...",United States of America,45
5,69050,Riverdale,7,137,en,13180,8.479,"Set in the present, the series offers a bold, ...",False,/soQgquPkLmUu9eKLJJzuA4KZDyi.jpg,...,"To save the future, they must survive the past.","Crime, Drama, Mystery",Roberto Aguirre-Sacasa,en,The CW,US,English,"Warner Bros. Television, Berlanti Productions,...",United States of America,45
6,93405,Squid Game,2,9,ko,13053,7.831,Hundreds of cash-strapped players accept a str...,False,/2meX1nMdScFOoV4370rqHWKmXhY.jpg,...,45.6 billion won is child's play,"Action & Adventure, Mystery, Drama",Hwang Dong-hyuk,"en, ko, ur",Netflix,KR,"English, 한국어/조선말, اردو","Siren Pictures, Firstman Studio",South Korea,0
7,1396,Breaking Bad,5,62,en,12398,8.89,"When Walter White, a New Mexico chemistry teac...",False,/tsRy63Mu5cu8etL1X7ZLyf7UP1M.jpg,...,Change the equation.,"Drama, Crime",Vince Gilligan,"en, de, es",AMC,US,"English, Deutsch, Español","Sony Pictures Television Studios, High Bridge ...",United States of America,0
8,71712,The Good Doctor,6,116,en,11768,8.503,"Shaun Murphy, a young surgeon with autism and ...",False,/xXRsKNJHTOGrs5wfYAxkbM2RiyT.jpg,...,Everyone operates differently.,Drama,David Shore,en,ABC,US,English,"ABC Studios, 3AD, Sony Pictures Television Stu...",United States of America,43
9,85271,WandaVision,1,9,en,11308,8.3,Wanda Maximoff and Vision—two super-powered be...,False,/lOr9NKxh4vMweufMOUDJjJhCRHW.jpg,...,Experience a new vision of reality.,"Sci-Fi & Fantasy, Mystery, Drama",Jac Schaeffer,en,Disney+,US,English,Marvel Studios,United States of America,0


In [13]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(168639, 29)

In [16]:
# Show the dtype of each column in the dataset.
df.dtypes

Unnamed: 0,0
id,int64
name,object
number_of_seasons,int64
number_of_episodes,int64
original_language,object
vote_count,int64
vote_average,float64
overview,object
adult,bool
backdrop_path,object


In [23]:
# Full list of column names as a plain Python list.
df.columns.to_list()

['id',
 'name',
 'number_of_seasons',
 'number_of_episodes',
 'original_language',
 'vote_count',
 'vote_average',
 'overview',
 'adult',
 'backdrop_path',
 'first_air_date',
 'last_air_date',
 'homepage',
 'in_production',
 'original_name',
 'popularity',
 'poster_path',
 'type',
 'status',
 'tagline',
 'genres',
 'created_by',
 'languages',
 'networks',
 'origin_country',
 'spoken_languages',
 'production_companies',
 'production_countries',
 'episode_run_time']

In [18]:
# Inspect the label (target) column: the popularity score.
df["popularity"]

Unnamed: 0,popularity
0,1083.917
1,96.354
2,185.711
3,489.746
4,416.668
...,...
168634,1.400
168635,0.600
168636,1.895
168637,0.600


Below, I inspect the unique values in several columns to determine which ones would be good predictors of the popularity score in our model and which columns we should consider removing.

In [40]:
# Distinct values of the original_language column.
df["original_language"].unique()

array(['en', 'es', 'ko', 'ja', 'de', 'fr', 'tr', 'pt', 'da', 'ca', 'sv',
       'no', 'th', 'it', 'zh', 'ar', 'ru', 'is', 'tl', 'he', 'pl', 'nl',
       'hi', 'fi', 'lb', 'cy', 'gl', 'uk', 'hu', 'cs', 'la', 'ro', 'bg',
       'cn', 'el', 'vi', 'sr', 'ta', 'hr', 'zu', 'fa', 'xx', 'bn', 'id',
       'ms', 'sk', 'ur', 'te', 'sh', 'af', 'kn', 'si', 'ml', 'bs', 'ga',
       'et', 'ab', 'am', 'sq', 'mo', 'ka', 'az', 'nb', 'ku', 'lt', 'lv',
       'eu', 'mr', 'sl', 'hz', 'mi', 'km', 'ne', 'kk', 'as', 'se', 'be',
       'mn', 'pa', 'gu', 'mk', 'mt', 'jv', 'or', 'st', 'sw', 'ti', 'my',
       'so', 'fy', 'za', 'hy', 'uz', 'rm', 'gd', 'av', 'ug', 'ho', 'eo',
       'kv', 'dv', 'lo', 'ht', 'ps', 'ln', 'aa'], dtype=object)

In [32]:
# Distinct values of the vote_count column.
df["vote_count"].unique()

array([21857, 17836, 16161, ...,     2,     1,     0])

In [33]:
# Distinct values of the vote_average column.
df["vote_average"].unique()

array([8.442, 8.257, 8.624, ..., 2.25 , 0.5  , 0.   ])

In [29]:
# Distinct values of the backdrop_path column.
df["backdrop_path"].unique()

array(['/2OMB0ynKlyIenMJWI2Dy9IWT4c.jpg',
       '/gFZriCkpJYsApPZEF3jhxL4yLzG.jpg',
       '/2MaumbgBlW1NoPo3ZJO38A6v7OS.jpg', ...,
       '/rfZx8RZEfNcUNNu91qCIeRZ2x4i.jpg',
       '/jS1ftJ6FfnAuj5No45v5pxAiUQa.jpg',
       '/fseBBbvHtdcZS5M7bSjOxYsVe5.jpg'], dtype=object)

In [28]:
# Distinct values of the poster_path column.
df["poster_path"].unique()

array(['/1XS1oqL89opfnbLl8WnZY1O1uJx.jpg',
       '/reEMJA1uzscCbkpeRJeTT2bjqUp.jpg',
       '/49WJfeN0moxb9IPfGn8AIqMGskD.jpg', ...,
       '/vvm7p60THjcYdnXnXfjCgwPeime.jpg',
       '/ifDs1gBunsaN3lAG4Imc94JcfVD.jpg',
       '/9EO55cThzp98PaM9iVRoZPXjIZ8.jpg'], dtype=object)

In [25]:
# Distinct values of the type column.
df["type"].unique()

array(['Scripted', 'Miniseries', 'Documentary', 'Reality', 'Talk Show',
       'News', 'Video'], dtype=object)

In [27]:
# Distinct values of the status column.
df["status"].unique()

array(['Ended', 'Returning Series', 'Canceled', 'Pilot', 'In Production',
       'Planned'], dtype=object)

In [26]:
# Distinct values of the tagline column.
df["tagline"].unique()

array(['Winter Is Coming', 'The perfect robbery.',
       'Every ending has a beginning.', ...,
       'Isolated island in the Finnish Archipelago, surrounded by danger and mystery',
       'Famous Sweet Dish of Odisa',
       'Having the world at his feet was not enough.'], dtype=object)

In [36]:
# Distinct values of the created_by column.
df["created_by"].unique()

array(['David Benioff, D.B. Weiss', 'Álex Pina',
       'Matt Duffer, Ross Duffer', ..., 'Tony Cheuk',
       'Weerachit Thongjila', 'TJ Tommys'], dtype=object)

In [39]:
# Distinct values of the languages column.
df["languages"].unique()

array(['en', 'es', 'en, ko, ur', ..., 'bs, sh', 'zh, ms, en', 'az, fa'],
      dtype=object)

In [37]:
# Distinct values of the spoken_languages column.
df["spoken_languages"].unique()

array(['English', 'Español', 'English, 한국어/조선말, اردو',
       'English, Deutsch, Español', '日本語', 'Deutsch', 'English, Español',
       'Français', 'English, Français, Pусский, Español',
       'English, 한국어/조선말', 'Italiano, Español',
       'Deutsch, Français, Lietuvių, Nederlands, English', 'Türkçe',
       '한국어/조선말', 'Español, English', 'English, Deutsch',
       'Français, English', 'Português, Español', 'English, Türkçe',
       'Español, English, 日本語', 'Español, English, ελληνικά, 普通话',
       'English, 한국어/조선말, Português', 'English, Afrikaans, Íslenska',
       'English, Português, ภาษาไทย', 'English, Dansk, Español',
       'Español, Français, English', 'English, No Language',
       'العربية, English, Français, Deutsch, 日本語, Pусский, Español, Türkçe',
       'English, Română', 'Português', 'English, Français', 'Dansk',
       '日本語, English, Deutsch',
       'Deutsch, 广州话 / 廣州話, Dansk, Français, English, Español, svenska, Polski, Norsk, Português',
       'Català', 'English, 日本

In [43]:
# Distinct values of the production_companies column.
df["production_companies"].unique()

array(['Revolution Sun Studios, Television 360, Generator Entertainment, Bighead Littlehead',
       'Vancouver Media',
       '21 Laps Entertainment, Monkey Massacre Productions', ...,
       'Loose Cannons Content Studio, Vial Content, Triplecom Media Production',
       'Feelgood Bangkok, h8 Studio', 'Amazon Studios, STARK Film'],
      dtype=object)

In [44]:
# Distinct values of the production_countries column.
df["production_countries"].unique()

array(['United Kingdom, United States of America', 'Spain',
       'United States of America', ..., 'China, Thailand',
       'Germany, Brazil', 'Estonia, Finland'], dtype=object)

In [22]:
# Count the distinct values in each object-dtype column. The counts indicate
# which columns are realistic candidates for one-hot encoding.
# The column list is named `object_columns` rather than `list` so that the
# built-in `list` is not shadowed for the rest of the kernel session.
object_columns = df.select_dtypes(include=["object"]).columns.tolist()
df[object_columns].nunique()

name                    155586
original_language          106
overview                 91243
backdrop_path            76300
first_air_date           18286
last_air_date            18705
homepage                 49758
original_name           157313
poster_path             106050
type                         7
status                       6
tagline                   5267
genres                    2228
created_by               26081
languages                 1113
networks                  8196
origin_country             792
spoken_languages           946
production_companies     27132
production_countries      1247
dtype: int64


The columns id, backdrop_path, homepage, and poster_path should not be included in our model: they are identifiers or file/URL references rather than descriptive features.

If we see that the name of the show impacts the popularity score, then we should choose between including name or original_name, not both.

vote_count and vote_average provide related information, so if we determine that votes are an important indicator of the popularity score, we may decide to select one of the two.

Similarly, original_language, languages, and spoken_languages provide similar information, so we may consider selecting one of the three.

Columns such as overview and tagline may play little role in our model unless we decide to use an LLM to evaluate them.

In [15]:
# Summary statistics for every column, numeric and non-numeric alike.
df.describe(include="all")

Unnamed: 0,id,name,number_of_seasons,number_of_episodes,original_language,vote_count,vote_average,overview,adult,backdrop_path,...,tagline,genres,created_by,languages,networks,origin_country,spoken_languages,production_companies,production_countries,episode_run_time
count,168639.0,168634,168639.0,168639.0,168639,168639.0,168639.0,93333,168639,77780,...,5330,99713,36496,110050,97589,137609,109280,59342,77511,168639.0
unique,,155586,,,106,,,91243,2,76300,...,5267,2228,26081,1113,8196,792,946,27132,1247,
top,,二十四孝动画全集,,,en,,,Thai Drama 2019.,False,/rYLnnA0GBXATn8kMXCNY0r8mA8Q.jpg,...,drama,Documentary,Shotaro Ishinomori,en,BBC One,US,English,TVB,Japan,
freq,,24,,,76304,,,27,166600,12,...,4,17596,107,38671,2103,28759,39251,1356,9682,
mean,111307.074704,,1.548497,24.465082,,13.305054,2.333843,,,,...,,,,,,,,,,22.603348
std,76451.662352,,2.942872,134.799622,,190.809059,3.454334,,,,...,,,,,,,,,,47.950427
min,1.0,,0.0,0.0,,0.0,0.0,,,,...,,,,,,,,,,0.0
25%,45936.5,,1.0,1.0,,0.0,0.0,,,,...,,,,,,,,,,0.0
50%,97734.0,,1.0,6.0,,0.0,0.0,,,,...,,,,,,,,,,0.0
75%,196923.5,,1.0,20.0,,1.0,6.0,,,,...,,,,,,,,,,42.0


In [20]:
# Count the missing values in each column.
null_count = df.isnull().sum(axis=0)
null_count

Unnamed: 0,0
id,0
name,5
number_of_seasons,0
number_of_episodes,0
original_language,0
vote_count,0
vote_average,0
overview,75306
adult,0
backdrop_path,90859
