# Data Preprocessing

### Handle Missing Values  

In [None]:
import pandas as pd
import os
from IPython.display import display
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [175]:
TMDB_filename = os.path.join(os.getcwd(), "TMDB_tv_dataset_v3.csv")
df = pd.read_csv(TMDB_filename)

In [None]:
# Inspect the structure of the data
print(df.info())

In [None]:
# Check for missing values
print(df.isnull().sum())

In [None]:
df.shape

In [None]:
# Verify the data types of all columns
print(df.dtypes)

In [None]:
# Fill missing values for numerical columns with mean
numerical_cols = df.select_dtypes(include=['float64', 'int64', 'bool']).columns
df[numerical_cols] = df[numerical_cols].fillna(df[numerical_cols].mean())

In [176]:
df

Unnamed: 0,id,name,number_of_seasons,number_of_episodes,original_language,vote_count,vote_average,overview,adult,backdrop_path,...,Mystery,News,Reality,Romance,Sci-Fi & Fantasy,Soap,Talk,Unknown,War & Politics,Western
0,0.005565,Game of Thrones,0.033333,0.003503,en,1.000000,0.8442,Seven noble families fight for control of the ...,0,/2OMB0ynKlyIenMJWI2Dy9IWT4c.jpg,...,0,0,0,0,1,0,0,0,0,0
1,0.284401,Money Heist,0.012500,0.001967,es,0.816031,0.8257,"To carry out the biggest heist in history, a m...",0,/gFZriCkpJYsApPZEF3jhxL4yLzG.jpg,...,0,0,0,0,0,0,0,0,0,0
2,0.265636,Stranger Things,0.016667,0.001632,en,0.739397,0.8624,"When a young boy vanishes, a small town uncove...",0,/2MaumbgBlW1NoPo3ZJO38A6v7OS.jpg,...,1,0,0,0,1,0,0,0,0,0
3,0.005577,The Walking Dead,0.045833,0.008494,en,0.706044,0.8121,Sheriff's deputy Rick Grimes awakens from a co...,0,/x4salpjB11umlUOltfNvSSrjSXm.jpg,...,0,0,0,0,1,0,0,0,0,0
4,0.251473,Lucifer,0.025000,0.004463,en,0.634579,0.8486,"Bored and unhappy as the Lord of Hell, Lucifer...",0,/aDBRtunw49UF4XmqfyNuD9nlYIu.jpg,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
167054,0.951778,母乳酱想要喷出来,0.004167,0.000048,zh,0.000000,0.0000,Unknown,0,Unknown,...,0,0,0,0,0,0,0,1,0,0
167055,0.960161,Barbie Dream Squad,0.004167,0.000048,es,0.000000,0.0000,Unknown,0,Unknown,...,0,0,0,0,0,0,0,1,0,0
167056,0.958135,Picasso,0.004167,0.000048,bn,0.000000,0.0000,"Murder, art and a journalist's relentless ques...",0,/fseBBbvHtdcZS5M7bSjOxYsVe5.jpg,...,0,0,0,0,0,0,0,0,0,0
167057,0.958139,女子大生危険なアルバイト,0.004167,0.000048,en,0.000000,0.0000,A college student gets into trouble when she m...,0,Unknown,...,0,0,0,0,0,0,0,1,0,0


In [None]:
# convert the boolean values to integers (True -> 1, False -> 0)
df['adult'] = df['adult'].astype(int)

In [177]:
df.columns

Index(['id', 'name', 'number_of_seasons', 'number_of_episodes',
       'original_language', 'vote_count', 'vote_average', 'overview', 'adult',
       'backdrop_path', 'first_air_date', 'last_air_date', 'homepage',
       'in_production', 'original_name', 'popularity', 'poster_path', 'type',
       'status', 'tagline', 'genres', 'created_by', 'languages', 'networks',
       'origin_country', 'spoken_languages', 'production_companies',
       'production_countries', 'episode_run_time', 'Action & Adventure',
       'Animation', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Family',
       'History', 'Kids', 'Music', 'Musical', 'Mystery', 'News', 'Reality',
       'Romance', 'Sci-Fi & Fantasy', 'Soap', 'Talk', 'Unknown',
       'War & Politics', 'Western'],
      dtype='object')

In [178]:
'''
the changes I made here are that I filter out the 'object' data types and then check to see if it is not the
'genres' column. if it is not, then I add it to the 'columns_to_fill" list. this is because I am labeling
shows without a genre as 'Unknown' in the one-hot encoding step. then after that, I apply the mode of the
specific column to the missing value for now.
'''

categorical_cols = df.select_dtypes(include=['object']).columns
columns_to_fill = [col for col in categorical_cols if col != 'genres']
df[columns_to_fill] = df[columns_to_fill].apply(lambda col: col.fillna('Unknown'))

In [179]:
# Verify that there are no missing values
print(df.isnull().sum())

id                      0
name                    0
number_of_seasons       0
number_of_episodes      0
original_language       0
vote_count              0
vote_average            0
overview                0
adult                   0
backdrop_path           0
first_air_date          0
last_air_date           0
homepage                0
in_production           0
original_name           0
popularity              0
poster_path             0
type                    0
status                  0
tagline                 0
genres                  0
created_by              0
languages               0
networks                0
origin_country          0
spoken_languages        0
production_companies    0
production_countries    0
episode_run_time        0
Action & Adventure      0
Animation               0
Comedy                  0
Crime                   0
Documentary             0
Drama                   0
Family                  0
History                 0
Kids                    0
Music       

### Remove Duplicates  

1,580 rows of data were removed due to it being a duplicate row.

In [180]:
# Check how many duplicates are there
num_duplicate_rows = df.duplicated().sum()
print(f"Number of duplicate rows: {num_duplicate_rows}")

Number of duplicate rows: 0


In [181]:
# Remove duplicates
df = df.drop_duplicates()

In [182]:
num_duplicates_after = df.duplicated().sum()
print(f"Number of duplicates after removing: {num_duplicates_after}")

Number of duplicates after removing: 0


### One-Hot Encoding of Categorical Features

* Genre One-Hot Encoding

In [None]:
'''
my plan here is to just split each genre as a token by using the comma as a delimiter, then find all
of the unique genres, and then one-hot encode them so I can drop the original genres column
'''

df['genres'] = df['genres'].fillna('Unknown') # for genres that are empty just call them Unknown

df['genres'] = df['genres'].apply(lambda x: x.split(', '))

unique_genres = sorted(set(genre for genres in df['genres'] for genre in genres))

for genre in unique_genres:
  df[genre] = df['genres'].apply(lambda genres: int(genre in genres))

# Do NOT drop genres yet, we will still keep it for the later steps
# df = df.drop('genres', axis=1)

In [None]:
df

In [None]:
# now here I can save the modifications to the csv file
df.to_csv("TMDB_tv_dataset_v3.csv", index=False)

In [None]:
print(df['genres'].isnull().sum())

* Rest of the One-Hot Encoding

In [183]:
# Creating a list of all columns with object values and inspecting their unique values
object_columns = df.select_dtypes(include=['object']).columns.tolist()
object_columns = [col for col in object_columns if col != 'genres']  # Exclude 'genres'

# Check each column individually for unique values
for col in object_columns:
  try:
    unique_count = df[col].nunique()
    print(f"Column '{col}' unique count: {unique_count}")
  except TypeError:
    print(f"Column '{col}' contains unhashable types.")

Column 'name' unique count: 155586
Column 'original_language' unique count: 106
Column 'overview' unique count: 91244
Column 'backdrop_path' unique count: 76301
Column 'first_air_date' unique count: 18287
Column 'last_air_date' unique count: 18706
Column 'homepage' unique count: 49759
Column 'original_name' unique count: 157313
Column 'poster_path' unique count: 106051
Column 'type' unique count: 7
Column 'status' unique count: 6
Column 'tagline' unique count: 5268
Column 'created_by' unique count: 26082
Column 'languages' unique count: 1114
Column 'networks' unique count: 8197
Column 'origin_country' unique count: 793
Column 'spoken_languages' unique count: 947
Column 'production_companies' unique count: 27133
Column 'production_countries' unique count: 1248


In [184]:
# Columns not included are name, overview, backdrop_path, homepage, original_name, poster_path, tagline, languages, spoken_languages,production_countries, and cleaned_overview
encode_cols =['original_language','type', 'status','created_by','networks','origin_country','production_companies']

In [185]:
# Creating a list of the top 20 values in the original_language column
top_10_original_language = df['original_language'].value_counts(ascending = False).head(10).index.tolist()
top_10_original_language

['en', 'zh', 'ja', 'ko', 'de', 'fr', 'es', 'pt', 'ru', 'nl']

In [186]:
# Creating one-hot encoded columns for original_language
for i in top_10_original_language:
    name = 'original-language_' + i
    df[name] = np.where(df['original_language'] == i, 1, 0)

In [187]:
# Removing the original_language column from df and encode_cols
df.drop(columns = ['original_language'], inplace = True)
encode_cols.remove('original_language')
df.columns

Index(['id', 'name', 'number_of_seasons', 'number_of_episodes', 'vote_count',
       'vote_average', 'overview', 'adult', 'backdrop_path', 'first_air_date',
       'last_air_date', 'homepage', 'in_production', 'original_name',
       'popularity', 'poster_path', 'type', 'status', 'tagline', 'genres',
       'created_by', 'languages', 'networks', 'origin_country',
       'spoken_languages', 'production_companies', 'production_countries',
       'episode_run_time', 'Action & Adventure', 'Animation', 'Comedy',
       'Crime', 'Documentary', 'Drama', 'Family', 'History', 'Kids', 'Music',
       'Musical', 'Mystery', 'News', 'Reality', 'Romance', 'Sci-Fi & Fantasy',
       'Soap', 'Talk', 'Unknown', 'War & Politics', 'Western',
       'original-language_en', 'original-language_zh', 'original-language_ja',
       'original-language_ko', 'original-language_de', 'original-language_fr',
       'original-language_es', 'original-language_pt', 'original-language_ru',
       'original-language_nl']

In [188]:
df

Unnamed: 0,id,name,number_of_seasons,number_of_episodes,vote_count,vote_average,overview,adult,backdrop_path,first_air_date,...,original-language_en,original-language_zh,original-language_ja,original-language_ko,original-language_de,original-language_fr,original-language_es,original-language_pt,original-language_ru,original-language_nl
0,0.005565,Game of Thrones,0.033333,0.003503,1.000000,0.8442,Seven noble families fight for control of the ...,0,/2OMB0ynKlyIenMJWI2Dy9IWT4c.jpg,2011-04-17,...,1,0,0,0,0,0,0,0,0,0
1,0.284401,Money Heist,0.012500,0.001967,0.816031,0.8257,"To carry out the biggest heist in history, a m...",0,/gFZriCkpJYsApPZEF3jhxL4yLzG.jpg,2017-05-02,...,0,0,0,0,0,0,1,0,0,0
2,0.265636,Stranger Things,0.016667,0.001632,0.739397,0.8624,"When a young boy vanishes, a small town uncove...",0,/2MaumbgBlW1NoPo3ZJO38A6v7OS.jpg,2016-07-15,...,1,0,0,0,0,0,0,0,0,0
3,0.005577,The Walking Dead,0.045833,0.008494,0.706044,0.8121,Sheriff's deputy Rick Grimes awakens from a co...,0,/x4salpjB11umlUOltfNvSSrjSXm.jpg,2010-10-31,...,1,0,0,0,0,0,0,0,0,0
4,0.251473,Lucifer,0.025000,0.004463,0.634579,0.8486,"Bored and unhappy as the Lord of Hell, Lucifer...",0,/aDBRtunw49UF4XmqfyNuD9nlYIu.jpg,2016-01-25,...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
167054,0.951778,母乳酱想要喷出来,0.004167,0.000048,0.000000,0.0000,Unknown,0,Unknown,2023-03-03,...,0,1,0,0,0,0,0,0,0,0
167055,0.960161,Barbie Dream Squad,0.004167,0.000048,0.000000,0.0000,Unknown,0,Unknown,Unknown,...,0,0,0,0,0,0,1,0,0,0
167056,0.958135,Picasso,0.004167,0.000048,0.000000,0.0000,"Murder, art and a journalist's relentless ques...",0,/fseBBbvHtdcZS5M7bSjOxYsVe5.jpg,2023-11-30,...,0,0,0,0,0,0,0,0,0,0
167057,0.958139,女子大生危険なアルバイト,0.004167,0.000048,0.000000,0.0000,A college student gets into trouble when she m...,0,Unknown,1990-02-10,...,1,0,0,0,0,0,0,0,0,0


In [189]:
filtered_df = df[df['created_by'] != 'Unknown']

# Creating a list of the top 10 values in the created_by column
top_10_created_by = filtered_df['created_by'].value_counts(ascending=False).head(10).index.tolist()
top_10_created_by


['Shotaro Ishinomori',
 'John de Mol',
 'Adrián Suar',
 'Simon Fuller',
 'Ekta Kapoor',
 'Na Young-seok',
 'Yang Li-Hua',
 'Joseph Barbera, William Hanna',
 'R.J. Nuevas',
 'Mark Burnett']

In [190]:
# Creating one-hot encoded columns for created_by
for i in top_10_created_by:
    name = 'created-by_' + i
    df[name] = np.where(df['created_by'] == i, 1, 0)

In [191]:
# Removing the created_by column from df and encode_cols
df.drop(columns = ['created_by'], inplace = True)
encode_cols.remove('created_by')

df.columns

Index(['id', 'name', 'number_of_seasons', 'number_of_episodes', 'vote_count',
       'vote_average', 'overview', 'adult', 'backdrop_path', 'first_air_date',
       'last_air_date', 'homepage', 'in_production', 'original_name',
       'popularity', 'poster_path', 'type', 'status', 'tagline', 'genres',
       'languages', 'networks', 'origin_country', 'spoken_languages',
       'production_companies', 'production_countries', 'episode_run_time',
       'Action & Adventure', 'Animation', 'Comedy', 'Crime', 'Documentary',
       'Drama', 'Family', 'History', 'Kids', 'Music', 'Musical', 'Mystery',
       'News', 'Reality', 'Romance', 'Sci-Fi & Fantasy', 'Soap', 'Talk',
       'Unknown', 'War & Politics', 'Western', 'original-language_en',
       'original-language_zh', 'original-language_ja', 'original-language_ko',
       'original-language_de', 'original-language_fr', 'original-language_es',
       'original-language_pt', 'original-language_ru', 'original-language_nl',
       'created-by_S

In [192]:
df

Unnamed: 0,id,name,number_of_seasons,number_of_episodes,vote_count,vote_average,overview,adult,backdrop_path,first_air_date,...,created-by_Shotaro Ishinomori,created-by_John de Mol,created-by_Adrián Suar,created-by_Simon Fuller,created-by_Ekta Kapoor,created-by_Na Young-seok,created-by_Yang Li-Hua,"created-by_Joseph Barbera, William Hanna",created-by_R.J. Nuevas,created-by_Mark Burnett
0,0.005565,Game of Thrones,0.033333,0.003503,1.000000,0.8442,Seven noble families fight for control of the ...,0,/2OMB0ynKlyIenMJWI2Dy9IWT4c.jpg,2011-04-17,...,0,0,0,0,0,0,0,0,0,0
1,0.284401,Money Heist,0.012500,0.001967,0.816031,0.8257,"To carry out the biggest heist in history, a m...",0,/gFZriCkpJYsApPZEF3jhxL4yLzG.jpg,2017-05-02,...,0,0,0,0,0,0,0,0,0,0
2,0.265636,Stranger Things,0.016667,0.001632,0.739397,0.8624,"When a young boy vanishes, a small town uncove...",0,/2MaumbgBlW1NoPo3ZJO38A6v7OS.jpg,2016-07-15,...,0,0,0,0,0,0,0,0,0,0
3,0.005577,The Walking Dead,0.045833,0.008494,0.706044,0.8121,Sheriff's deputy Rick Grimes awakens from a co...,0,/x4salpjB11umlUOltfNvSSrjSXm.jpg,2010-10-31,...,0,0,0,0,0,0,0,0,0,0
4,0.251473,Lucifer,0.025000,0.004463,0.634579,0.8486,"Bored and unhappy as the Lord of Hell, Lucifer...",0,/aDBRtunw49UF4XmqfyNuD9nlYIu.jpg,2016-01-25,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
167054,0.951778,母乳酱想要喷出来,0.004167,0.000048,0.000000,0.0000,Unknown,0,Unknown,2023-03-03,...,0,0,0,0,0,0,0,0,0,0
167055,0.960161,Barbie Dream Squad,0.004167,0.000048,0.000000,0.0000,Unknown,0,Unknown,Unknown,...,0,0,0,0,0,0,0,0,0,0
167056,0.958135,Picasso,0.004167,0.000048,0.000000,0.0000,"Murder, art and a journalist's relentless ques...",0,/fseBBbvHtdcZS5M7bSjOxYsVe5.jpg,2023-11-30,...,0,0,0,0,0,0,0,0,0,0
167057,0.958139,女子大生危険なアルバイト,0.004167,0.000048,0.000000,0.0000,A college student gets into trouble when she m...,0,Unknown,1990-02-10,...,0,0,0,0,0,0,0,0,0,0


In [193]:
# Creating a list of the top 11 values in the networks column
filtered_df = df[df['networks'] != 'Unknown']

top_10_networks = filtered_df['networks'].value_counts(ascending = False).head(10).index.tolist()
top_10_networks

['BBC One',
 'YouTube',
 'Netflix',
 'ITV1',
 'BBC Two',
 'ABC',
 'NBC',
 'TVB Jade',
 'CBS',
 'Channel 4']

In [194]:
# Creating one-hot encoded columns for networks

for i in top_10_networks:
    name = 'networks_' + i
    df[name] = np.where(df['networks'] == i, 1, 0)

In [195]:
# Removing the networks column from df and encode_cols
df.drop(columns = ['networks'], inplace = True)
encode_cols.remove('networks')

df.columns

Index(['id', 'name', 'number_of_seasons', 'number_of_episodes', 'vote_count',
       'vote_average', 'overview', 'adult', 'backdrop_path', 'first_air_date',
       'last_air_date', 'homepage', 'in_production', 'original_name',
       'popularity', 'poster_path', 'type', 'status', 'tagline', 'genres',
       'languages', 'origin_country', 'spoken_languages',
       'production_companies', 'production_countries', 'episode_run_time',
       'Action & Adventure', 'Animation', 'Comedy', 'Crime', 'Documentary',
       'Drama', 'Family', 'History', 'Kids', 'Music', 'Musical', 'Mystery',
       'News', 'Reality', 'Romance', 'Sci-Fi & Fantasy', 'Soap', 'Talk',
       'Unknown', 'War & Politics', 'Western', 'original-language_en',
       'original-language_zh', 'original-language_ja', 'original-language_ko',
       'original-language_de', 'original-language_fr', 'original-language_es',
       'original-language_pt', 'original-language_ru', 'original-language_nl',
       'created-by_Shotaro Ishin

In [196]:
df

Unnamed: 0,id,name,number_of_seasons,number_of_episodes,vote_count,vote_average,overview,adult,backdrop_path,first_air_date,...,networks_BBC One,networks_YouTube,networks_Netflix,networks_ITV1,networks_BBC Two,networks_ABC,networks_NBC,networks_TVB Jade,networks_CBS,networks_Channel 4
0,0.005565,Game of Thrones,0.033333,0.003503,1.000000,0.8442,Seven noble families fight for control of the ...,0,/2OMB0ynKlyIenMJWI2Dy9IWT4c.jpg,2011-04-17,...,0,0,0,0,0,0,0,0,0,0
1,0.284401,Money Heist,0.012500,0.001967,0.816031,0.8257,"To carry out the biggest heist in history, a m...",0,/gFZriCkpJYsApPZEF3jhxL4yLzG.jpg,2017-05-02,...,0,0,0,0,0,0,0,0,0,0
2,0.265636,Stranger Things,0.016667,0.001632,0.739397,0.8624,"When a young boy vanishes, a small town uncove...",0,/2MaumbgBlW1NoPo3ZJO38A6v7OS.jpg,2016-07-15,...,0,0,1,0,0,0,0,0,0,0
3,0.005577,The Walking Dead,0.045833,0.008494,0.706044,0.8121,Sheriff's deputy Rick Grimes awakens from a co...,0,/x4salpjB11umlUOltfNvSSrjSXm.jpg,2010-10-31,...,0,0,0,0,0,0,0,0,0,0
4,0.251473,Lucifer,0.025000,0.004463,0.634579,0.8486,"Bored and unhappy as the Lord of Hell, Lucifer...",0,/aDBRtunw49UF4XmqfyNuD9nlYIu.jpg,2016-01-25,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
167054,0.951778,母乳酱想要喷出来,0.004167,0.000048,0.000000,0.0000,Unknown,0,Unknown,2023-03-03,...,0,0,0,0,0,0,0,0,0,0
167055,0.960161,Barbie Dream Squad,0.004167,0.000048,0.000000,0.0000,Unknown,0,Unknown,Unknown,...,0,0,0,0,0,0,0,0,0,0
167056,0.958135,Picasso,0.004167,0.000048,0.000000,0.0000,"Murder, art and a journalist's relentless ques...",0,/fseBBbvHtdcZS5M7bSjOxYsVe5.jpg,2023-11-30,...,0,0,0,0,0,0,0,0,0,0
167057,0.958139,女子大生危険なアルバイト,0.004167,0.000048,0.000000,0.0000,A college student gets into trouble when she m...,0,Unknown,1990-02-10,...,0,0,0,0,0,0,0,0,0,0


In [197]:
# Creating a list of the top 10 values in the origin_country column
filtered_df = df[df['origin_country'] != 'Unknown']

top_10_origin_country = filtered_df['origin_country'].value_counts(ascending = False).head(10).index.tolist()
top_10_origin_country

['US', 'JP', 'GB', 'CN', 'DE', 'KR', 'CA', 'FR', 'AU', 'BR']

In [None]:
df['origin_country'] = df['origin_country'].fillna('Unknown')

one_hot_encoded = pd.get_dummies(df['origin_country'], prefix='origin-country')

one_hot_encoded = one_hot_encoded[[f'origin-country_{i}' for i in top_10_origin_country if f'origin-country_{i}' in one_hot_encoded]]
one_hot_encoded['origin-country_OTHER'] = (~df['origin_country'].isin(top_10_origin_country)).astype(int)

df = pd.concat([df, one_hot_encoded], axis=1)

In [199]:
# Removing the origin_country column from df and encode_cols
df.drop(columns = ['origin_country'], inplace = True)
encode_cols.remove('origin_country')

df.columns

Index(['id', 'name', 'number_of_seasons', 'number_of_episodes', 'vote_count',
       'vote_average', 'overview', 'adult', 'backdrop_path', 'first_air_date',
       'last_air_date', 'homepage', 'in_production', 'original_name',
       'popularity', 'poster_path', 'type', 'status', 'tagline', 'genres',
       'languages', 'spoken_languages', 'production_companies',
       'production_countries', 'episode_run_time', 'Action & Adventure',
       'Animation', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Family',
       'History', 'Kids', 'Music', 'Musical', 'Mystery', 'News', 'Reality',
       'Romance', 'Sci-Fi & Fantasy', 'Soap', 'Talk', 'Unknown',
       'War & Politics', 'Western', 'original-language_en',
       'original-language_zh', 'original-language_ja', 'original-language_ko',
       'original-language_de', 'original-language_fr', 'original-language_es',
       'original-language_pt', 'original-language_ru', 'original-language_nl',
       'created-by_Shotaro Ishinomori', 'created-b

In [200]:
df

Unnamed: 0,id,name,number_of_seasons,number_of_episodes,vote_count,vote_average,overview,adult,backdrop_path,first_air_date,...,origin-country_JP,origin-country_GB,origin-country_CN,origin-country_DE,origin-country_KR,origin-country_CA,origin-country_FR,origin-country_AU,origin-country_BR,origin-country_OTHER
0,0.005565,Game of Thrones,0.033333,0.003503,1.000000,0.8442,Seven noble families fight for control of the ...,0,/2OMB0ynKlyIenMJWI2Dy9IWT4c.jpg,2011-04-17,...,False,False,False,False,False,False,False,False,False,0
1,0.284401,Money Heist,0.012500,0.001967,0.816031,0.8257,"To carry out the biggest heist in history, a m...",0,/gFZriCkpJYsApPZEF3jhxL4yLzG.jpg,2017-05-02,...,False,False,False,False,False,False,False,False,False,1
2,0.265636,Stranger Things,0.016667,0.001632,0.739397,0.8624,"When a young boy vanishes, a small town uncove...",0,/2MaumbgBlW1NoPo3ZJO38A6v7OS.jpg,2016-07-15,...,False,False,False,False,False,False,False,False,False,0
3,0.005577,The Walking Dead,0.045833,0.008494,0.706044,0.8121,Sheriff's deputy Rick Grimes awakens from a co...,0,/x4salpjB11umlUOltfNvSSrjSXm.jpg,2010-10-31,...,False,False,False,False,False,False,False,False,False,0
4,0.251473,Lucifer,0.025000,0.004463,0.634579,0.8486,"Bored and unhappy as the Lord of Hell, Lucifer...",0,/aDBRtunw49UF4XmqfyNuD9nlYIu.jpg,2016-01-25,...,False,False,False,False,False,False,False,False,False,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
167054,0.951778,母乳酱想要喷出来,0.004167,0.000048,0.000000,0.0000,Unknown,0,Unknown,2023-03-03,...,False,False,True,False,False,False,False,False,False,0
167055,0.960161,Barbie Dream Squad,0.004167,0.000048,0.000000,0.0000,Unknown,0,Unknown,Unknown,...,False,False,False,False,False,False,False,False,False,0
167056,0.958135,Picasso,0.004167,0.000048,0.000000,0.0000,"Murder, art and a journalist's relentless ques...",0,/fseBBbvHtdcZS5M7bSjOxYsVe5.jpg,2023-11-30,...,False,False,False,False,False,False,False,False,False,1
167057,0.958139,女子大生危険なアルバイト,0.004167,0.000048,0.000000,0.0000,A college student gets into trouble when she m...,0,Unknown,1990-02-10,...,False,False,False,False,False,False,False,False,False,0


In [None]:
# Creating a list of the top 10 values in the production_companies column
filtered_df = df[df['production_companies'] != 'Unknown']

top_10_production_companies = filtered_df['production_companies'].value_counts(ascending = False).head(10).index.tolist()
top_10_production_companies

In [None]:
'''
did the same changes to this code cell as well to fix the warning message
'''

one_hot_encoded_production_companies = pd.DataFrame() # start with an empty dataframe

for i in top_10_origin_country:
    one_hot_encoded_production_companies['production-companies_' + i] = np.where(df['production_companies'] == i, 1, 0)

# the we can concatenate the one-hot encoded columns to the original DataFrame
df = pd.concat([df, one_hot_encoded_production_companies], axis=1)

In [None]:
one_hot_encoded_production_companies.head()

In [None]:
# Removing the production_companies column from df and encode_cols
df.drop(columns = ['production_companies'], inplace = True)
encode_cols.remove('production_companies')

df.columns

In [None]:
# One-hot encoding remaining columns
for colname in encode_cols:
    df_encoded = pd.get_dummies(df[colname], prefix=colname+'')
    df = df.join(df_encoded)

df.columns

In [None]:
# Removing remaining original cols from df
df.drop(columns=encode_cols,axis=1,inplace=True)

df.columns

In [None]:
df.tail(10)

In [None]:
# Converting first_air_date and last_air_date columns to DateTime referring to https://www.youtube.com/watch?v=f7LODKIjtaA

'''
df['first_air_date'] = pd.to_datetime(df['first_air_date'], format = '%Y-%m-%d')
df['last_air_date'] = pd.to_datetime(df['last_air_date'], format = '%Y-%m-%d' )
'''

df['first_air_date'] = pd.to_datetime(df['first_air_date'], errors='coerce')
df['last_air_date'] = pd.to_datetime(df['last_air_date'], errors='coerce')

print(df['first_air_date'].dtypes)
print(df['last_air_date'].dtypes)

In [None]:
print(df['first_air_date'])

In [None]:
print(df['last_air_date'])

In [None]:
# Creating a function to create seasons for each month
def get_season(date):
    month = date.month
    if month in [12, 1, 2]:
        return 'WINTER'
    elif month in [3, 4, 5]:
        return 'SPRING'
    elif month in [6, 7, 8]:
        return 'SUMMER'
    elif month in [9, 10, 11]:
        return 'FALL'
    else:
        return 'UNKNOWN'

In [None]:
# Creating a column with the seasons for first_air_date
df['first_air_date_season'] = df['first_air_date'].apply(get_season)
df['first_air_date_season']

In [None]:
# Creating columns for first_air_date_season as boolean values
df['first_air_date_winter'] = df['first_air_date_season'] == 'WINTER'
df['first_air_date_spring'] = df['first_air_date_season'] == 'SPRING'
df['first_air_date_summer'] = df['first_air_date_season'] == 'SUMMER'
df['first_air_date_fall'] = df['first_air_date_season'] == 'FALL'

In [None]:
# Repeating same process for last_air_date
df['last_air_date_season'] = df['last_air_date'].apply(get_season)
df['last_air_date_season']

In [None]:
df['last_air_date_winter'] = df['last_air_date_season'] == 'WINTER'
df['last_air_date_spring'] = df['last_air_date_season'] == 'SPRING'
df['last_air_date_summer'] = df['last_air_date_season'] == 'SUMMER'
df['last_air_date_fall'] = df['last_air_date_season'] == 'FALL'

In [None]:
# save data changed during one-hot encoding to csv file
df.to_csv('TMDB_tv_dataset_v3.csv', index=False)

In [None]:
# Inspecting columns to ensure process was done correctly
df.tail()