# Data Preprocessing

### Handle Missing Values  

In [1]:
import pandas as pd
import os
from IPython.display import display
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [2]:
TMDB_filename = os.path.join(os.getcwd(), "TMDB_tv_dataset_v3.csv")
df = pd.read_csv(TMDB_filename)

In [3]:
# Inspect the structure of the data
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 168639 entries, 0 to 168638
Data columns (total 30 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    168639 non-null  int64  
 1   name                  168634 non-null  object 
 2   number_of_seasons     168639 non-null  int64  
 3   number_of_episodes    168639 non-null  int64  
 4   original_language     168639 non-null  object 
 5   vote_count            168639 non-null  int64  
 6   vote_average          168639 non-null  float64
 7   overview              93333 non-null   object 
 8   adult                 168639 non-null  bool   
 9   backdrop_path         77780 non-null   object 
 10  first_air_date        136903 non-null  object 
 11  last_air_date         138735 non-null  object 
 12  homepage              50998 non-null   object 
 13  in_production         168639 non-null  bool   
 14  original_name         168634 non-null  object 
 15  

In [4]:
# Check for missing values
print(df.isnull().sum())

id                           0
name                         5
number_of_seasons            0
number_of_episodes           0
original_language            0
vote_count                   0
vote_average                 0
overview                 75306
adult                        0
backdrop_path            90859
first_air_date           31736
last_air_date            29904
homepage                117641
in_production                0
original_name                5
popularity                   0
poster_path              59902
type                         0
status                       0
tagline                 163309
genres                   68926
created_by              132143
languages                58589
networks                 71050
origin_country           31030
spoken_languages         59359
production_companies    109297
production_countries     91128
episode_run_time             0
cleaned_overview         75386
dtype: int64


In [5]:
# Verify the data types of all columns
print(df.dtypes)

id                        int64
name                     object
number_of_seasons         int64
number_of_episodes        int64
original_language        object
vote_count                int64
vote_average            float64
overview                 object
adult                      bool
backdrop_path            object
first_air_date           object
last_air_date            object
homepage                 object
in_production              bool
original_name            object
popularity              float64
poster_path              object
type                     object
status                   object
tagline                  object
genres                   object
created_by               object
languages                object
networks                 object
origin_country           object
spoken_languages         object
production_companies     object
production_countries     object
episode_run_time          int64
cleaned_overview         object
dtype: object


In [6]:
# Fill missing values for numerical columns with mean
numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns
df[numerical_cols] = df[numerical_cols].fillna(df[numerical_cols].mean())

In [7]:
'''
the changes I made here are that I filter out the 'object' data types and then check to see if it is not the
'genres' column. if it is not, then I add it to the 'columns_to_fill" list. this is because I am labeling
shows without a genre as 'Unknown' in the one-hot encoding step. then after that, I apply the mode of the
specific column to the missing value for now.
'''

categorical_cols = df.select_dtypes(include=['object']).columns
columns_to_fill = [col for col in categorical_cols if col != 'genres']
df[columns_to_fill] = df[columns_to_fill].apply(lambda col: col.fillna(col.mode()[0]))

In [8]:
# Verify that there are no missing values
print(df.isnull().sum())

id                          0
name                        0
number_of_seasons           0
number_of_episodes          0
original_language           0
vote_count                  0
vote_average                0
overview                    0
adult                       0
backdrop_path               0
first_air_date              0
last_air_date               0
homepage                    0
in_production               0
original_name               0
popularity                  0
poster_path                 0
type                        0
status                      0
tagline                     0
genres                  68926
created_by                  0
languages                   0
networks                    0
origin_country              0
spoken_languages            0
production_companies        0
production_countries        0
episode_run_time            0
cleaned_overview            0
dtype: int64


### Remove Duplicates  

1,580 rows of data were removed due to it being a duplicate row.

In [9]:
# Check how many duplicates are there
num_duplicate_rows = df.duplicated().sum()
print(f"Number of duplicate rows: {num_duplicate_rows}")

Number of duplicate rows: 1580


In [10]:
# Remove duplicates
df = df.drop_duplicates()

In [11]:
num_duplicates_after = df.duplicated().sum()
print(f"Number of duplicates after removing: {num_duplicates_after}")

Number of duplicates after removing: 0


### One-Hot Encoding of Categorical Features

* Genre One-Hot Encoding

In [12]:
'''
my plan here is to just split each genre as a token by using the comma as a delimiter, then find all
of the unique genres, and then one-hot encode them so I can drop the original genres column
'''

df['genres'] = df['genres'].fillna('Unknown') # for genres that are empty just call them Unknown

df['genres'] = df['genres'].apply(lambda x: x.split(', '))

unique_genres = sorted(set(genre for genres in df['genres'] for genre in genres))

for genre in unique_genres:
  df[genre] = df['genres'].apply(lambda genres: int(genre in genres))

# Do NOT drop genres yet, we will still keep it for the later steps
# df = df.drop('genres', axis=1)

In [13]:
# now here I can save the modifications to the csv file
df.to_csv("TMDB_tv_dataset_v3.csv", index=False)

* Rest of the One-Hot Encoding

In [14]:
# Creating a list of all columns with object values and inspecting their unique values
object_columns = df.select_dtypes(include=['object']).columns.tolist()
object_columns = [col for col in object_columns if col != 'genres']  # Exclude 'genres'

# Check each column individually for unique values
for col in object_columns:
  try:
    unique_count = df[col].nunique()
    print(f"Column '{col}' unique count: {unique_count}")
  except TypeError:
    print(f"Column '{col}' contains unhashable types.")

Column 'name' unique count: 155586
Column 'original_language' unique count: 106
Column 'overview' unique count: 91243
Column 'backdrop_path' unique count: 76300
Column 'first_air_date' unique count: 18286
Column 'last_air_date' unique count: 18705
Column 'homepage' unique count: 49758
Column 'original_name' unique count: 157313
Column 'poster_path' unique count: 106050
Column 'type' unique count: 7
Column 'status' unique count: 6
Column 'tagline' unique count: 5267
Column 'created_by' unique count: 26081
Column 'languages' unique count: 1113
Column 'networks' unique count: 8196
Column 'origin_country' unique count: 792
Column 'spoken_languages' unique count: 946
Column 'production_companies' unique count: 27132
Column 'production_countries' unique count: 1247
Column 'cleaned_overview' unique count: 91025


In [15]:
# Columns not included are name, overview, backdrop_path, homepage, original_name, poster_path, tagline, languages, spoken_languages,production_countries, and cleaned_overview
encode_cols =['original_language','type', 'status','created_by','networks','origin_country','production_companies']

In [16]:
# Creating a list of the top 20 values in the original_language column
top_10_original_language = df['original_language'].value_counts(ascending = False).head(10).index.tolist()
top_10_original_language

['en', 'zh', 'ja', 'ko', 'de', 'fr', 'es', 'pt', 'ru', 'nl']

In [17]:
# Creating one-hot encoded columns for original_language
for i in top_10_original_language:
    name = 'original-language_' + i
    df[name] = np.where(df['original_language'] == i, 1, 0)

In [18]:
# Removing the original_language column from df and encode_cols
df.drop(columns = ['original_language'], inplace = True)
encode_cols.remove('original_language')
df.columns

Index(['id', 'name', 'number_of_seasons', 'number_of_episodes', 'vote_count',
       'vote_average', 'overview', 'adult', 'backdrop_path', 'first_air_date',
       'last_air_date', 'homepage', 'in_production', 'original_name',
       'popularity', 'poster_path', 'type', 'status', 'tagline', 'genres',
       'created_by', 'languages', 'networks', 'origin_country',
       'spoken_languages', 'production_companies', 'production_countries',
       'episode_run_time', 'cleaned_overview', 'Action & Adventure',
       'Animation', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Family',
       'History', 'Kids', 'Music', 'Musical', 'Mystery', 'News', 'Reality',
       'Romance', 'Sci-Fi & Fantasy', 'Soap', 'Talk', 'Unknown',
       'War & Politics', 'Western', 'original-language_en',
       'original-language_zh', 'original-language_ja', 'original-language_ko',
       'original-language_de', 'original-language_fr', 'original-language_es',
       'original-language_pt', 'original-language_ru', 'or

In [19]:
# Creating a list of the top 10 values in the created_by column
top_10_created_by = df['created_by'].value_counts(ascending = False).head(10).index.tolist()
top_10_created_by

['Shotaro Ishinomori',
 'John de Mol',
 'Adrián Suar',
 'Simon Fuller',
 'Ekta Kapoor',
 'Na Young-seok',
 'Yang Li-Hua',
 'Joseph Barbera, William Hanna',
 'R.J. Nuevas',
 'Mark Burnett']

In [20]:
# Creating one-hot encoded columns for created_by
for i in top_10_created_by:
    name = 'created-by_' + i
    df[name] = np.where(df['created_by'] == i, 1, 0)

In [21]:
# Removing the created_by column from df and encode_cols
df.drop(columns = ['created_by'], inplace = True)
encode_cols.remove('created_by')

df.columns

Index(['id', 'name', 'number_of_seasons', 'number_of_episodes', 'vote_count',
       'vote_average', 'overview', 'adult', 'backdrop_path', 'first_air_date',
       'last_air_date', 'homepage', 'in_production', 'original_name',
       'popularity', 'poster_path', 'type', 'status', 'tagline', 'genres',
       'languages', 'networks', 'origin_country', 'spoken_languages',
       'production_companies', 'production_countries', 'episode_run_time',
       'cleaned_overview', 'Action & Adventure', 'Animation', 'Comedy',
       'Crime', 'Documentary', 'Drama', 'Family', 'History', 'Kids', 'Music',
       'Musical', 'Mystery', 'News', 'Reality', 'Romance', 'Sci-Fi & Fantasy',
       'Soap', 'Talk', 'Unknown', 'War & Politics', 'Western',
       'original-language_en', 'original-language_zh', 'original-language_ja',
       'original-language_ko', 'original-language_de', 'original-language_fr',
       'original-language_es', 'original-language_pt', 'original-language_ru',
       'original-languag

In [22]:
# Creating a list of the top 11 values in the networks column
top_10_networks = df['networks'].value_counts(ascending = False).head(10).index.tolist()
top_10_networks

['BBC One',
 'YouTube',
 'Netflix',
 'ITV1',
 'BBC Two',
 'ABC',
 'NBC',
 'TVB Jade',
 'CBS',
 'Channel 4']

In [23]:
# Creating one-hot encoded columns for networks
for i in top_10_networks:
    name = 'networks_' + i
    df[name] = np.where(df['networks'] == i, 1, 0)

In [24]:
# Removing the networks column from df and encode_cols
df.drop(columns = ['networks'], inplace = True)
encode_cols.remove('networks')

df.columns

Index(['id', 'name', 'number_of_seasons', 'number_of_episodes', 'vote_count',
       'vote_average', 'overview', 'adult', 'backdrop_path', 'first_air_date',
       'last_air_date', 'homepage', 'in_production', 'original_name',
       'popularity', 'poster_path', 'type', 'status', 'tagline', 'genres',
       'languages', 'origin_country', 'spoken_languages',
       'production_companies', 'production_countries', 'episode_run_time',
       'cleaned_overview', 'Action & Adventure', 'Animation', 'Comedy',
       'Crime', 'Documentary', 'Drama', 'Family', 'History', 'Kids', 'Music',
       'Musical', 'Mystery', 'News', 'Reality', 'Romance', 'Sci-Fi & Fantasy',
       'Soap', 'Talk', 'Unknown', 'War & Politics', 'Western',
       'original-language_en', 'original-language_zh', 'original-language_ja',
       'original-language_ko', 'original-language_de', 'original-language_fr',
       'original-language_es', 'original-language_pt', 'original-language_ru',
       'original-language_nl', 'crea

In [25]:
# Creating a list of the top 10 values in the origin_country column
top_10_origin_country = df['origin_country'].value_counts(ascending = False).head(10).index.tolist()
top_10_origin_country

['US', 'JP', 'GB', 'CN', 'DE', 'KR', 'CA', 'FR', 'AU', 'BR']

In [26]:
'''
I was getting a warning message when running the code due to performance issues, so I followed
the warnings sudgested approach to fix the issue by using the pd.concat function.
'''

one_hot_encoded_origin_country = pd.DataFrame() # start with an empty dataframe

for i in top_10_origin_country:
    one_hot_encoded_origin_country['origin-country_' + i] = np.where(df['origin_country'] == i, 1, 0)

# the we can concatenate the one-hot encoded columns to the original DataFrame
df = pd.concat([df, one_hot_encoded_origin_country], axis=1)

In [27]:
# Removing the origin_country column from df and encode_cols
df.drop(columns = ['origin_country'], inplace = True)
encode_cols.remove('origin_country')

df.columns

Index(['id', 'name', 'number_of_seasons', 'number_of_episodes', 'vote_count',
       'vote_average', 'overview', 'adult', 'backdrop_path', 'first_air_date',
       'last_air_date', 'homepage', 'in_production', 'original_name',
       'popularity', 'poster_path', 'type', 'status', 'tagline', 'genres',
       'languages', 'spoken_languages', 'production_companies',
       'production_countries', 'episode_run_time', 'cleaned_overview',
       'Action & Adventure', 'Animation', 'Comedy', 'Crime', 'Documentary',
       'Drama', 'Family', 'History', 'Kids', 'Music', 'Musical', 'Mystery',
       'News', 'Reality', 'Romance', 'Sci-Fi & Fantasy', 'Soap', 'Talk',
       'Unknown', 'War & Politics', 'Western', 'original-language_en',
       'original-language_zh', 'original-language_ja', 'original-language_ko',
       'original-language_de', 'original-language_fr', 'original-language_es',
       'original-language_pt', 'original-language_ru', 'original-language_nl',
       'created-by_Shotaro Ish

In [28]:
# Creating a list of the top 10 values in the production_companies column
top_10_production_companies = df['production_companies'].value_counts(ascending = False).head(10).index.tolist()
top_10_production_companies

['TVB',
 'BBC',
 'Estúdios Globo',
 'NHK',
 'DR TV',
 'TV 2',
 'Televisa',
 'GMA Entertainment Group',
 'Česká televize',
 'ATV Enterprises Limited']

In [29]:
'''
did the same changes to this code cell as well to fix the warning message
'''

one_hot_encoded_production_companies = pd.DataFrame() # start with an empty dataframe

for i in top_10_origin_country:
    one_hot_encoded_production_companies['production-companies_' + i] = np.where(df['production_companies'] == i, 1, 0)

# the we can concatenate the one-hot encoded columns to the original DataFrame
df = pd.concat([df, one_hot_encoded_production_companies], axis=1)

In [30]:
# Removing the production_companies column from df and encode_cols
df.drop(columns = ['production_companies'], inplace = True)
encode_cols.remove('production_companies')

df.columns

Index(['id', 'name', 'number_of_seasons', 'number_of_episodes', 'vote_count',
       'vote_average', 'overview', 'adult', 'backdrop_path', 'first_air_date',
       'last_air_date', 'homepage', 'in_production', 'original_name',
       'popularity', 'poster_path', 'type', 'status', 'tagline', 'genres',
       'languages', 'spoken_languages', 'production_countries',
       'episode_run_time', 'cleaned_overview', 'Action & Adventure',
       'Animation', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Family',
       'History', 'Kids', 'Music', 'Musical', 'Mystery', 'News', 'Reality',
       'Romance', 'Sci-Fi & Fantasy', 'Soap', 'Talk', 'Unknown',
       'War & Politics', 'Western', 'original-language_en',
       'original-language_zh', 'original-language_ja', 'original-language_ko',
       'original-language_de', 'original-language_fr', 'original-language_es',
       'original-language_pt', 'original-language_ru', 'original-language_nl',
       'created-by_Shotaro Ishinomori', 'created-by_Jo

In [31]:
# One-hot encoding remaining columns
for colname in encode_cols:
    df_encoded = pd.get_dummies(df[colname], prefix=colname+'')
    df = df.join(df_encoded)

df.columns

Index(['id', 'name', 'number_of_seasons', 'number_of_episodes', 'vote_count',
       'vote_average', 'overview', 'adult', 'backdrop_path', 'first_air_date',
       ...
       'type_Reality', 'type_Scripted', 'type_Talk Show', 'type_Video',
       'status_Canceled', 'status_Ended', 'status_In Production',
       'status_Pilot', 'status_Planned', 'status_Returning Series'],
      dtype='object', length=109)

In [32]:
# Removing remaining original cols from df
df.drop(columns=encode_cols,axis=1,inplace=True)

df.columns

Index(['id', 'name', 'number_of_seasons', 'number_of_episodes', 'vote_count',
       'vote_average', 'overview', 'adult', 'backdrop_path', 'first_air_date',
       ...
       'type_Reality', 'type_Scripted', 'type_Talk Show', 'type_Video',
       'status_Canceled', 'status_Ended', 'status_In Production',
       'status_Pilot', 'status_Planned', 'status_Returning Series'],
      dtype='object', length=107)

In [33]:
df.head(10)

Unnamed: 0,id,name,number_of_seasons,number_of_episodes,vote_count,vote_average,overview,adult,backdrop_path,first_air_date,...,type_Reality,type_Scripted,type_Talk Show,type_Video,status_Canceled,status_Ended,status_In Production,status_Pilot,status_Planned,status_Returning Series
0,1399.0,Game of Thrones,8.0,73.0,21857.0,8.442,Seven noble families fight for control of the ...,False,/2OMB0ynKlyIenMJWI2Dy9IWT4c.jpg,2011-04-17,...,False,True,False,False,False,True,False,False,False,False
1,71446.0,Money Heist,3.0,41.0,17836.0,8.257,"To carry out the biggest heist in history, a m...",False,/gFZriCkpJYsApPZEF3jhxL4yLzG.jpg,2017-05-02,...,False,True,False,False,False,True,False,False,False,False
2,66732.0,Stranger Things,4.0,34.0,16161.0,8.624,"When a young boy vanishes, a small town uncove...",False,/2MaumbgBlW1NoPo3ZJO38A6v7OS.jpg,2016-07-15,...,False,True,False,False,False,False,False,False,False,True
3,1402.0,The Walking Dead,11.0,177.0,15432.0,8.121,Sheriff's deputy Rick Grimes awakens from a co...,False,/x4salpjB11umlUOltfNvSSrjSXm.jpg,2010-10-31,...,False,True,False,False,False,True,False,False,False,False
4,63174.0,Lucifer,6.0,93.0,13870.0,8.486,"Bored and unhappy as the Lord of Hell, Lucifer...",False,/aDBRtunw49UF4XmqfyNuD9nlYIu.jpg,2016-01-25,...,False,True,False,False,False,True,False,False,False,False
5,69050.0,Riverdale,7.0,137.0,13180.0,8.479,"Set in the present, the series offers a bold, ...",False,/soQgquPkLmUu9eKLJJzuA4KZDyi.jpg,2017-01-26,...,False,True,False,False,False,True,False,False,False,False
6,93405.0,Squid Game,2.0,9.0,13053.0,7.831,Hundreds of cash-strapped players accept a str...,False,/2meX1nMdScFOoV4370rqHWKmXhY.jpg,2021-09-17,...,False,True,False,False,False,False,False,False,False,True
7,1396.0,Breaking Bad,5.0,62.0,12398.0,8.89,"When Walter White, a New Mexico chemistry teac...",False,/tsRy63Mu5cu8etL1X7ZLyf7UP1M.jpg,2008-01-20,...,False,True,False,False,False,True,False,False,False,False
8,71712.0,The Good Doctor,6.0,116.0,11768.0,8.503,"Shaun Murphy, a young surgeon with autism and ...",False,/xXRsKNJHTOGrs5wfYAxkbM2RiyT.jpg,2017-09-25,...,False,True,False,False,False,False,False,False,False,True
9,85271.0,WandaVision,1.0,9.0,11308.0,8.3,Wanda Maximoff and Vision—two super-powered be...,False,/lOr9NKxh4vMweufMOUDJjJhCRHW.jpg,2021-01-15,...,False,False,False,False,False,True,False,False,False,False


In [34]:
# Converting first_air_date and last_air_date columns to DateTime referring to https://www.youtube.com/watch?v=f7LODKIjtaA
df['first_air_date'] = pd.to_datetime(df['first_air_date'], format = '%Y-%m-%d')
df['last_air_date'] = pd.to_datetime(df['last_air_date'], format = '%Y-%m-%d' )
print(df['first_air_date'].dtypes)
print(df['last_air_date'].dtypes)

datetime64[ns]
datetime64[ns]


In [35]:
print(df['first_air_date'])

0        2011-04-17
1        2017-05-02
2        2016-07-15
3        2010-10-31
4        2016-01-25
            ...    
168412          NaT
168416          NaT
168418          NaT
168419          NaT
168420          NaT
Name: first_air_date, Length: 168593, dtype: datetime64[ns]


In [36]:
print(df['last_air_date'])

0        2019-05-19
1        2021-12-03
2        2022-07-01
3        2022-11-20
4        2021-09-10
            ...    
168412          NaT
168416          NaT
168418          NaT
168419          NaT
168420          NaT
Name: last_air_date, Length: 168593, dtype: datetime64[ns]


In [37]:
# Creating a function to create seasons for each month
def get_season(date):
    month = date.month
    if month in [12, 1, 2]:
        return 'WINTER'
    elif month in [3, 4, 5]:
        return 'SPRING'
    elif month in [6, 7, 8]:
        return 'SUMMER'
    elif month in [9, 10, 11]:
        return 'FALL'
    else:
        return 'UNKNOWN'

In [38]:
# Creating a column with the seasons for first_air_date
df['first_air_date_season'] = df['first_air_date'].apply(get_season)
df['first_air_date_season']

0          SPRING
1          SPRING
2          SUMMER
3            FALL
4          WINTER
           ...   
168412    UNKNOWN
168416    UNKNOWN
168418    UNKNOWN
168419    UNKNOWN
168420    UNKNOWN
Name: first_air_date_season, Length: 168593, dtype: object

In [39]:
# Creating columns for first_air_date_season as boolean values
df['first_air_date_winter'] = df['first_air_date_season'] == 'WINTER'
df['first_air_date_spring'] = df['first_air_date_season'] == 'SPRING'
df['first_air_date_summer'] = df['first_air_date_season'] == 'SUMMER'
df['first_air_date_fall'] = df['first_air_date_season'] == 'FALL'

In [40]:
# Repeating same process for last_air_date
df['last_air_date_season'] = df['last_air_date'].apply(get_season)
df['last_air_date_season']

0          SPRING
1          WINTER
2          SUMMER
3            FALL
4            FALL
           ...   
168412    UNKNOWN
168416    UNKNOWN
168418    UNKNOWN
168419    UNKNOWN
168420    UNKNOWN
Name: last_air_date_season, Length: 168593, dtype: object

In [41]:
df['last_air_date_winter'] = df['last_air_date_season'] == 'WINTER'
df['last_air_date_spring'] = df['last_air_date_season'] == 'SPRING'
df['last_air_date_summer'] = df['last_air_date_season'] == 'SUMMER'
df['last_air_date_fall'] = df['last_air_date_season'] == 'FALL'

In [42]:
# save data changed during one-hot encoding to csv file
df.to_csv('TMDB_tv_dataset_v3.csv', index=False)

In [43]:
# Inspecting columns to ensure process was done correctly
df.head(5)

Unnamed: 0,id,name,number_of_seasons,number_of_episodes,vote_count,vote_average,overview,adult,backdrop_path,first_air_date,...,first_air_date_season,first_air_date_winter,first_air_date_spring,first_air_date_summer,first_air_date_fall,last_air_date_season,last_air_date_winter,last_air_date_spring,last_air_date_summer,last_air_date_fall
0,1399.0,Game of Thrones,8.0,73.0,21857.0,8.442,Seven noble families fight for control of the ...,False,/2OMB0ynKlyIenMJWI2Dy9IWT4c.jpg,2011-04-17,...,SPRING,False,True,False,False,SPRING,False,True,False,False
1,71446.0,Money Heist,3.0,41.0,17836.0,8.257,"To carry out the biggest heist in history, a m...",False,/gFZriCkpJYsApPZEF3jhxL4yLzG.jpg,2017-05-02,...,SPRING,False,True,False,False,WINTER,True,False,False,False
2,66732.0,Stranger Things,4.0,34.0,16161.0,8.624,"When a young boy vanishes, a small town uncove...",False,/2MaumbgBlW1NoPo3ZJO38A6v7OS.jpg,2016-07-15,...,SUMMER,False,False,True,False,SUMMER,False,False,True,False
3,1402.0,The Walking Dead,11.0,177.0,15432.0,8.121,Sheriff's deputy Rick Grimes awakens from a co...,False,/x4salpjB11umlUOltfNvSSrjSXm.jpg,2010-10-31,...,FALL,False,False,False,True,FALL,False,False,False,True
4,63174.0,Lucifer,6.0,93.0,13870.0,8.486,"Bored and unhappy as the Lord of Hell, Lucifer...",False,/aDBRtunw49UF4XmqfyNuD9nlYIu.jpg,2016-01-25,...,WINTER,True,False,False,False,FALL,False,False,False,True


In [44]:
df.to_csv("TMDB_tv_dataset_v3.csv", index=False)