In [None]:
""" Leitura dos dados de Filmes

O arquivo contem os seguintes dados:

    - show_id:  identificador único para filme ou seriado
    - type: Identifica se é um filme ou seriado
    - title: titulo do filme ou seriado
    - director: nome do diretor do filme ou seriado
    - cast: (elenco) nome dos atores envolvidos
    - country:  nome do pais onde o filme ou seriado foi produzido ou apresentado
    - date_added: data em que o filme ou seriado foi adicionado a Netflix
    - release_year: ano de llançamento do filme ou seriado
    - rating: orientação parental
    - duration: duração em minutos ou número de temporadas
    - listed_in: categorias nas quais o filme ou seriado foi listado
    - description: a descrição do summary"""

In [2]:
# importing pandas and creating the dataset
import pandas as pd

filename = 'dataset/netflix_titles.csv'
df = pd.read_csv(filename, parse_dates=['date_added'])
df.head()



Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [3]:
# Verificação da qualidade dos dados
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB


In [4]:
# Checking for null data
df.isna().sum()


show_id            0
type               0
title              0
director        2634
cast             825
country          831
date_added        10
release_year       0
rating             4
duration           3
listed_in          0
description        0
dtype: int64

In [5]:
# Checking for duplicated data
df[df.duplicated()]

# if we had duplicates we should use:
# df.drop_duplicates

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description


In [6]:
# lets deal with the 2634 missing directors
df['director'].value_counts()

# the director with most titles has 19, therefor we have at least 400 different directors 

director
Rajiv Chilaka                     19
Raúl Campos, Jan Suter            18
Marcus Raboy                      16
Suhas Kadav                       16
Jay Karas                         14
                                  ..
Raymie Muzquiz, Stu Livingston     1
Joe Menendez                       1
Eric Bross                         1
Will Eisenberg                     1
Mozez Singh                        1
Name: count, Length: 4528, dtype: int64

In [7]:
# I chose to use scikit-learn to prepare the database and XGBoost to predict the director
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
import joblib

# the idea is to have a maximum number of directors and a category "others"
TOP_N_DIRECTORS = 100  # Number of top directors to predict specifically
MIN_SAMPLES = 5  # Minimum films a director must have to be included in TOP_N


# Clean director data
df.fillna({"director": 'UNKNOWN'}, inplace=True)

In [None]:
# Create target variable - keep top N directors, group others as 'OTHER'
"""
Why This Approach Works Well:
Simplifies Prediction: By focusing on top directors, we avoid trying to predict rare directors with insufficient data

Balances Classes: The 'OTHER' category prevents the model from being overwhelmed by thousands of rare directors

Rich Features: The selected features contain strong signals about likely directors

Proper Evaluation: The train-test split ensures we can measure real-world performance
"""
director_counts = df['director'].value_counts()
top_directors = director_counts[director_counts >= MIN_SAMPLES].index[:TOP_N_DIRECTORS]
df['director_target'] = df['director'].apply(lambda x: x if x in top_directors else 'OTHER')


# Create features
features = df[['title', 'type', 'cast', 'country', 'release_year', 'rating', 'duration', 'listed_in']]
target = df['director_target']

# Split data
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

In [17]:
# Text preprocessing for title
import pyarrow
from sklearn.preprocessing import StandardScaler
title_transformer = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=200, stop_words='english'))
])

# Categorical features preprocessing
categorical_transformer = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# Numerical features preprocessing
numerical_transformer = Pipeline([
    ('scaler', StandardScaler())
])

# Column transformer
preprocessor = ColumnTransformer([
    ('title', title_transformer, 'title'),
    ('text_features', TfidfVectorizer(max_features=100), 'listed_in'),
    ('cat', categorical_transformer, ['type', 'rating']),
    ('num', numerical_transformer, ['release_year'])
], remainder='drop')

df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,director_target,year_added
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm...",OTHER,2021.0
1,s2,TV Show,Blood & Water,UNKNOWN,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t...",UNKNOWN,2021.0
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...,OTHER,2021.0
3,s4,TV Show,Jailbirds New Orleans,UNKNOWN,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo...",UNKNOWN,2021.0
4,s5,TV Show,Kota Factory,UNKNOWN,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...,UNKNOWN,2021.0


In [18]:
# Full pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(
        objective='multi:softprob',
        num_class=TOP_N_DIRECTORS + 1,  # +1 for 'OTHER' category
        n_estimators=200,
        max_depth=6,
        learning_rate=0.1,
        early_stopping_rounds=10,
        eval_metric='mlogloss'
    ))
])
# First transform X_test using the unfitted preprocessor
# We need to clone and fit it first for evaluation set
from sklearn.base import clone

# Create a separate fitted preprocessor for eval_set
preprocessor_for_eval = clone(preprocessor).fit(X_train)
X_test_preprocessed = preprocessor_for_eval.transform(X_test)

# Now train with proper eval_set
pipeline.fit(
    X_train, 
    y_train,
    classifier__eval_set=[(X_test_preprocessed, y_test)]
)

# Why This Setup Works Well
# End-to-End Processing: Handles everything from raw data to predictions

# Prevents Data Leaks: Preprocessing is contained within the pipeline

# Optimized Training: Early stopping finds the best model automatically

# Production Ready: Can be saved and deployed as one unit

# Output: A trained model that can predict directors for new Netflix titles while:

# Handling text, categories, and numbers appropriately

# Providing probabilistic predictions

# Being robust to overfitting


ValueError: Invalid classes inferred from unique values of `y`.  Expected: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93], got ['Andy Tennant' 'Anees Bazmee' 'Angga Dwimas Sasongko'
 'Antoinette Jadaone' 'Anurag Kashyap' 'Ashutosh Gowariker' 'Ava DuVernay'
 'Cathy Garcia-Molina' 'Clint Eastwood' 'David Batty' 'David Dhawan'
 'David Fincher' 'Detlev Buck' 'Don Michael Paul' 'Fernando Ayllón'
 'Hakan Algül' 'Hanung Bramantyo' 'Hidenori Inoue' 'Imtiaz Ali'
 'Indra Kumar' 'Jay Chapman' 'Jay Karas' 'Jay Roach' 'Joey So'
 'John G. Avildsen' 'Johnnie To' 'Justin G. Dyck' 'Kevin Smith'
 'Khaled Marei' 'Kunle Afolayan' 'Kıvanç Baruönü' 'Lance Bangs'
 'Lasse Hallström' 'Leslie Small' 'Lynn Shelton' 'Madhur Bhandarkar'
 'Mae Czarina Cruz' 'Mahesh Manjrekar' 'Mani Ratnam' 'Manny Rodriguez'
 'Marcus Raboy' 'Martin Campbell' 'Martin Scorsese' 'Matt Askem' 'McG'
 'Michael Simon' 'Mike Clattenburg' 'Milan Luthria' 'Nagesh Kukunoor'
 'Niyi Akinmolayan' 'Noah Baumbach' 'OTHER' 'Oliver Stone' 'Omoni Oboli'
 'Ozan Açıktan' 'Paul Thomas Anderson' 'Prakash Jha' 'Prakash Satam'
 'Priyadarshan' 'Quentin Tarantino' 'Rajiv Chilaka' 'Rajiv Mehra'
 'Rajkumar Santoshi' 'Ram Gopal Varma' 'Raúl Campos, Jan Suter'
 'Riri Riza' 'Robert Luketic' 'Robert Rodriguez' 'Robert Vince'
 'Rocky Soraya' 'Rohit Shetty' 'Ron Howard' 'Ryan Polito' 'S.S. Rajamouli'
 'Sameh Abdulaziz' 'Shannon Hartman' 'Sooraj R. Barjatya' 'Spike Lee'
 'Steve Brill' 'Steven Soderbergh' 'Steven Spielberg' 'Suhas Kadav'
 'Thierry Donard' 'Toshiya Shinohara' 'Troy Miller' 'UNKNOWN'
 'Umesh Mehra' 'Vince Marcello' 'Vishal Bhardwaj' 'Vlad Yudin'
 'Wenn V. Deramas' 'Wilson Yip' 'Youssef Chahine' 'Yılmaz Erdoğan']