In [None]:
""" Leitura dos dados de Filmes

O arquivo contem os seguintes dados:

    - show_id:  identificador único para filme ou seriado
    - type: Identifica se é um filme ou seriado
    - title: titulo do filme ou seriado
    - director: nome do diretor do filme ou seriado
    - cast: (elenco) nome dos atores envolvidos
    - country:  nome do pais onde o filme ou seriado foi produzido ou apresentado
    - date_added: data em que o filme ou seriado foi adicionado a Netflix
    - release_year: ano de llançamento do filme ou seriado
    - rating: orientação parental
    - duration: duração em minutos ou número de temporadas
    - listed_in: categorias nas quais o filme ou seriado foi listado
    - description: a descrição do summary"""

In [None]:
# importing pandas and creating the dataset
import pandas as pd

filename = 'dataset/netflix_titles.csv'
df = pd.read_csv(filename, parse_dates=['date_added'])
df.head()



title
Dick Johnson Is Dead                     1
Ip Man 2                                 1
Hannibal Buress: Comedy Camisado         1
Turbo FAST                               1
Masha's Tales                            1
                                        ..
Love for Sale 2                          1
ROAD TO ROMA                             1
Good Time                                1
Captain Underpants Epic Choice-o-Rama    1
Zubaan                                   1
Name: count, Length: 8807, dtype: int64

In [3]:
# Verificação da qualidade dos dados
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB


In [None]:
# Checking for null data
df.isna().sum()


show_id            0
type               0
title              0
director        2634
cast             825
country          831
date_added        10
release_year       0
rating             4
duration           3
listed_in          0
description        0
dtype: int64

In [None]:
# Checking for duplicated data
df[df.duplicated()]

# if we had duplicates we should use:
# df.drop_duplicates

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description


In [None]:
# lets deal with the 2634 missing directors
df['director'].value_counts()

# the director with most titles has 19, therefor we have at least 400 different directors 

director
Rajiv Chilaka                     19
Raúl Campos, Jan Suter            18
Marcus Raboy                      16
Suhas Kadav                       16
Jay Karas                         14
                                  ..
Raymie Muzquiz, Stu Livingston     1
Joe Menendez                       1
Eric Bross                         1
Will Eisenberg                     1
Mozez Singh                        1
Name: count, Length: 4528, dtype: int64

In [None]:
# I chose to use scikit-learn to prepare the database and XGBoost to predict the director
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
import joblib

# the idea is to have a maximum number of directors and a category "others"
TOP_N_DIRECTORS = 100  # Number of top directors to predict specifically
MIN_SAMPLES = 5  # Minimum films a director must have to be included in TOP_N


# Clean director data
df.fillna({"director": 'UNKNOWN'}, inplace=True)

In [13]:
# Create target variable - keep top N directors, group others as 'OTHER'
director_counts = df['director'].value_counts()
top_directors = director_counts[director_counts >= MIN_SAMPLES].index[:TOP_N_DIRECTORS]
df['director_target'] = df['director'].apply(lambda x: x if x in top_directors else 'OTHER')

# Extract year from date_added if available
df['year_added'] = pd.to_datetime(df['date_added'], errors='coerce').dt.year

# Create features
features = df[['title', 'type', 'cast', 'country', 'release_year', 'rating', 'duration', 'listed_in']]
target = df['director_target']

# Split data
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

In [18]:
# Text preprocessing for title
import pyarrow
from sklearn.preprocessing import StandardScaler
title_transformer = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=200, stop_words='english'))
])

# Categorical features preprocessing
categorical_transformer = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# Numerical features preprocessing
numerical_transformer = Pipeline([
    ('scaler', StandardScaler())
])

# Column transformer
preprocessor = ColumnTransformer([
    ('title', title_transformer, 'title'),
    ('text_features', TfidfVectorizer(max_features=100), 'listed_in'),
    ('cat', categorical_transformer, ['type', 'rating']),
    ('num', numerical_transformer, ['release_year', 'year_added'])
], remainder='drop')