In [1]:
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report

In [2]:
# Load the Netflix titles catalogue from the CSV next to the notebook
# and preview its first five rows.
netflix_df = pd.read_csv(filepath_or_buffer="netflix_titles.csv")
netflix_df.head(n=5)


Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [3]:
# Inspect the raw genre strings: each entry is a comma-separated list of genres.
listed_in_column = netflix_df["listed_in"]
print(listed_in_column)

0                                           Documentaries
1         International TV Shows, TV Dramas, TV Mysteries
2       Crime TV Shows, International TV Shows, TV Act...
3                                  Docuseries, Reality TV
4       International TV Shows, Romantic TV Shows, TV ...
                              ...                        
8802                       Cult Movies, Dramas, Thrillers
8803               Kids' TV, Korean TV Shows, TV Comedies
8804                              Comedies, Horror Movies
8805                   Children & Family Movies, Comedies
8806       Dramas, International Movies, Music & Musicals
Name: listed_in, Length: 8807, dtype: object


In [4]:
# Only the first genre listed for each title is kept as its primary genre
# Take the text before the first comma of 'listed_in' and trim surrounding whitespace.
# The vectorised .str accessor replaces a row-wise apply(lambda ...) and propagates
# missing values as NaN instead of raising AttributeError on a non-string entry.
netflix_df['primary_genre'] = (
    netflix_df['listed_in']
    .str.split(',', n=1)  # n=1: only the first separator matters
    .str[0]
    .str.strip()
)
netflix_df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,primary_genre
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm...",Documentaries
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t...",International TV Shows
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...,Crime TV Shows
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo...",Docuseries
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...,International TV Shows


In [5]:
# Count the missing values in every column (one entry per column name).
total_nans = netflix_df.isnull().sum(axis=0)
print(total_nans)

show_id             0
type                0
title               0
director         2634
cast              825
country           831
date_added         10
release_year        0
rating              4
duration            3
listed_in           0
description         0
primary_genre       0
dtype: int64


In [6]:
# Flag every row that is an exact repeat of an earlier row, then report how many were found.
duplicates = netflix_df.duplicated(keep='first')
duplicate_count = duplicates.sum()

print(f"Number of duplicate rows: {duplicate_count}")



Number of duplicate rows: 0


In [7]:
# 'duration' is dropped because the 'type' column already tells us whether a title
# is a movie or a series.
# errors='ignore' keeps this cell re-runnable: a second run no longer raises KeyError
# once the column is gone.
netflix_df = netflix_df.drop(columns='duration', errors='ignore')

# Fill missing values with one DataFrame-level fillna and rebind the result.
# The previous pattern, netflix_df[col].fillna(..., inplace=True), is chained
# in-place assignment: it warns on recent pandas and, under Copy-on-Write,
# modifies a temporary object and leaves netflix_df unchanged.
netflix_df = netflix_df.fillna({
    # Free-text columns: an explicit placeholder for missing entries.
    'director': 'Unknown',
    'cast': 'Unknown',
    'country': 'Unknown',
    # Only a handful of rows lack these, so the most frequent value is used.
    'date_added': netflix_df['date_added'].mode()[0],
    'rating': netflix_df['rating'].mode()[0],
})


In [8]:
# Feature matrix and target: predict a title's primary genre from its metadata and text.
X = netflix_df[['type', 'rating', 'description', 'cast', 'director', 'release_year']]  # Features
y = netflix_df['primary_genre']  # Target

# Column-wise preprocessing shared by the models.
# Text columns are selected with a plain string so TfidfVectorizer receives a
# 1-D series of documents; the other transformers get a list of columns (2-D input).
preprocessing_data = ColumnTransformer(
    transformers=[
        ('desc_tfidf', TfidfVectorizer(stop_words='english', max_features=1000), 'description'),
        ('cast_tfidf', TfidfVectorizer(stop_words='english', max_features=1000), 'cast'),
        ('director_tfidf', TfidfVectorizer(stop_words='english', max_features=1000), 'director'),
        # release_year is standardised rather than passed through raw. Raw years are
        # orders of magnitude larger than the TF-IDF / one-hot values in [0, 1] and
        # would dominate a scale-sensitive model such as the SVM.
        ('year', StandardScaler(), ['release_year']),
        # handle_unknown='ignore': categories unseen during fit encode as all zeros
        # instead of raising at predict time.
        ('onehot', OneHotEncoder(handle_unknown='ignore'), ['type', 'rating']),
    ],
    remainder='drop'
)


In [9]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split reproducible.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts


In [10]:
# Random forest on top of the shared preprocessing step.
rfc_model_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessing_data),
        (
            'rfc_model',
            RandomForestClassifier(
                n_estimators=300,
                max_depth=None,
                min_samples_split=10,
                random_state=42,
            ),
        ),
    ]
)

# fit() returns the fitted pipeline, so training and test-set prediction can be chained.
rfc_model_pred = rfc_model_pipeline.fit(X_train, y_train).predict(X_test)


In [11]:
# Linear-kernel SVM on the same feature set.
# - The preprocessor is cloned so this pipeline owns its own unfitted copy. Pipeline
#   does not copy its steps, so reusing the same ColumnTransformer instance would
#   re-fit the object held by any other pipeline built from it.
# - gamma is not passed: it is the coefficient of the rbf/poly/sigmoid kernels and
#   has no effect with kernel='linear'.
svm_model_pipeline = Pipeline([
    ('preprocessor', clone(preprocessing_data)),
    ('svm_model', SVC(kernel='linear', C=1.0, random_state=42))
])

svm_model_pipeline.fit(X_train, y_train)
svm_model_pred = svm_model_pipeline.predict(X_test)


In [12]:
# Per-class precision / recall / F1 for each model on the held-out test set.
# zero_division=0 reports 0 (without a warning) for classes that were never predicted.
for report_title, predictions in (
    ("Random Forest Classifier Results:", rfc_model_pred),
    ("SVM Results:", svm_model_pred),
):
    print(report_title)
    print(classification_report(y_test, predictions, zero_division=0))


Random Forest Classifier Results:
                              precision    recall  f1-score   support

          Action & Adventure       0.71      0.42      0.53       172
              Anime Features       0.00      0.00      0.00         3
                Anime Series       1.00      0.54      0.70        39
            British TV Shows       0.50      0.21      0.30        52
    Children & Family Movies       0.75      0.82      0.78       122
           Classic & Cult TV       0.00      0.00      0.00         3
              Classic Movies       0.75      0.30      0.43        20
                    Comedies       0.60      0.36      0.45       231
              Crime TV Shows       0.74      0.54      0.62        89
                 Cult Movies       0.00      0.00      0.00         6
               Documentaries       0.89      0.78      0.83       151
                  Docuseries       0.55      0.53      0.54        51
                      Dramas       0.44      0.83      