In [1]:
import numpy as np
import pandas as pd

# loading the data

In [2]:
df_netflix = pd.read_csv('/mnt/c/Users/arina/Desktop/practice/netflix project/netflix_originals.csv')

In [3]:
df_netflix.head()

Unnamed: 0,titles,years,genres,imdb,runtime,description,stars,votes,type,original
0,Zumbo's Just Desserts,2016,Reality-TV,6.9,52 min,Amateur Australian chefs compete to impress pa...,"Gigi Falanga, Rachel Khoo, Adriano Zumbo",1779,TV Show,Netflix
1,Zona Rosa,2019,Comedy,6.0,,Add a Plot,"Ray Contreras, Pablo Morán, Manu Nna, Ana Juli...",33,TV Show,Netflix
2,Young Wallander,2020,"Crime, Drama, Mystery",6.7,,Follow recently graduated police officer Kurt ...,"Adam Pålsson, Leanne Best, Richard Dillane, El...",5419,TV Show,Netflix
3,You vs. Wild,2019,"Adventure, Reality-TV",6.7,20 min,"In this interactive series, you'll make key de...",Bear Grylls,1977,TV Show,Netflix
4,You,2018,"Crime, Drama, Romance",7.8,45 min,"A dangerously charming, intensely obsessive yo...","Penn Badgley, Victoria Pedretti, Ambyr Childer...",134932,TV Show,Netflix


In [4]:
df_netflix.dtypes

titles          object
years            int64
genres          object
imdb           float64
runtime         object
description     object
stars           object
votes           object
type            object
original        object
dtype: object

In [5]:
df_netflix.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1517 entries, 0 to 1516
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   titles       1517 non-null   object 
 1   years        1517 non-null   int64  
 2   genres       1516 non-null   object 
 3   imdb         1512 non-null   float64
 4   runtime      1276 non-null   object 
 5   description  1517 non-null   object 
 6   stars        1489 non-null   object 
 7   votes        1516 non-null   object 
 8   type         1517 non-null   object 
 9   original     1517 non-null   object 
dtypes: float64(1), int64(1), object(8)
memory usage: 118.6+ KB


## overall check

In [6]:
# Convert 'runtime' column to numeric (in minutes) by removing ' min' suffix and coercing to integer
df_netflix["runtime"] = df_netflix["runtime"].str.replace(' min', '', regex=False)
df_netflix['runtime'] = pd.to_numeric(df_netflix['runtime'], errors='coerce').astype('Int64')


In [7]:
df_netflix['votes'] = pd.to_numeric(df_netflix['votes'], errors='coerce')

In [8]:
#total number of duplicate rows
df_netflix.duplicated().sum()

np.int64(2)

In [9]:
df_netflix[df_netflix.duplicated()]

Unnamed: 0,titles,years,genres,imdb,runtime,description,stars,votes,type,original
368,Kipo and the Age of Wonderbeasts,2020,"Animation, Action, Adventure",8.4,24,A girl explores the possibilities in a post-ap...,"Karen Fukuhara, Sydney Mikayla, Dee Bradley Ba...",,TV Show,Netflix
1199,Jo Koy: In His Elements,2020,Comedy,5.1,55,Jo Koy returns to the Philippines to show off ...,"Michael McKay, A, Fateeha, Joey Guila, Jo Koy",350.0,TV Show,Netflix


In [10]:
#delete the duplicate rows
df_netflix.drop_duplicates(inplace=True)

In [11]:
# pandas automatic dtype fixer
df_netflix = df_netflix.convert_dtypes()
df_netflix.dtypes

titles         string[python]
years                   Int64
genres         string[python]
imdb                  Float64
runtime                 Int64
description    string[python]
stars          string[python]
votes                   Int64
type           string[python]
original       string[python]
dtype: object

## dealing with null values

In [12]:
#percentage of null values
((df_netflix.isnull().sum()/len(df_netflix)) * 100).sort_values(ascending=False)


votes          64.224422
runtime        15.907591
stars           1.848185
imdb            0.330033
genres          0.066007
titles          0.000000
years           0.000000
description     0.000000
type            0.000000
original        0.000000
dtype: float64

In [13]:
# Display the count of unique values in the 'type' column (e.g., number of movies vs. TV shows)
df_netflix["type"].value_counts()

type
TV Show    1010
Movie       505
Name: count, dtype: Int64

In [14]:
# Filter out rows where 'imdb' is greater than 10 or less than 0
df_netflix = df_netflix[~((df_netflix['imdb'] > 10) | (df_netflix['imdb'] < 0))]

In [15]:
# Calculate and display the median runtime for each content type (Movie, TV Show)
median_runtime_per_type = df_netflix.groupby('type')["runtime"].median()
median_runtime_per_type

type
Movie      96.0
TV Show    50.0
Name: runtime, dtype: Float64

In [16]:
# Fill missing 'runtime' values based on the median runtime of their respective content 'type'
df_netflix["runtime"] = df_netflix["runtime"].fillna(df_netflix["type"].map(median_runtime_per_type))


In [17]:
#percentage of nuul values
((df_netflix.isnull().sum() / len(df_netflix)) *100).sort_values(ascending=False)

votes          64.304636
stars           1.788079
genres          0.066225
titles          0.000000
years           0.000000
imdb            0.000000
description     0.000000
runtime         0.000000
type            0.000000
original        0.000000
dtype: float64

In [18]:
df_netflix.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1510 entries, 0 to 1516
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   titles       1510 non-null   string 
 1   years        1510 non-null   Int64  
 2   genres       1509 non-null   string 
 3   imdb         1510 non-null   Float64
 4   runtime      1510 non-null   Int64  
 5   description  1510 non-null   string 
 6   stars        1483 non-null   string 
 7   votes        539 non-null    Int64  
 8   type         1510 non-null   string 
 9   original     1510 non-null   string 
dtypes: Float64(1), Int64(3), string(6)
memory usage: 135.7 KB


In [19]:

df_netflix.to_excel("clean_netflix_data.xlsx",index=False)