Download and Load Netflix Data

In [1]:
import pandas as pd

In [2]:
# Load the Netflix viewing-activity export; the relative path is resolved
# against the notebook's working directory.
df = pd.read_csv('ViewingActivity.csv')

In [3]:
# (rows, columns) of the raw export, as a quick sanity check on the load.
df.shape

(7381, 10)

In [4]:
# Preview the first rows to see which columns the export contains.
df.head()

Unnamed: 0,Profile Name,Start Time,Duration,Attributes,Title,Supplemental Video Type,Device Type,Bookmark,Latest Bookmark,Country
0,Marta,2021-06-19 10:11:11,00:14:52,Autoplayed: user action: User_Interaction;,Park Jurajski: Obóz Kredowy: Sezon 2: Sztuka r...,,Samsung CE 2020 Nike-L UHD TV (2G) Smart TV,00:23:59,00:23:59,PL (Poland)
1,Marta,2021-06-19 09:58:20,00:12:06,,Alvinnn!!! I wiewiórki: Sezon 1: Ciasto / Kto ...,,Samsung CE 2020 Nike-L UHD TV (2G) Smart TV,00:22:35,00:22:35,PL (Poland)
2,Marta,2021-06-19 09:48:32,00:08:27,,Alvinnn!!! I wiewiórki: Sezon 2: Szalony weeke...,,Samsung CE 2020 Nike-L UHD TV (2G) Smart TV,00:22:36,00:22:36,PL (Poland)
3,Marta,2021-06-19 09:19:38,00:26:54,,Nasza planeta: Sezon 1: Dżungle (Odcinek 3),,Samsung CE 2020 Nike-L UHD TV (2G) Smart TV,00:49:56,00:49:56,PL (Poland)
4,Marta,2021-06-18 21:16:25,00:21:07,,Park Jurajski: Obóz Kredowy: Sezon 2: Nadajnik...,,Samsung CE 2020 Nike-L UHD TV (2G) Smart TV,00:21:52,00:21:52,PL (Poland)


Dropping Unnecessary Columns

In [5]:
# Discard the columns that are not used further on, then preview the
# slimmed-down frame.
df = df.drop(
    columns=[
        'Profile Name',
        'Attributes',
        'Supplemental Video Type',
        'Device Type',
        'Bookmark',
        'Latest Bookmark',
    ]
)
df.head()

Unnamed: 0,Start Time,Duration,Title,Country
0,2021-06-19 10:11:11,00:14:52,Park Jurajski: Obóz Kredowy: Sezon 2: Sztuka r...,PL (Poland)
1,2021-06-19 09:58:20,00:12:06,Alvinnn!!! I wiewiórki: Sezon 1: Ciasto / Kto ...,PL (Poland)
2,2021-06-19 09:48:32,00:08:27,Alvinnn!!! I wiewiórki: Sezon 2: Szalony weeke...,PL (Poland)
3,2021-06-19 09:19:38,00:26:54,Nasza planeta: Sezon 1: Dżungle (Odcinek 3),PL (Poland)
4,2021-06-18 21:16:25,00:21:07,Park Jurajski: Obóz Kredowy: Sezon 2: Nadajnik...,PL (Poland)


Converting Strings to Datetime and Timedelta in pandas

In [6]:
# Inspect the column dtypes before converting the time columns.
df.dtypes

Start Time    object
Duration      object
Title         object
Country       object
dtype: object

In [7]:
# Parse the Start Time strings into timezone-aware datetimes; utc=True
# localizes them to UTC.
# NOTE(review): this assumes the export records Start Time in UTC — confirm.
df['Start Time'] = pd.to_datetime(df['Start Time'], utc=True)
df.dtypes

Start Time    datetime64[ns, UTC]
Duration                   object
Title                      object
Country                    object
dtype: object

In [8]:
# Convert Start Time from UTC to Warsaw local time. The .dt accessor applies
# tz_convert to the column directly, so the frame's index is left untouched
# and the cell can be re-run safely.
df['Start Time'] = df['Start Time'].dt.tz_convert('Europe/Warsaw')

In [11]:
# Double-check that the conversion worked: Start Time should now be shown
# in Warsaw local time with its UTC offset.
df.head()

Unnamed: 0,Start Time,Duration,Title,Country
0,2021-06-19 12:11:11+02:00,00:14:52,Park Jurajski: Obóz Kredowy: Sezon 2: Sztuka r...,PL (Poland)
1,2021-06-19 11:58:20+02:00,00:12:06,Alvinnn!!! I wiewiórki: Sezon 1: Ciasto / Kto ...,PL (Poland)
2,2021-06-19 11:48:32+02:00,00:08:27,Alvinnn!!! I wiewiórki: Sezon 2: Szalony weeke...,PL (Poland)
3,2021-06-19 11:19:38+02:00,00:26:54,Nasza planeta: Sezon 1: Dżungle (Odcinek 3),PL (Poland)
4,2021-06-18 23:16:25+02:00,00:21:07,Park Jurajski: Obóz Kredowy: Sezon 2: Nadajnik...,PL (Poland)


In [12]:
# Parse the Duration strings (HH:MM:SS) into timedeltas so they can be
# compared against a time threshold, then confirm the new dtypes.
df['Duration'] = pd.to_timedelta(df['Duration'])
df.dtypes

Start Time    datetime64[ns, Europe/Warsaw]
Duration                    timedelta64[ns]
Title                                object
Country                              object
dtype: object

Filtering Strings by Substring in pandas Using str.contains

In [13]:
# Build park_jurajski from the rows of df whose Title contains the literal
# text 'Park Jurajski: Obóz Kredowy' (regex=False, so no pattern matching).
# na=False treats a missing Title as "no match"; without it a NaN in the
# mask makes the boolean indexing raise.
park_jurajski = df[df['Title'].str.contains('Park Jurajski: Obóz Kredowy', regex=False, na=False)]

In [14]:
# Spot-check up to 20 random matches. The fixed random_state makes the sample
# reproducible across re-runs, and capping n at the row count avoids the
# ValueError that sample() raises when asked for more rows than exist.
park_jurajski.sample(n=min(20, len(park_jurajski)), random_state=0)

Unnamed: 0,Start Time,Duration,Title,Country
342,2021-05-21 22:23:59+02:00,0 days 00:22:05,Park Jurajski: Obóz Kredowy: Sezon 3: Bezpiecz...,PL (Poland)
0,2021-06-19 12:11:11+02:00,0 days 00:14:52,Park Jurajski: Obóz Kredowy: Sezon 2: Sztuka r...,PL (Poland)
412,2021-05-07 08:24:25+02:00,0 days 00:01:27,Sezon 2 (zwiastun): Park Jurajski: Obóz Kredowy,PL (Poland)
305,2021-05-25 19:55:14+02:00,0 days 00:22:49,Park Jurajski: Obóz Kredowy: Sezon 3: Zgodnie ...,PL (Poland)
307,2021-05-25 19:45:29+02:00,0 days 00:00:01,Park Jurajski: Obóz Kredowy: Sezon 3: Zgodnie ...,PL (Poland)
49,2021-06-17 11:04:38+02:00,0 days 00:00:29,Sezon 2 (zwiastun): Park Jurajski: Obóz Kredowy,PL (Poland)
796,2021-03-26 19:16:04+01:00,0 days 00:22:49,Park Jurajski: Obóz Kredowy: Sezon 2: Jeden kr...,PL (Poland)
311,2021-05-25 19:44:46+02:00,0 days 00:00:06,Sezon 1 (zwiastun): Park Jurajski: Obóz Kredowy,PL (Poland)
1496,2021-02-14 14:31:00+01:00,0 days 00:22:20,Park Jurajski: Obóz Kredowy: Sezon 2: Wodopój ...,PL (Poland)
788,2021-03-27 08:46:30+01:00,0 days 00:00:05,Sezon 1 (Klip nr 2): Park Jurajski: Obóz Kredowy,PL (Poland)


In [15]:
# (rows, columns) of the title-filtered frame.
park_jurajski.shape

(118, 4)

Filtering Out Short Durations Using Timedelta

In [16]:
# Keep only the rows whose Duration is longer than three minutes. The intent
# is to still count partially watched episodes while leaving out trailers.
park_jurajski = park_jurajski.loc[park_jurajski['Duration'] > pd.Timedelta(minutes=3)]

In [17]:
# Spot-check up to 20 random rows of the duration-filtered frame. The fixed
# random_state keeps the sample reproducible, and capping n at the row count
# avoids a ValueError when fewer than 20 rows remain.
park_jurajski.sample(n=min(20, len(park_jurajski)), random_state=0)

Unnamed: 0,Start Time,Duration,Title,Country
1460,2021-02-15 20:36:14+01:00,0 days 00:21:49,Park Jurajski: Obóz Kredowy: Sezon 2: Teoria c...,PL (Poland)
1526,2021-02-12 21:51:31+01:00,0 days 00:22:03,Park Jurajski: Obóz Kredowy: Sezon 1: Czasem w...,PL (Poland)
1524,2021-02-12 22:36:14+01:00,0 days 00:22:04,Park Jurajski: Obóz Kredowy: Sezon 1: Witamy w...,PL (Poland)
1527,2021-02-12 21:29:27+01:00,0 days 00:22:03,Park Jurajski: Obóz Kredowy: Sezon 1: Spęd sta...,PL (Poland)
1501,2021-02-13 23:54:18+01:00,0 days 00:22:01,Park Jurajski: Obóz Kredowy: Sezon 2: Sztuka r...,PL (Poland)
20,2021-06-17 16:11:29+02:00,0 days 00:22:00,Park Jurajski: Obóz Kredowy: Sezon 1: Czasem w...,PL (Poland)
339,2021-05-22 19:06:56+02:00,0 days 00:22:03,Park Jurajski: Obóz Kredowy: Sezon 3: Spryciar...,PL (Poland)
211,2021-06-02 18:05:01+02:00,0 days 00:22:11,Park Jurajski: Obóz Kredowy: Sezon 3: Ucieczka...,PL (Poland)
1528,2021-02-12 21:06:21+01:00,0 days 00:22:48,Park Jurajski: Obóz Kredowy: Sezon 1: Tajemnic...,PL (Poland)
798,2021-03-26 18:37:15+01:00,0 days 00:22:16,Park Jurajski: Obóz Kredowy: Sezon 1: Obóz Kre...,PL (Poland)


In [18]:
# (rows, columns) remaining after the duration filter.
park_jurajski.shape

(56, 4)