In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np 
import seaborn as sns
%matplotlib inline

In [None]:
# Read the viewing-activity CSV and preview its first row to check the columns.
data = pd.read_csv(filepath_or_buffer='ViewingActivity.csv')
data.head(n=1)

In [None]:
# Total watch duration for each profile, displayed longest first.

# 'Duration' is converted to timedelta so it can be summed; the later cells
# that aggregate 'Duration' rely on this conversion having been run.
data['Duration'] = pd.to_timedelta(data['Duration'])
profile_duration = data.groupby('Profile Name')['Duration'].sum()
# Only the displayed view is sorted; profile_duration itself keeps group order.
profile_duration.sort_values(ascending=False)

In [None]:
# Rank (profile, title) pairs by their total watch duration, longest first.
# 'Title (Short)' keeps only the part of 'Title' before the first ':'.
data['Title (Short)'] = data['Title'].str.split(":", expand=False).str.get(0)
most_watched = (
    data.groupby(['Profile Name', 'Title (Short)'])['Duration']
    .sum()
    .sort_values(ascending=False)
)

most_watched.head(40)

In [None]:
# Bar chart of the number of viewing records per profile.
profile_count = data['Profile Name'].value_counts()

plt.figure(figsize=(8, 5))
plt.bar(x=profile_count.index, height=profile_count.values, color='teal')
plt.title('Viewing Frequency of each Profile', fontsize=16)
plt.xlabel('Profile Names', fontsize=14)
plt.ylabel('Freq', fontsize=14)
plt.xticks(fontsize=11)
plt.show()

In [None]:
# Bar chart of the number of viewing records per country, on a log y-axis.
country_count = data['Country'].value_counts()

plt.figure(figsize=(8,5))
plt.bar(country_count.index, country_count.values, color="crimson")
plt.xlabel("Countries", fontsize=14)
plt.ylabel("Frequency (log scale)", fontsize=14)
plt.xticks(rotation=45, ha='right', fontsize=10)
plt.title("Locations Used to Access Netflix", fontsize=16)
# Base 10 is the log scale's default. The axis-specific `basey` keyword was
# deprecated in matplotlib 3.3 and is rejected by later releases, so it is
# omitted here to keep the cell working across versions.
plt.yscale("log")
plt.show()

In [None]:
# Stacked bars: one bar per profile, split into segments by country.
(
    data.groupby(['Profile Name', 'Country'])
    .size()
    .unstack()
    .plot.bar(stacked=True)
)
plt.title('Locations Used to Access Netflix for Each Profile', fontsize=14)
# Place the legend just outside the right edge of the axes.
plt.legend(loc=(1.05, 0))
plt.show()

In [None]:
# Horizontal bar chart of the number of viewing records per device type.
device_count = data['Device Type'].value_counts()

plt.figure(figsize=(10, 5))
plt.barh(y=device_count.index, width=device_count.values, color="mediumturquoise")
plt.title("Devices Used to Access Netflix", fontsize=16)
plt.ylabel("Devices used", fontsize=14)
plt.xlabel("Freq", fontsize=14)
plt.xticks(fontsize=10)
# Flip the y-axis so the first (most frequent) device is drawn at the top.
plt.gca().invert_yaxis()
plt.show()

In [None]:
# Stacked bars: one bar per profile, split into segments by device type.
(
    data.groupby(['Profile Name', 'Device Type'])
    .size()
    .unstack()
    .plot.bar(stacked=True, colormap="tab20b")
)
plt.title("Devices Used for Each Profile")
# Place the legend just outside the right edge of the axes.
plt.legend(loc=(1.05, 0))
plt.show()

In [None]:
# Unique (query, profile) pairs from the search history, sorted by query text
# in descending order; the first 500 are displayed.
# NOTE: this display option is global and stays in effect for later cells.
pd.set_option('display.max_rows', None)
data_search_history = pd.read_csv('SearchHistory.csv')
displayed_name = (
    data_search_history[['Query Typed', 'Profile Name']]
    .drop_duplicates()
    .sort_values(by='Query Typed', ascending=False)
)
displayed_name.head(500)

In [None]:
# Total watch duration per profile per calendar date; show the 20 largest.

# .copy() makes start_times an independent frame, so adding a column below
# does not write into a slice of `data` (avoids SettingWithCopyWarning).
start_times = data[['Profile Name', 'Start Time', 'Duration']].copy()
start_times['Start Date'] = pd.to_datetime(start_times['Start Time']).dt.date
# Select the value column with a list: tuple-style selection on a groupby
# (`[...]['a', 'b']`) is rejected by current pandas, and 'Start Date' is
# already a grouping key so it must not be summed.
(
    start_times.groupby(['Profile Name', 'Start Date'])[['Duration']]
    .sum()
    .sort_values(by='Duration', ascending=False)
    .head(20)
)

In [None]:
# Horizontal bar chart of the total watch duration for each calendar date.

# Sum the rows of each date first: passing the raw rows to barh draws one bar
# per row at the same y position, so they overlap instead of adding up.
# pd.to_timedelta is a no-op when 'Duration' is already timedelta.
daily_duration = (
    pd.to_timedelta(start_times['Duration'])
    .groupby(start_times['Start Date'])
    .sum()
)
# Convert timedeltas to float hours so the x-axis is a plain numeric scale.
daily_hours = daily_duration.dt.total_seconds() / 3600

plt.figure(figsize=(10,5))
plt.barh(daily_hours.index, daily_hours.values, color="blue")
plt.xlabel("Duration (hours)", fontsize=14)
plt.ylabel("Start Date", fontsize=14)
plt.xticks(fontsize=10)
plt.title("Watch duration per date", fontsize=16)
# Flip the y-axis so the earliest date is drawn at the top.
plt.gca().invert_yaxis()
plt.show()