# Data Visualization

Netflix allows the downloading of personal behaviour data. This data can be analysed to gain insights into your viewing behaviour.
The purpose of this notebook is to practice some data visualisation on Netflix's personal behaviour data.

To download your data from Netflix, follow these steps:

1. Log in to your Netflix account and go to the "Account" page, or go directly to this address: https://www.netflix.com/account/security
2. In the "Access and Privacy" table there will be a "Personal info access" entry. Click on it.
3. Click on the "Submit request" button at the bottom of the page. You will receive an email from Netflix asking you to confirm the personal data request.
4. After some time you will be able to download your data. In my case, it took them half a day.


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np


# Load the viewing-activity export; every later cell works on this DataFrame.
try:
    df = pd.read_csv("netflix-report/CONTENT_INTERACTION/ViewingActivity.csv")
except FileNotFoundError:
    print(
        "Error: ViewingActivity.csv not found. Make sure the file is in the correct directory."
    )
    # `exit()` is a helper injected by the `site` module for interactive
    # shells, and called without arguments it reports a successful exit.
    # Raise SystemExit directly with a non-zero status so the failure is
    # visible to whatever launched this code.
    raise SystemExit(1)
else:
    print("Data loaded successfully!")

In [None]:
# Quick look at the raw export before any cleaning is applied.

# The first 5 rows
print("First 5 rows of the DataFrame:")
print(df.head())

# Summary of the DataFrame, including data types and non-null values
print("\nDataFrame Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
df.info()

# Basic descriptive statistics for numerical columns
print("\nDescriptive Statistics:")
print(df.describe())

# The column names
print("\nColumn Names:")
print(df.columns)

In [None]:
# Parse the "Start Time" text into timezone-aware (UTC) timestamps so the
# .dt accessors used further down are available.
start_time_utc = pd.to_datetime(df["Start Time"], utc=True)
df["Start Time"] = start_time_utc


def parse_duration(duration_str):
    """Convert a clock-style duration string into a number of seconds.

    Accepted layouts are ``"HH:MM:SS"``, ``"MM:SS"`` and ``"SS"``.

    Args:
        duration_str: The duration text. Missing values (``None``/``NaN``)
            and other non-string inputs are tolerated.

    Returns:
        int: Total seconds, or 0 when the value is missing, contains a
        non-integer field, or has more than three ``:``-separated fields.
    """
    # An empty CSV cell arrives as a float NaN, which has no .split().
    if not isinstance(duration_str, str):
        return 0
    try:
        parts = [int(part) for part in duration_str.split(":")]
    except ValueError:
        # Non-numeric text: treat as "no duration" rather than aborting
        # the whole column conversion.
        return 0
    if len(parts) > 3:
        return 0
    # Fold left to right: each further field is worth 1/60 of the previous.
    seconds = 0
    for part in parts:
        seconds = seconds * 60 + part
    return seconds


# Derive numeric durations from the "Duration" text column.
df["Duration_seconds"] = df["Duration"].apply(parse_duration)
df["Duration_minutes"] = df["Duration_seconds"] / 60

# Keep only rows with no "Supplemental Video Type" value.
# NOTE(review): presumably that column marks trailers/teasers and similar
# non-content playback -- confirm against the Netflix export documentation.
# .copy() makes the filtered result an independent DataFrame, so columns
# added to `df` later are not written into a slice of the original frame
# (which triggers pandas' SettingWithCopyWarning).
df = df[df["Supplemental Video Type"].isna()].copy()

In [None]:
# Sum of every remaining session's duration, regardless of profile.
total_watch_time_minutes = df["Duration_minutes"].sum()
total_watch_time_hours = total_watch_time_minutes / 60
print(f"Total watch time across all profiles: {total_watch_time_hours:.2f} hours")

In [None]:
# Total minutes watched by each profile, largest first.
minutes_by_profile = df.groupby("Profile Name")["Duration_minutes"].sum()
watch_time_per_profile = minutes_by_profile.sort_values(ascending=False)

# Same ranking expressed in hours, rounded to two decimals for display.
watch_time_hours_rounded = (watch_time_per_profile / 60).round(2)

print("\nWatch time per profile (hours):")
print(watch_time_hours_rounded)

In [None]:
# Safety net: rebuild the numeric duration columns if this cell is run
# before the cell that normally creates them.
if "Duration_minutes" not in df.columns and "Duration" in df.columns:
    df["Duration_seconds"] = df["Duration"].apply(parse_duration)
    df["Duration_minutes"] = df["Duration_seconds"] / 60


# --- Top 10 titles, ranked by total minutes and by number of sessions ---
minutes_per_title = df.groupby("Title")["Duration_minutes"].sum()
top_titles_by_duration = minutes_per_title.sort_values(ascending=False).head(10)

top_titles_by_sessions = df["Title"].value_counts().head(10)


# --- Horizontal bar chart of the top 10 titles by watch time ---
plt.figure(figsize=(12, 7))
# The totals are stored in minutes; the chart shows hours.
hours_watched = top_titles_by_duration.values / 60
sns.barplot(x=hours_watched, y=top_titles_by_duration.index, palette="viridis")
plt.title("Most (re)watched films (hours)")
plt.xlabel("Total Watch Time (hours)")
plt.ylabel("Title")
plt.tight_layout()
plt.show()

In [None]:
# Break the session start timestamp into the calendar features used by the
# charts in this and the following cells.
start_time_parts = df["Start Time"].dt
df["hour_of_day"] = start_time_parts.hour
df["day_of_week"] = start_time_parts.day_name()
df["month"] = start_time_parts.month_name()
df["year"] = start_time_parts.year

# Explicit Monday-first ordering for the x-axis categories.
day_order = [
    "Monday", "Tuesday", "Wednesday", "Thursday",
    "Friday", "Saturday", "Sunday",
]

# One bar per weekday, counting rows (sessions) that started on that day.
plt.figure(figsize=(10, 6))
sns.countplot(x="day_of_week", order=day_order, data=df)
plt.title("Netflix Viewing Sessions by day of week")
plt.xlabel("Day of week")
plt.ylabel("Number of sessions")
plt.show()

In [None]:
# Calendar order for the heatmap columns (month names as produced by
# Series.dt.month_name()).
month_order = [
    "January",
    "February",
    "March",
    "April",
    "May",
    "June",
    "July",
    "August",
    "September",
    "October",
    "November",
    "December",
]

# Total viewing time in hours: one row per year, one column per month.
monthly_yearly_watch_time = (
    df.groupby(["year", "month"])["Duration_minutes"].sum().unstack(fill_value=0) / 60
)

# Put the columns in calendar order. reindex (rather than plain column
# selection) also adds a zero-filled column for any month that never occurs
# in the data, where `frame[month_order]` would raise a KeyError.
monthly_yearly_watch_time = monthly_yearly_watch_time.reindex(
    columns=month_order, fill_value=0
)

plt.figure(figsize=(14, 8))
sns.heatmap(
    monthly_yearly_watch_time, cmap="YlGnBu", annot=True, fmt=".0f", linewidths=0.5
)
plt.title("Total Netflix watching time (hours) by year and month")
plt.xlabel("Month")
plt.ylabel("Year")
plt.xticks(rotation=45, ha="right")
plt.show()

In [None]:
# One bar per calendar year, counting rows (sessions) that started in it.
plt.figure(figsize=(10, 6))
year_axes = sns.countplot(x="year", data=df)
# Labels are applied after plotting so they replace seaborn's defaults.
year_axes.set_title("Netflix Viewing Sessions per year")
year_axes.set_xlabel("Year")
year_axes.set_ylabel("Number of Sessions")
plt.show()

In [None]:
# Sessions per hour of day, coloured by profile. Needs the "hour_of_day"
# column created from "Start Time" in an earlier cell.
required_columns = {"hour_of_day", "Profile Name"}

if not required_columns.issubset(df.columns):
    print("Error: 'hour_of_day' or 'Profile Name' column not found in the DataFrame.")
    print("Please make sure you have run the previous code to create these columns.")

else:
    plt.figure(figsize=(12, 7))

    # One colour per distinct profile value.
    profile_count = len(df["Profile Name"].unique())
    custom_palette = sns.color_palette("Paired", n_colors=profile_count)

    # NOTE(review): with dodge=False the bars of different profiles share the
    # same x position, so they appear to be drawn over one another rather
    # than stacked -- confirm this is the intended look.
    sns.countplot(
        x="hour_of_day",
        hue="Profile Name",
        data=df,
        order=range(24),
        palette=custom_palette,
        dodge=False,
    )

    plt.title("Netflix Viewing Sessions by Hour of Day (by Profile)")
    plt.xlabel("Hour of Day (UTC)")
    plt.ylabel("Number of Sessions")
    plt.xticks(range(24))  # a tick for every hour, 0 through 23
    plt.legend(title="Profile Name")
    plt.grid(axis="y", linestyle="--", alpha=0.7)  # light horizontal guide lines
    plt.tight_layout()  # keep labels from overlapping
    plt.show()

In [None]:
def format_minutes_to_hh_mm(total_minutes):
    """Render a duration given in minutes as an ``HH:MM`` string.

    Missing values (anything ``pd.isna`` reports as NA) yield ``"N/A"``.
    Fractional minutes are truncated by ``int()`` before formatting, and the
    hour field grows beyond two digits when needed.
    """
    if pd.isna(total_minutes):
        return "N/A"
    whole_hours, leftover_minutes = divmod(int(total_minutes), 60)
    return f"{whole_hours:02d}:{leftover_minutes:02d}"


# Total minutes for every (profile, title) pair, as a flat table.
profile_title_watch_time = (
    df.groupby(["Profile Name", "Title"])["Duration_minutes"].sum().reset_index()
)

# idxmax yields, per profile, the row label of its largest total; .loc then
# pulls those rows, so despite its name this variable holds one full row
# (profile, title, minutes) per profile.
most_watched_indices = profile_title_watch_time.loc[
    profile_title_watch_time.groupby("Profile Name")["Duration_minutes"].idxmax()
]

print("The longest (re)watched movie for each user profile:")
print("-" * 60)

# Print the most watched title and its total time for each profile.
for index, row in most_watched_indices.iterrows():
    profile = row["Profile Name"]
    title = row["Title"]
    duration = format_minutes_to_hh_mm(row["Duration_minutes"])
    print(f"Profile: {profile}")
    print(f"  Most watched: {title}")
    # The original message ended in a stray, unbalanced ")".
    print(f"  Total time: {duration} hours")
    print("-" * 60)

In [None]:
def parse_duration(duration_str):
    """Convert a clock-style duration string into a number of seconds.

    Accepted layouts are ``"HH:MM:SS"``, ``"MM:SS"`` and ``"SS"``.

    Args:
        duration_str: The duration text; may be missing (``None``/``NaN``).

    Returns:
        int: Total seconds, or 0 when the value is missing, cannot be
        parsed as integer fields, or has more than three fields.
    """
    if pd.isna(duration_str):
        return 0
    try:
        parts = [int(part) for part in duration_str.split(":")]
    except (AttributeError, TypeError, ValueError):
        # Only parsing failures are absorbed (non-string input, non-numeric
        # fields). A bare `except:` would also swallow KeyboardInterrupt and
        # SystemExit, making a long-running .apply() impossible to interrupt.
        return 0
    if len(parts) == 3:
        return parts[0] * 3600 + parts[1] * 60 + parts[2]
    if len(parts) == 2:
        return parts[0] * 60 + parts[1]
    if len(parts) == 1:
        return parts[0]
    return 0


# Safety net: rebuild the numeric duration columns if they are not there yet.
if "Duration_minutes" not in df.columns and "Duration" in df.columns:
    df["Duration_seconds"] = df["Duration"].apply(parse_duration)
    df["Duration_minutes"] = df["Duration_seconds"] / 60

# Derive the show name and a Movie / TV Show label unless both columns
# already exist.
if not {"Main_Show_Title", "Content Type"}.issubset(df.columns):

    def get_show_title(title):
        """Return the text before the first ':' in *title*, stripped.

        Yields NaN when *title* is not a string or contains no ':'.
        """
        if not isinstance(title, str) or ":" not in title:
            return np.nan
        return title.partition(":")[0].strip()

    df["Main_Show_Title"] = df["Title"].apply(get_show_title)

    # Heuristic: any title containing ':' is labelled a TV show.
    # NOTE(review): a film whose own name contains a colon would also be
    # labelled "TV Show" -- confirm this is acceptable for the analysis.
    df["Content Type"] = "Movie"
    df.loc[df["Main_Show_Title"].notna(), "Content Type"] = "TV Show"


# --- Per-profile ranking of TV series by total minutes watched ---
is_tv_show = df["Content Type"] == "TV Show"
tv_shows_only_df = df[is_tv_show].copy()

if not tv_shows_only_df.empty:
    # Total minutes for every (profile, show) pair, as a flat table.
    minutes_by_profile_and_show = tv_shows_only_df.groupby(
        ["Profile Name", "Main_Show_Title"]
    )["Duration_minutes"].sum()
    profile_show_watch_time = minutes_by_profile_and_show.reset_index()

    # Profiles alphabetically; within a profile, longest-watched show first.
    profile_show_watch_time_sorted = profile_show_watch_time.sort_values(
        by=["Profile Name", "Duration_minutes"], ascending=[True, False]
    )

    # --- Report: the five most watched shows of each profile ---
    print("\nTop 5 Most Watched TV Shows for Each User Profile (by total time):")
    print("=" * 80)

    for profile in profile_show_watch_time_sorted["Profile Name"].unique():
        print(f"Profile: {profile}")
        # Underline as wide as the "Profile: <name>" heading.
        print("-" * (len(profile) + 9))

        # The table is already sorted, so the first five rows of this
        # profile are its top five shows.
        belongs_to_profile = profile_show_watch_time_sorted["Profile Name"] == profile
        top_5_shows_for_profile = profile_show_watch_time_sorted[
            belongs_to_profile
        ].head(5)

        if not top_5_shows_for_profile.empty:
            for show_title, minutes_watched in zip(
                top_5_shows_for_profile["Main_Show_Title"],
                top_5_shows_for_profile["Duration_minutes"],
            ):
                duration = format_minutes_to_hh_mm(minutes_watched)
                print(f"  - {show_title}: {duration} hours")
        else:
            print("  No TV show watching data for this profile.")

        print("=" * 80)
else:
    print("\nNo TV shows found in the data based on the current identification logic.")