In [None]:
import pandas as pd
import altair as alt

In [None]:
# Disable Altair's row limit so the full dataset can be handed to the charts
# below, then read the Netflix catalogue from the CSV (path is relative to the
# working directory).
alt.data_transformers.disable_max_rows()
df = pd.read_csv("netflix_titles.csv")
# Preview only the first rows instead of rendering the entire frame.
df.head()

In [None]:
### simple visualization

# Base chart: horizontal bars, one per value of "type", with the bar length
# given by the number of rows of that type.
type_chart = alt.Chart(df).mark_bar().encode(
    y=alt.Y("type", axis=alt.Axis(title="Type")),
    x=alt.X("count()", axis=alt.Axis(title="Number of Titles")),
)

# Title and canvas size.
type_chart = type_chart.properties(
    title="Number of Titles on Netflix by Types",
    width=600,
    height=300,
)

# Chart-wide styling: red bars with a bar height of 50, larger axis/title
# fonts and un-rotated axis labels. The configured chart is the cell's last
# expression, so it is rendered as the cell output.
type_chart.configure_bar(
    color="#E50914",
    height=50,
).configure_axis(
    labelFontSize=15,
    titleFontSize=20,
    labelAngle=0,
).configure_title(
    fontSize=20,
)

In [None]:
### visualization 1

# Single out Movie titles and their durations; rows missing either value are dropped.
movie_durations = df.loc[df["type"] == "Movie", ["title", "duration"]].dropna()

# Keep only the length as an integer. The leading run of digits is extracted
# with a vectorised string operation rather than slicing off a fixed
# four-character suffix, so trailing whitespace or a differently spelled unit
# does not break the conversion. `assign` builds a new frame instead of
# writing into a filtered selection.
df1 = movie_durations.assign(
    duration=movie_durations["duration"]
    .str.extract(r"(\d+)", expand=False)
    .astype(int)
)

# Bar chart: x is the duration value, y is how many movies have that duration.
viz1 = (
    alt.Chart(df1)
    .mark_bar()
    .encode(
        x=alt.X("duration", axis=alt.Axis(title="Duration of the Movie")),
        y=alt.Y("count()", axis=alt.Axis(title="Number of Movies")),
    )
    # Title and canvas size.
    .properties(
        title="Number of Movies on Netflix by Duration",
        width=600,
        height=400,
    )
    # Chart-wide styling: red bars, larger fonts, un-rotated axis labels.
    .configure_bar(color="#E50914")
    .configure_axis(labelFontSize=15, titleFontSize=20, labelAngle=0)
    .configure_title(fontSize=20)
)

In [None]:
### visualization 2

# Single out Movie titles and the date each was added; rows missing either value are dropped.
movie_dates = df.loc[df["type"] == "Movie", ["title", "date_added"]].dropna()

# Keep only the year, taken from the last four characters of the date string.
# Surrounding whitespace is stripped first so a trailing space cannot shift
# the slice and break the integer conversion.
added_year = movie_dates["date_added"].str.strip().str[-4:].astype(int)
# Remove 2020 as it only covers January.
added_year = added_year[added_year < 2020]

# Count the titles added in each year, ordered by year. Years with no
# additions are filled in with 0 so that every difference computed below spans
# exactly one calendar year, which is what the "YY-YY" labels assume.
yearly_counts = added_year.value_counts().sort_index()
yearly_counts = yearly_counts.reindex(
    range(yearly_counts.index.min(), yearly_counts.index.max() + 1),
    fill_value=0,
)

# Rate of increase = change in the yearly count versus the previous year.
# The first year has no predecessor, so its NaN entry is dropped.
yearly_change = yearly_counts.diff().iloc[1:]

# Label each value with the interval it covers, e.g. 2009 -> "08-09".
df2 = pd.DataFrame(
    {
        "Year": [
            f"{str(year - 1)[-2:]}-{str(year)[-2:]}" for year in yearly_change.index
        ],
        "Rate": yearly_change.to_numpy(),
    }
)

# Line chart of the year-over-year change.
viz2 = (
    alt.Chart(df2)
    .mark_line()
    .encode(
        x=alt.X("Year", axis=alt.Axis(title="Year")),
        y=alt.Y("Rate", axis=alt.Axis(title="Rate of Increase")),
    )
    # Title and canvas size.
    .properties(
        title="Rate of Increase of Netflix Title Adds with Time",
        width=600,
        height=400,
    )
    # Chart-wide styling: red line, larger fonts, un-rotated axis labels.
    .configure_line(color="#E50914")
    .configure_axis(labelFontSize=15, titleFontSize=20, labelAngle=0)
    .configure_title(fontSize=20)
)

# Visualization 1: The distribution of movie lengths appears Gaussian.
This visualization shows that the distribution of movie lengths appears Gaussian. We can see the signature bell-shaped curve from the histogram despite some outliers. I chose the histogram for this visualization because it is commonly used to demonstrate the distribution of data.

The X axis is the duration of the movie, while the Y axis is the number of movies with that specific duration. Netflix's red is used for the bars for thematic purposes.

In [None]:
# Render the movie-duration bar chart (viz1) as this cell's output.
viz1

# Visualization 2: The Trend of Title Adds on Netflix Since 2008
This visualization shows that the rate at which Netflix adds titles to its platform stayed relatively flat from 2008 to 2013, saw a significant increase from 2013 to 2017, and has been gradually slowing down since. The data was cleaned and the rate of increase was calculated as the year-over-year difference in the number of titles added (a discrete derivative). Originally, I plotted the graph using the number of titles added each year. This was later changed because it did not generate an intuitive visualization, leading to increased processing time and possible false conclusions. The data from 2020 was removed because the year has not passed in full. I chose a line chart for this visualization because it is commonly used to showcase trends.
The X axis is the year, while the Y axis is the rate of increase of title adds. Netflix's red is again used for the line.

In [None]:
# Render the rate-of-increase line chart (viz2) as this cell's output.
viz2