In [10]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
from scipy.stats import linregress
import numpy as np
import seaborn as sns

# Study data files
netflix_path = './netflix_titles.csv'

netflix_df = pd.read_csv(netflix_path)
netflix_df = netflix_df.dropna(axis=1)
netflix_sortbyyear_df = netflix_df.sort_values(by=['release_year'], ascending=False)
netflix_sortbyyear_df

df_2019 = netflix_sortbyyear_df.loc[netflix_sortbyyear_df['release_year'] == 2019]
df_2020 = netflix_sortbyyear_df.loc[netflix_sortbyyear_df['release_year'] == 2020]
df_2021 = netflix_sortbyyear_df.loc[netflix_sortbyyear_df['release_year'] == 2021]

country_2019 = pd.DataFrame(df_2019['country'].value_counts())
country_2020 = pd.DataFrame(df_2020['country'].value_counts())
country_2021 = pd.DataFrame(df_2021['country'].value_counts())

netflix_df.head()
pd.Timedelta(netflix_df.loc[9, 'duration'])
netflix_df

KeyError: 'country'

In [8]:
print(f"Number of Shows in 2019: {df_2019.loc[df_2019['country'] > 5].sum()[0]}")
print(f"Number of Shows in 2020: {df_2020.loc[df_2020['country'] > 5].sum()[0]}")
print(f"Number of Shows in 2021: {df_2021.loc[df_2021['country'] > 5].sum()[0]}")

fig1, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(10,4))

df1=(df_2019.loc[df_2019['country'] > 5])
ax1.bar(df1.index, df1['country'], color='black')
ax1.tick_params(labelrotation=90)
ax1.set_ylabel("Number of Shows", size=15)
ax1.set_title("2019")

df2=(df_2020.loc[df_2020['country'] > 5])
ax2.bar(df2.index, df2['country'], color='r')
ax2.tick_params(labelrotation=90)
ax2.set_xlabel("Country", size=15)
ax2.set_title("2020")

df3=(df_2021.loc[df_2021['country'] > 5])
ax3.bar(df3.index, df3['country'], color='b')
ax3.tick_params(labelrotation=90)
ax3.set_title("2021")

fig1.tight_layout()

plt.savefig("Country_Production.png")

KeyError: 'country'

In [None]:
fig2, (ax4, ax5, ax6) = plt.subplots(1, 3, figsize=(10,3))

rating_2019 = df_2019.groupby('rating').sum()/1047861*100
rating_2020 = df_2020.groupby('rating').sum()/892840*100
rating_2021 = df_2021.groupby('rating').sum()/325381*100

new_index = ['G', 'PG', 'PG-13', 'R', 'TV-14', 'TV-G', 'TV-MA', 'TV-PG', 'TV-Y', 'TV-Y7']

rating_2020 = pd.DataFrame(rating_2020['release_year'], index=new_index)
rating_2021 = pd.DataFrame(rating_2021['release_year'], index=new_index)
rating_2020 = rating_2020.fillna(0)
rating_2021 = rating_2021.fillna(0)

ax4.bar(rating_2019.index, rating_2019['release_year'], color='black')
ax4.set_ylim(0, 50)
ax4.set_ylabel("Percentage (%)", size=15)
ax4.tick_params(labelrotation=90, labelsize=12)
ax4.set_title("2019")

ax5.bar(rating_2020.index, rating_2020['release_year'], color='r')
ax5.set_xlabel("Movie Rating", size=15)
ax5.set_ylim(0, 50)
ax5.tick_params(labelrotation=90, labelsize=12)
ax5.set_title("2020")

ax6.bar(rating_2021.index, rating_2021['release_year'], color='b')
ax6.set_ylim(0, 50)
ax6.tick_params(labelrotation=90, labelsize=12)
ax6.set_title("2021")

fig2.tight_layout()
plt.savefig("Rating.png")
plt.show()



In [None]:
temp_df = pd.merge(rating_2019, rating_2020, left_index=True, right_index=True)
merged_df = pd.merge(temp_df, rating_2021, left_index=True, right_index=True)
merged_df = merged_df.rename(columns={'release_year_x': '2019',
                                       'release_year_y': '2020',
                                       'release_year': '2021'})

print("\t*Percentage of Rating from 2019 to 2021*\n")
print("\tRating\t 2019\t 2020\t 2021")
print("\t-------------------------------")
for index, row in merged_df.iterrows():
    print(f"\t{index}\t {round(row['2019'], 3)}\t {round(row['2020'], 3)}\t {round(row['2021'], 3)}")

In [None]:

targetCountries = [
    "United States",
    "United Kingdom",
    "Hong Kong",
    "China",
    "France",
    "India",
    "South Korea",
    "Thailand",
    "Australia",
    "Canada"
]


for country in targetCountries:
    
    temp_df = movie_df.loc[movie_df['country'] == country]
    
    groupby_df_movie = temp_df.loc[temp_df['type'] == 'Movie'].groupby('release_year').count()
    groupby_df_tv = temp_df.loc[temp_df['type'] == 'TV Show'].groupby('release_year').count()
    
    plt.scatter(groupby_df_movie.index, groupby_df_movie['show_id'])
    plt.scatter(groupby_df_tv.index, groupby_df_tv['show_id'])
    plt.xlabel("Year", size=15)
    plt.ylabel("Count", size=15)
    