In [None]:
%pip install squarify
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
import matplotlib
import squarify 
from mpl_toolkits.mplot3d import Axes3D


In [None]:
matplotlib.use('TkAgg')

df = pd.read_csv('NetflixMovies_dataset.csv')

print(df.head())

print("Columns:", df.columns.tolist())

                      Title Name    Language Released Rating    IMDb
0                          "Sr."     English     2022      R     NaN
1                         #Alive      Korean     2020  TV-MA  6.2/10
2              #FriendButMarried  Indonesian     2018   TV-G  7.1/10
3            #FriendButMarried 2  Indonesian     2020   TV-G     NaN
4  #OOTD: Outfit of the Designer  Indonesian     2024  TV-14     NaN
Columns: ['Title Name', 'Language', 'Released', 'Rating', 'IMDb']


In [None]:
df['IMDb'] = df['IMDb'].astype(str).str.extract(r'([\d,\.]+)')[0].str.replace(',', '.')
df['IMDb'] = pd.to_numeric(df['IMDb'], errors='coerce')

df_valid = df.dropna(subset=['IMDb'])

top10_imdb = df_valid.sort_values(by='IMDb', ascending=False).head(10)

plt.figure(figsize=(12,6))
sns.barplot(x='IMDb', y='Title Name', data=top10_imdb, palette='viridis')
plt.title('Top 10 Movies/TV Shows by IMDb Rating')
plt.xlabel('IMDb Rating')
plt.ylabel('Title')
plt.tight_layout()
plt.show()



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x='IMDb', y='Title Name', data=top10_imdb, palette='viridis')


In [None]:
# Distribution of Movies/TV Shows by Language
language_counts = df['Language'].value_counts().head(10)  
plt.figure(figsize=(12,6))
sns.barplot(x=language_counts.index, y=language_counts.values, palette='viridis')
plt.title('Top 10 Languages of Movies/TV Shows on Netflix')
plt.xlabel('Language')
plt.ylabel('Number of Movies/TV Shows')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=language_counts.index, y=language_counts.values, palette='viridis')


In [None]:
#/////////////////////POSTTTTTTT////////////////////////////////////////////
# Distribution of Ratings
plt.figure(figsize=(12,6))
sns.histplot(df['Rating'], bins=10, kde=True, color='purple')
plt.title('Distribution of Ratings for Movies/TV Shows')
plt.xlabel('Rating')
plt.ylabel('Frequency')
plt.tight_layout()
plt.show()


In [None]:
#/////////////////////POSTTTTTTT////////////////////////////////////////////
# Trend of Movies/TV Shows Released Over the Years
df['Released'] = pd.to_datetime(df['Released'], errors='coerce')
df['Year'] = df['Released'].dt.year
yearly_releases = df.groupby('Year')['Title Name'].count().reset_index()
plt.figure(figsize=(12,6))
plt.plot(yearly_releases['Year'], yearly_releases['Title Name'], marker='o', color='blue')
plt.title('Trend of Movies/TV Shows Released Over the Years')
plt.xlabel('Year')
plt.ylabel('Number of Releases')
plt.xticks(yearly_releases['Year'], rotation=45)
plt.tight_layout()
plt.show()


In [50]:
rating_counts = df['Rating'].value_counts()
plt.figure(figsize=(12,6))
sns.barplot(x=rating_counts.index, y=rating_counts.values, palette='viridis')
plt.title('Distribution of Movies/TV Shows by Rating')
plt.xlabel('Rating')
plt.ylabel('Number of Titles')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=rating_counts.index, y=rating_counts.values, palette='viridis')


In [None]:
language_counts = df['Language'].value_counts()

plt.figure(figsize=(10, 8))
plt.pie(language_counts, 
        labels=language_counts.index, 
        autopct='%1.1f%%', 
        startangle=140, 
        colors=sns.color_palette("Set2", len(language_counts)))
plt.title('Distribution of Titles by Language')
plt.axis('equal')  
plt.show()


In [None]:
#/////////////////////POSTTTTTTT////////////////////////////////////////////

df['Released'] = pd.to_datetime(df['Released'], errors='coerce')
df['Year'] = df['Released'].dt.year

titles_per_year = df.groupby('Year')['Title Name'].count()
avg_imdb_per_year = df.groupby('Year')['IMDb'].mean()

fig, ax1 = plt.subplots(figsize=(12, 6))

ax1.bar(titles_per_year.index, titles_per_year.values, color='skyblue', label='Number of Titles')
ax1.set_xlabel('Year')
ax1.set_ylabel('Number of Titles', color='skyblue')
ax1.tick_params(axis='y', labelcolor='skyblue')

ax2 = ax1.twinx()
ax2.plot(avg_imdb_per_year.index, avg_imdb_per_year.values, color='red',
         marker='o', linewidth=2, label='Average IMDb Rating')
ax2.set_ylabel('Average IMDb Rating', color='red')
ax2.tick_params(axis='y', labelcolor='red')

plt.title('Titles Released & Average IMDb Rating by Year')
fig.tight_layout()
plt.show()


In [None]:
df['IMDb'] = pd.to_numeric(df['IMDb'].astype(str).str.extract(r'(\d+\.?\d*)')[0], errors='coerce')
df['Released'] = pd.to_datetime(df['Released'], errors='coerce')
df['Year'] = df['Released'].dt.year


top_languages = df['Language'].value_counts().head(3).index.tolist()

plt.figure(figsize=(12, 6))
for lang in top_languages:
    lang_data = df[df['Language'] == lang]
    lang_yearly = lang_data.groupby('Year')['IMDb'].mean().dropna()
    plt.plot(lang_yearly.index, lang_yearly.values, marker='o', linewidth=2, label=lang)

plt.title('IMDb Rating Trend for Top 3 Languages')
plt.xlabel('Year')
plt.ylabel('Average IMDb Rating')
plt.legend(title='Language')
plt.tight_layout()
plt.show()


In [None]:
language_counts = df['Language'].value_counts()

# Create the treemap
plt.figure(figsize=(12,8))
squarify.plot(sizes=language_counts.values, 
              label=language_counts.index, 
              color=sns.color_palette("Set2", len(language_counts)),
              alpha=0.8)
plt.title("Treemap of Titles by Language", fontsize=16)
plt.axis('off')  # Hide the axes for a cleaner look
plt.show()


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [None]:
df['IMDb'] = pd.to_numeric(df['IMDb'].astype(str).str.extract(r'(\d+\.?\d*)')[0], errors='coerce')

lang_summary = df.groupby('Language').agg({'IMDb': 'mean', 'Title Name': 'count'}).reset_index()
lang_summary.rename(columns={'Title Name': 'Count'}, inplace=True)

# Create bubble chart
plt.figure(figsize=(12,8))
scatter = sns.scatterplot(data=lang_summary, 
                          x='IMDb', 
                          y='Count', 
                          size='Count', 
                          sizes=(100, 2000), 
                          alpha=0.7, 
                          hue='Language', 
                          legend=False)

for index, row in lang_summary.iterrows():
    plt.text(row['IMDb'] + 0.02, row['Count'], row['Language'], 
             horizontalalignment='left', size='medium', color='black', weight='semibold')

plt.title("Languages: Average IMDb Rating vs. Count of Titles", fontsize=16)
plt.xlabel("Average IMDb Rating")
plt.ylabel("Count of Titles")
plt.tight_layout()
plt.show()


In [None]:
#/////////////////////POSTTTTTTT////////////////////////////////////////////

df['IMDb'] = pd.to_numeric(df['IMDb'].astype(str).str.extract(r'(\d+\.?\d*)')[0], errors='coerce')

df['Released'] = pd.to_datetime(df['Released'], errors='coerce')
df['Year'] = df['Released'].dt.year

top_languages = df['Language'].value_counts().head(3).index.tolist()

lang_to_z = {lang: i for i, lang in enumerate(top_languages)}

fig = plt.figure(figsize=(12,8))
ax = fig.add_subplot(111, projection='3d')

for lang in top_languages:
    lang_data = df[df['Language'] == lang]
    yearly_avg = lang_data.groupby('Year')['IMDb'].mean().dropna()
    x = yearly_avg.index  
    y = yearly_avg.values 
    z = [lang_to_z[lang]] * len(yearly_avg) 
    ax.plot(x, y, z, marker='o', linewidth=2, label=lang)

ax.set_zticks(list(lang_to_z.values()))
ax.set_zticklabels(list(lang_to_z.keys()))

ax.set_xlabel('Viti')
ax.set_ylabel('Mesatarja e IMDb')
ax.set_zlabel('Gjuha')
plt.title("Trend 3D i Mesatares së IMDb sipas Gjuhës për Top 3 Gjuhët")

plt.legend(title='Gjuha')
plt.tight_layout()
plt.show()


  plt.tight_layout()
