In [None]:
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import os
import plotly.express as px
import os

In [None]:
# Location of the flixpatrol.csv file.  The FLIXPATROL_CSV environment
# variable overrides it, so the notebook can run on machines where the
# original absolute path does not exist; when the variable is unset the
# original path is used unchanged.
data_path = os.environ.get(
    "FLIXPATROL_CSV",
    "/Users/jaideepsai/Desktop/DATA-ANALYTICS/Data Source/extracted-data/Entertainment/flixpatrol.csv",
)
# Read the comma-separated file into the DataFrame every later cell works on.
dataset = pd.read_csv(data_path, delimiter=',')

In [None]:
# Preview the first rows of the freshly loaded dataset.
dataset.head()

In [None]:
# Dimensions of the dataset as a (rows, columns) tuple.
dataset.shape

In [None]:
# Distinct values present in the Genre column.
dataset['Genre'].unique()

In [None]:
# Number of missing values in each column.
dataset.isnull().sum()

In [None]:
# Label titles that have no genre as "Unknown".
# The filled column is assigned back instead of calling
# fillna(inplace=True) on dataset['Genre']: the in-place call on a column
# selection is deprecated chained assignment and leaves `dataset` untouched
# when pandas Copy-on-Write is active.
dataset['Genre'] = dataset['Genre'].fillna("Unknown")

In [None]:
# Remove every row whose Premiere value is missing; `dataset` is modified in place.
dataset.dropna(subset=['Premiere'], inplace=True)

In [None]:
# Re-count missing values per column after the Genre fill and the Premiere drop.
dataset.isnull().sum()

In [None]:
# Convert Watchtime to a numeric column.
# The values are cast to str before stripping thousands separators so the
# cell also works when Watchtime is already numeric (for example when the
# cell is run a second time); calling .str on a numeric column raises
# AttributeError.  Anything that still cannot be parsed becomes NaN
# (errors='coerce').
dataset['Watchtime'] = pd.to_numeric(
    dataset['Watchtime'].astype(str).str.replace(',', '', regex=False),
    errors='coerce',
)

In [None]:
# Preview the first rows again after the Watchtime conversion.
dataset.head()

In [None]:
# Data type of each column.
dataset.dtypes

In [None]:
def find_duplicates(df, column_name):
    """Report the rows of ``df`` whose ``column_name`` value occurs more than once.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; it is not modified.
    column_name : str
        Column whose repeated values mark a row as a duplicate.

    Returns
    -------
    tuple
        ``(found, count, rows)`` where ``found`` is True when at least one
        repeated value exists, ``count`` is the number of rows involved
        (every occurrence is counted, including the first), and ``rows`` is
        a DataFrame of those rows sorted by ``column_name``.
    """
    # keep=False flags every occurrence of a repeated value, not just the later ones.
    is_repeated = df.duplicated(subset=[column_name], keep=False)
    repeated_rows = df.loc[is_repeated].sort_values(by=column_name)
    repeated_count = len(repeated_rows)
    return repeated_count > 0, repeated_count, repeated_rows

In [None]:
# Check the Title column for repeated values; unpacks the found-flag, the number of rows involved, and those rows.
has_duplicates, num_duplicates, duplicate_rows_df = find_duplicates(dataset, 'Title')

In [None]:
# Print the duplicate flag and the duplicate row count on separate lines,
# then preview the first duplicated rows.
print(has_duplicates, num_duplicates, sep='\n')
duplicate_rows_df.head()

In [None]:
# Split the titles by content type.
tv_shows_df = dataset.loc[dataset['Type'].eq('TV Show')]
movies_df = dataset.loc[dataset['Type'].eq('Movie')]

# Row counts of each subset.
tv_shows_count = tv_shows_df.shape[0]
movies_count = movies_df.shape[0]

# Two-bar comparison of the counts.
type_fig, type_ax = plt.subplots(figsize=(8, 6))
type_ax.bar(
    ['TV Shows', 'Movies'],
    [tv_shows_count, movies_count],
    color=['skyblue', 'salmon'],
)
type_ax.set_xlabel('Type')
type_ax.set_ylabel('Count')
type_ax.set_title('Number of TV Shows vs Movies Over Years')
plt.show()

In [None]:
# Distinct values present in the Type column.
dataset['Type'].unique()

In [None]:
# Rank the TV shows by watchtime, highest first.
sorted_shows = dataset.loc[dataset['Type'].eq('TV Show')].sort_values('Watchtime', ascending=False)

# NOTE: despite its name, top_10_shows holds the five highest-ranked rows,
# which matches the "Top 5" chart title below.
top_10_shows = sorted_shows.iloc[:5]

# Horizontal bar chart, one bar per show, shaded by watchtime.
fig = px.bar(
    top_10_shows,
    x='Watchtime',
    y='Title',
    orientation='h',
    title='Top 5 TV Shows by Watchtime',
    labels={'Watchtime': 'Watchtime', 'Title': 'TV Show Title'},
    color='Watchtime',
    color_continuous_scale='blues',
)
# Order the bars by their total value along the category axis.
fig.update_layout(width=800, height=600, yaxis=dict(categoryorder='total ascending'))
fig.show()

In [None]:
# Rank the movies by watchtime, highest first.
sorted_movies = dataset.loc[dataset['Type'].eq('Movie')].sort_values('Watchtime', ascending=False)

# NOTE: despite its name, top_10_movies holds the five highest-ranked rows,
# which matches the "Top 5" chart title below.
top_10_movies = sorted_movies.iloc[:5]

# Horizontal bar chart, one bar per movie, shaded by watchtime.
fig = px.bar(
    top_10_movies,
    x='Watchtime',
    y='Title',
    orientation='h',
    title='Top 5 Movies by Watchtime',
    labels={'Watchtime': 'Watchtime', 'Title': 'Movie Title'},
    color='Watchtime',
    color_continuous_scale='balance',
)

# Fix the figure size and order the bars by their total value.
fig.update_layout(width=800, height=600, yaxis=dict(categoryorder='total ascending'))

# Render the chart.
fig.show()

In [None]:
# Count how often each Premiere value occurs and lay the result out as a
# two-column frame (Year, Count) ordered by year.
premiere_counts = (
    dataset['Premiere']
    .value_counts()
    .rename_axis('Year')
    .reset_index(name='Count')
    .sort_values(by='Year')
)

# Bar chart of occurrences per premiere year, shaded by count.
fig = px.bar(
    premiere_counts,
    x='Year',
    y='Count',
    title='Occurrences of Premieres by Year',
    labels={'Year': 'Premiere Year', 'Count': 'Occurrences'},
    color='Count',
    color_continuous_scale='amp',
)

# Fix the figure size.
fig.update_layout(width=800, height=600)

# Render the chart.
fig.show()

In [None]:
# Count how often each genre occurs and lay the result out as a two-column
# frame (Genre, Count) ordered by ascending count.
genre_counts = (
    dataset['Genre']
    .value_counts()
    .rename_axis('Genre')
    .reset_index(name='Count')
    .sort_values(by='Count')
)

# Bar chart of occurrences per genre, shaded by count.
fig = px.bar(
    genre_counts,
    x='Genre',
    y='Count',
    title='Occurrences of Genre',
    labels={'Genre': 'Genre', 'Count': 'Occurrences'},
    color='Count',
    color_continuous_scale='blues',
)

# Fix the figure size.
fig.update_layout(width=800, height=600)

# Render the chart.
fig.show()

In [None]:
# Keep only the rows whose Premiere value equals 2019.
df_2019 = dataset.loc[dataset['Premiere'].eq(2019)]

# Preview the first rows of the 2019 subset.
df_2019.head()

In [None]:
# Genre frequencies within the 2019 subset as a (Genre, Count) frame.
genre_counts_2019 = (
    df_2019['Genre']
    .value_counts()
    .rename_axis('Genre')
    .reset_index(name='Count')
)

# Single-level sunburst: one sector per genre, sized by its count.
fig = px.sunburst(
    genre_counts_2019,
    path=['Genre'],
    values='Count',
    title='Occurrences of Genres in 2019',
)

# Fix the figure size.
fig.update_layout(width=800, height=600)

# Render the chart.
fig.show()

In [None]:
# Number of titles per genre across the whole dataset.
total_genre_counts = dataset['Genre'].value_counts()

# Number of titles per genre within the 2019 subset.
genre_counts_2019 = df_2019['Genre'].value_counts()

# Share of each genre's titles that belong to 2019, in percent.
# The division aligns the two Series on their genre labels, so a genre with
# no 2019 titles comes out as NaN; those entries are dropped so only genres
# with a defined percentage reach the chart.
genre_percentage_2019 = ((genre_counts_2019 / total_genre_counts) * 100).dropna()

# Lay the percentages out as a (Genre, Percentage) frame for plotting.
genre_percentage_df = genre_percentage_2019.reset_index()
genre_percentage_df.columns = ['Genre', 'Percentage']

# Pie chart with one slice per genre, sized by its percentage.
fig = px.pie(genre_percentage_df, values='Percentage', names='Genre',
             title='Percentage of Genre Counts in 2019 Relative to Whole Dataset')

# Fix the figure size.
fig.update_layout(width=800, height=600)

# Render the chart.
fig.show()

In [None]:
# Split the 2019 subset by content type.
tv_shows_2019 = df_2019.loc[df_2019['Type'].eq('TV Show')]
movies_2019 = df_2019.loc[df_2019['Type'].eq('Movie')]

# Row counts of each 2019 subset.
tv_shows_count = tv_shows_2019.shape[0]
movies_count = movies_2019.shape[0]

# Two-bar comparison of the 2019 counts.
type_2019_fig, type_2019_ax = plt.subplots(figsize=(8, 6))
type_2019_ax.bar(
    ['TV Shows', 'Movies'],
    [tv_shows_count, movies_count],
    color=['skyblue', 'salmon'],
)
type_2019_ax.set_xlabel('Type')
type_2019_ax.set_ylabel('Count')
type_2019_ax.set_title('Number of TV Shows vs Movies in 2019')
plt.show()

In [None]:
# NOTE(review): this cell repeats the genre-percentage pie chart that already
# appears earlier in the notebook; consider keeping only one of the two.

# Number of titles per genre across the whole dataset.
total_genre_counts = dataset['Genre'].value_counts()

# Number of titles per genre within the 2019 subset.
genre_counts_2019 = df_2019['Genre'].value_counts()

# Share of each genre's titles that belong to 2019, in percent.
# The division aligns the two Series on their genre labels, so a genre with
# no 2019 titles comes out as NaN; those entries are dropped so only genres
# with a defined percentage reach the chart.
genre_percentage_2019 = ((genre_counts_2019 / total_genre_counts) * 100).dropna()

# Lay the percentages out as a (Genre, Percentage) frame for plotting.
genre_percentage_df = genre_percentage_2019.reset_index()
genre_percentage_df.columns = ['Genre', 'Percentage']

# Pie chart with one slice per genre, sized by its percentage.
fig = px.pie(genre_percentage_df, values='Percentage', names='Genre',
             title='Percentage of Genre Counts in 2019 Relative to Whole Dataset')

# Fix the figure size.
fig.update_layout(width=800, height=600)

# Render the chart.
fig.show()

In [None]:
# Per premiere year: summed watchtime and number of (non-null) titles.
trends_over_time = (
    dataset.groupby('Premiere')
    .agg(**{'Total Watchtime': ('Watchtime', 'sum'), 'Count': ('Title', 'count')})
    .reset_index()
)

# Line chart of total watchtime per year.
watchtime_fig, watchtime_ax = plt.subplots(figsize=(14, 7), dpi=200)
sns.lineplot(
    data=trends_over_time,
    x='Premiere',
    y='Total Watchtime',
    marker='o',
    color='teal',
    label='Total Watchtime',
    ax=watchtime_ax,
)
watchtime_ax.set_title('Trends Over Time: Total Watchtime')
watchtime_ax.set_xlabel('Year')
watchtime_ax.set_ylabel('Total Watchtime')
watchtime_ax.grid(True)
watchtime_fig.tight_layout()
plt.show()

# Line chart of the number of titles per year.
titles_fig, titles_ax = plt.subplots(figsize=(14, 7), dpi=200)
sns.lineplot(
    data=trends_over_time,
    x='Premiere',
    y='Count',
    marker='s',
    color='purple',
    label='Number of Titles',
    ax=titles_ax,
)
titles_ax.set_title('Trends Over Time: Number of Titles')
titles_ax.set_xlabel('Year')
titles_ax.set_ylabel('Number of Titles')
titles_ax.grid(True)
titles_fig.tight_layout()
plt.show()

In [None]:
# Per genre: summed watchtime and number of (non-null) titles.
genre_popularity = (
    dataset.groupby('Genre')
    .agg(**{'Total Watchtime': ('Watchtime', 'sum'), 'Count': ('Title', 'count')})
    .reset_index()
)

# Per genre and premiere year: summed watchtime and title count.  The
# aggregated columns keep their source names (Watchtime, Title).
genre_yearly_data = (
    dataset.groupby(['Genre', 'Premiere'])
    .agg(Watchtime=('Watchtime', 'sum'), Title=('Title', 'count'))
    .reset_index()
)

# Genres ordered by total watchtime, largest first; only the first five are
# plotted to keep the charts readable.
genre_popularity_sorted = genre_popularity.sort_values('Total Watchtime', ascending=False)
top_genres = genre_popularity_sorted['Genre'].head(5)

# One watchtime line per selected genre.
genre_watch_fig, genre_watch_ax = plt.subplots(figsize=(14, 10), dpi=200)
for genre in top_genres:
    genre_data = genre_yearly_data.loc[genre_yearly_data['Genre'].eq(genre)]
    genre_watch_ax.plot(genre_data['Premiere'], genre_data['Watchtime'], marker='o', label=genre)

genre_watch_ax.set_title('Watchtime Trends by Genre Over Time')
genre_watch_ax.set_xlabel('Year')
genre_watch_ax.set_ylabel('Total Watchtime')
genre_watch_ax.legend()
genre_watch_ax.grid(True)
genre_watch_fig.tight_layout()

plt.show()

# One title-count line per selected genre.
genre_titles_fig, genre_titles_ax = plt.subplots(figsize=(14, 10), dpi=200)
for genre in top_genres:
    genre_data = genre_yearly_data.loc[genre_yearly_data['Genre'].eq(genre)]
    genre_titles_ax.plot(genre_data['Premiere'], genre_data['Title'], marker='s', label=genre)

genre_titles_ax.set_title('Number of Titles by Genre Over Time')
genre_titles_ax.set_xlabel('Year')
genre_titles_ax.set_ylabel('Number of Titles')
genre_titles_ax.legend()
genre_titles_ax.grid(True)
genre_titles_fig.tight_layout()

plt.show()