In [None]:
import os

# Print the full path of every file found under the Kaggle input directory,
# to confirm that the dataset is available to the notebook.
input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)


 # Listing files to make sure our file is imported

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns
# Apply matplotlib's 'ggplot' stylesheet to the figures drawn from here on.
plt.style.use('ggplot')


# Importing all the libraries we're going to use

In [None]:
# Load disney.csv from the Kaggle input directory into the DataFrame df.
dataset_path = '/kaggle/input/disney/disney.csv'
df = pd.read_csv(dataset_path)

Putting all the data in a variable called df

# Understanding the data (Shape , Values)

In [None]:
# (number of rows, number of columns) of the dataset as loaded from the CSV.
df.shape

In [None]:
# Preview the first rows (head() returns 5 rows by default).
df.head()

**The head() method in Pandas is used to display the first few rows of a DataFrame**

In [None]:
# Preview the last rows (tail() returns 5 rows by default).
df.tail()

**The tail() method in Pandas is used to display the last few rows of a DataFrame**

# Understanding the types of our data 

In [None]:
# Data type pandas inferred for each column when reading the CSV.
df.dtypes

The `dtypes` attribute lists the data type of each column, so you know what kind of data you're dealing with.

# Cleaning the data 
*                      removing all columns that we'll not use

In [None]:
# Keep only the columns used by the analyses below.
# The explicit .copy() makes df an independent frame, so later column
# assignments (e.g. df['listed_in'] = ...) modify it directly instead of
# writing into a slice of the original frame (SettingWithCopyWarning).
df = df[['show_id', 'type', 'title', 'director', 'cast',
         'date_added', 'duration', 'listed_in', 'description']].copy()

In [None]:
# Display the frame after the column selection (only the kept columns remain).
df

# The data left that will be used 

 # 1_Top 5 des meilleures catégories.

In [None]:
# Display df again as the starting point of the category analysis.
df

* **Treating the column 'listed_in' as a string, splitting it into its individual categories, then giving each category its own row with the function explode()**

In [None]:
# Build a one-row-per-category frame WITHOUT modifying df, so this cell can be
# re-run safely: splitting a column that already holds lists would first turn
# each list into its string representation and then split that into garbage.
df_cat = (
    df.dropna(subset=['listed_in'])  # a missing value would otherwise become the category 'nan'
      .assign(listed_in=lambda frame: frame['listed_in'].astype(str).str.split(','))
      .explode('listed_in')          # one row per (title, category) pair
)
# Last expression of the cell: rendered as a table by the notebook.
df_cat.head()



* df['listed_in'] = df['listed_in'].astype(str): This line converts the 'listed_in' column to strings, ensuring consistent data type across all entries.

* df['listed_in'] = df['listed_in'].apply(lambda x: x.split(',')): Here, each entry in the 'listed_in' column is split by commas, creating a list of categories.
 
* df_cat = df.explode('listed_in'): This line "explodes" the DataFrame, meaning it duplicates rows where multiple categories exist in the 'listed_in' column, creating a new DataFrame df_cat where each row corresponds to a single category.

In [None]:
# Display the exploded frame: one row per (title, category) pair.
df_cat

* **Counting the values of each category alone using value_counts() then listing the top 5 categories using head(5)**

In [None]:
# Trim the whitespace left around each label by the comma split.
df_cat['listed_in'] = df_cat['listed_in'].str.strip()
# value_counts() sorts by descending frequency, so the first five entries
# are the five most frequent categories.
top_categories = df_cat['listed_in'].value_counts().iloc[:5]
print("Top 5 des meilleures catégories :", top_categories, sep="\n")

* Removing leading and trailing whitespaces from the 'listed_in' column using str.strip().

* Counting the occurrences of each category using value_counts() and selecting the top 5 categories with .head(5).

* **Dropping duplicates**

In [None]:
# top_categories is indexed by category label, with the count as its value.
# De-duplicate on the label: Series.drop_duplicates() compares the VALUES
# (the counts), so it would silently discard any category whose count ties
# with another category's count.
top_categories = top_categories[~top_categories.index.duplicated(keep='first')]
print(top_categories.head(5))



* **Visualising the data using seaborn & matplotlib**

In [None]:
# Select the palette BEFORE the figure/axes are created: an Axes takes its
# colour cycle from the settings in force when it is created, so calling
# set_palette after plt.title() (which creates the Axes) would leave this
# first pie chart on the previous colour cycle.
sns.set_palette('bright')
plt.figure(figsize=(8, 6))
plt.title('Top 5 Catégories')
# Slice sizes are the counts; labels are the category names from the index.
plt.pie(top_categories, labels=top_categories.index, autopct='%1.1f%%', startangle=90, wedgeprops={'linewidth': 2, 'edgecolor': 'white'})
plt.show()

In [None]:
# Bar chart of the same counts: one bar per category, drawn into the figure
# opened just above.
plt.figure(figsize=(8, 4))
category_ax = top_categories.plot.bar()
category_ax.set_title('Top 5 Categories')
category_ax.set_xlabel('Category')
category_ax.set_ylabel('Count')
plt.xticks(rotation=0)  # keep the category names horizontal
plt.show()

This code generates a pie chart and a bar chart displaying the proportion of each category's occurrence. It's a concise and visually informative way to represent this data.

# 2_Top 5 des réalisateurs

* **Separating the directors**

In [None]:
# Work on an independent copy of the rows that have a director, so the column
# assignments below do not write into a slice of df (SettingWithCopyWarning).
df_filtered = df.dropna(subset=['director']).copy()
df_filtered['director'] = df_filtered['director'].astype(str)
# Entries with several directors are comma-separated: turn each into a list...
df_filtered['director'] = df_filtered['director'].apply(lambda x: x.split(','))
# ...then emit one row per (title, director) pair.
df_dir = df_filtered.explode('director')

* df_filtered = df.dropna(subset=['director']): This line removes rows where the 'director' column is null, creating a new DataFrame df_filtered.

* df_filtered['director'] = df_filtered['director'].astype(str): Converts the 'director' column to strings, ensuring consistent data type.

* df_filtered['director'] = df_filtered['director'].apply(lambda x: x.split(',')): Splits each entry in the 'director' column by commas, creating lists of directors.

* df_dir = df_filtered.explode('director'): Explodes the DataFrame df_filtered based on the 'director' column, creating a new DataFrame df_dir where each row corresponds to a single director.

* **Counting the titles directed by each director**

In [None]:
# Trim the whitespace left around each name by the comma split.
df_dir['director'] = df_dir['director'].str.strip()
# value_counts() sorts by descending frequency, so the first five entries
# are the five most frequent directors.
top_directors = df_dir['director'].value_counts().iloc[:5]
print("Top 5 des meilleures réalisateurs :", top_directors, sep="\n")

* Removing leading and trailing whitespaces from the 'director' column using str.strip().

* Counting the occurrences of each director using value_counts() and selecting the top 5 directors with .head(5)

* **Visualisation**

In [None]:
# Shared look of the pie: percentage labels, first slice starting at 90°,
# white borders between slices.
director_pie_style = {
    'autopct': '%1.1f%%',
    'startangle': 90,
    'wedgeprops': {'linewidth': 2, 'edgecolor': 'white'},
}
plt.figure(figsize=(8, 6))
plt.title('Top 5 Réalisateurs')
sns.set_palette('bright')
# Slice sizes are the counts; labels are the director names from the index.
plt.pie(top_directors, labels=top_directors.index, **director_pie_style)
plt.show()

In [None]:
# Bar chart of the same counts: one bar per director, drawn into the figure
# opened just above.
plt.figure(figsize=(8, 4))
director_ax = top_directors.plot.bar()
director_ax.set_title('Top 5 Réalisateurs')
director_ax.set_xlabel('Directors')
director_ax.set_ylabel('Count')
plt.xticks(rotation=0)  # keep the names horizontal
plt.show()

This code generates a pie and bar chart displaying the proportion of each director's contribution among the top 5 directors. It's a concise and visually informative way to represent this data.

# 3_Les dix meilleurs acteurs

* **Separating actors**

In [None]:
# Work on an independent copy of the rows that have a cast, so the column
# assignments below do not write into a slice of df (SettingWithCopyWarning).
df_filtered2 = df.dropna(subset=['cast']).copy()
df_filtered2['cast'] = df_filtered2['cast'].astype(str)
# The cast is a comma-separated list of names: turn each entry into a list...
df_filtered2['cast'] = df_filtered2['cast'].apply(lambda x: x.split(','))
# ...then emit one row per (title, actor) pair.
df_act = df_filtered2.explode('cast')

* df_filtered2 = df.dropna(subset=['cast']): This line removes rows where the 'cast' column is null, creating a new DataFrame df_filtered2.

* df_filtered2['cast'] = df_filtered2['cast'].astype(str): Converts the 'cast' column to strings, ensuring consistent data type.

* df_filtered2['cast'] = df_filtered2['cast'].apply(lambda x: x.split(',')): Splits each entry in the 'cast' column by commas, creating lists of actors.

* df_act = df_filtered2.explode('cast'): Explodes the DataFrame df_filtered2 based on the 'cast' column, creating a new DataFrame df_act where each row corresponds to a single actor.

* **Counting actors**

In [None]:
# Trim the whitespace left around each name by the comma split.
df_act['cast'] = df_act['cast'].str.strip()
# value_counts() sorts by descending frequency, so the first ten entries
# are the ten most frequent actors.
top_actors = df_act['cast'].value_counts().iloc[:10]
print("Top 10 des meilleures acteurs :", top_actors, sep="\n")

* **Visualisation**

In [None]:
# Shared look of the pie: percentage labels, first slice starting at 90°,
# white borders between slices.
actor_pie_style = {
    'autopct': '%1.1f%%',
    'startangle': 90,
    'wedgeprops': {'linewidth': 2, 'edgecolor': 'white'},
}
plt.figure(figsize=(8, 6))
plt.title('Top 10 acteurs')
sns.set_palette('bright')
# Slice sizes are the counts; labels are the actor names from the index.
plt.pie(top_actors, labels=top_actors.index, **actor_pie_style)
plt.show()

In [None]:
# Bar chart of the same counts: one bar per actor, drawn into the figure
# opened just above.
plt.figure(figsize=(8, 4))
actor_ax = top_actors.plot.bar()
actor_ax.set_title('Top 10 acteurs')
actor_ax.set_xlabel('Acteurs')
actor_ax.set_ylabel('Count')
plt.xticks(rotation=90)  # ten names do not fit horizontally
plt.show()

# 4_Les cinq meilleures séries télévisées avec le plus grand nombre de saisons.

* **gathering tv-shows only**

In [None]:
# Keep only the rows whose type is exactly "TV Show", then display them.
tv_shows = df.loc[df["type"].eq("TV Show")]
tv_shows

**Putting all the tv shows in a new variable called tv_shows**

* **Knowing their shape**

In [None]:
# (rows, columns) of the TV-show subset; the row count is the number of TV shows.
tv_shows.shape

* **398 TV shows**

* **Cleaning the data as wanted (removing "seasons" and converting numbers to int)**

In [None]:
# Number of seasons per TV-show title, as integers.
# The season count is parsed BEFORE aggregating: taking max() on the raw
# strings compares them alphabetically, so "9 Seasons" would beat "10 Seasons".
season = (
    tv_shows
    # Pull the leading number out of strings such as "1 Season" / "12 Seasons".
    .assign(duration=pd.to_numeric(
        tv_shows["duration"].str.extract(r"(\d+)", expand=False)
    ))
    # Rows without a usable duration cannot be ranked; drop them rather than
    # letting the integer conversion fail.
    .dropna(subset=["duration"])
    .astype({"duration": int})
    # If a title appears several times, keep its largest season count.
    .groupby("title")["duration"].max()
    .reset_index()
)


* tv_shows.groupby("title")["duration"].max().reset_index(): This groups the TV shows by their titles and finds the maximum value of the "duration" column within each group. It then resets the index to convert the result into a DataFrame.

* season["duration"].str.replace(" Season", ""): This line removes the string " Season" from the values in the "duration" column.

* season["duration"].str.replace("s", ""): This line removes the letter "s" from the values in the "duration" column, which might be used to denote plural form.

* season["duration"].astype(int): This converts the values in the "duration" column to integers, ensuring they can be used for numerical operations.

* **Counting duration**

In [None]:
# Rank the shows from most to fewest seasons and keep the first five rows.
top_tv_shows = season.sort_values(by="duration", ascending=False).iloc[:5]
top_tv_shows

This sorts the DataFrame season based on the values in the "duration" column in descending order.

In [None]:
# Shared look of the pie: percentage labels, first slice starting at 90°,
# white borders between slices.
show_pie_style = {
    'autopct': '%1.1f%%',
    'startangle': 90,
    'wedgeprops': {'linewidth': 2, 'edgecolor': 'white'},
}
plt.figure(figsize=(8, 6))
plt.title('5 longest tv-shows')
sns.set_palette('bright')
# Slice sizes are the season counts; labels are the show titles.
plt.pie(top_tv_shows["duration"], labels=top_tv_shows["title"], **show_pie_style)
plt.show()

# 6_Identifier des contenus similaires en faisant correspondre des caractéristiques textuelles

**Import itertools and SequenceMatcher from difflib**

In [None]:
import itertools
from difflib import SequenceMatcher

**Remove the rows with a null description, then compare the descriptions of every pair of titles**

In [None]:
# Pairs of titles whose descriptions are more similar than this ratio
# (0 = nothing in common, 1 = identical) are reported.
SIMILARITY_THRESHOLD = 0.60

# Use a separate frame for the rows that have a description. Overwriting df
# here would silently remove those titles from every later analysis as well.
df_described = df.dropna(subset=["description"])

similarities = []
title_desc_pairs = itertools.combinations(
    df_described[["title", "description"]].itertuples(index=False), 2
)
for (title1, desc1), (title2, desc2) in title_desc_pairs:
    matcher = SequenceMatcher(None, desc1, desc2)
    # real_quick_ratio() and quick_ratio() are cheap upper bounds on ratio():
    # when either is already at or below the threshold, the exact (expensive)
    # ratio cannot exceed it, so the pair can be skipped without changing the result.
    if matcher.real_quick_ratio() <= SIMILARITY_THRESHOLD:
        continue
    if matcher.quick_ratio() <= SIMILARITY_THRESHOLD:
        continue
    similarity_ratio = matcher.ratio()
    if similarity_ratio > SIMILARITY_THRESHOLD:
        similarities.append((title1, desc1, title2, desc2, similarity_ratio))

similarities_df = pd.DataFrame(similarities, columns=["Title 1", "Description 1", "Title 2", "Description 2", "Similarity Ratio"])

similarities_df

* itertools.combinations(df[["title", "description"]].itertuples(index=False), 2): This generates combinations of pairs of titles and descriptions using the itertools.combinations() function. It creates pairs of tuples containing the titles and descriptions of different titles from the DataFrame.

* SequenceMatcher(None, desc1, desc2).ratio(): This computes the similarity ratio between the descriptions of two titles using the SequenceMatcher class from the difflib module. The ratio ranges from 0 to 1, where 1 means the sequences are identical.

* if similarity_ratio > 0.60: This condition checks if the similarity ratio is greater than 0.60, indicating a significant similarity between the descriptions.

* (title1, desc1, title2, desc2, similarity_ratio): If the similarity ratio is above 0.60, this tuple containing the titles, descriptions, and similarity ratio is appended to the similarities list.

* similarities_df = pd.DataFrame(similarities, columns=["Title 1", "Description 1", "Title 2", "Description 2", "Similarity Ratio"]): Finally, this line creates a DataFrame similarities_df from the list of similarities, with appropriate column names.

# 7_Disney se concentre-t-il davantage sur les séries télévisées que sur les films ces dernières années ?

In [None]:
# Add the year in which each title was added.
# assign() returns a new frame, so no column is written into a frame that was
# itself derived from another one (SettingWithCopyWarning).
# The nullable 'Int64' dtype keeps the years as whole numbers even if some
# date_added values are missing (plain .dt.year would then yield floats such
# as 2019.0, which would also show up as the bar-chart labels).
df = df.assign(
    year_added=pd.to_datetime(df['date_added']).dt.year.astype('Int64')
)
# Count titles per (year, type) and pivot the type into columns; year/type
# combinations with no title get 0. Rows without a year are left out by groupby.
df_type = df.groupby(['year_added', 'type']).size().unstack(fill_value=0)
df_type

* pd.to_datetime(df['date_added']).dt.year: This converts the 'date_added' column to datetime format and extracts the year component.

* groupby(['year_added', 'type']).size(): This groups the DataFrame by 'year_added' and 'type', and returns the size of each group.

* unstack(fill_value=0): This rearranges the grouped data, pivoting the 'type' index level into columns. Missing values are filled with 0 using fill_value=0.

* **Visualisation**

In [None]:
# DataFrame.plot opens its own figure unless it is handed an Axes, so a bare
# plt.figure(figsize=...) would leave an empty figure behind and its size would
# not apply to the chart. Create the Axes explicitly and draw into it.
fig, ax = plt.subplots(figsize=(10, 6))
# One stacked bar per year: the segments are the counts of each content type.
df_type.plot(kind='bar', stacked=True, ax=ax)
ax.set_title("Nombre de séries télévisées et de films ajoutés par année")
ax.set_xlabel("Année")
ax.set_ylabel("Nombre de contenus ajoutés")
plt.xticks(rotation=45)
ax.legend(title='Type')
fig.tight_layout()
plt.show()

**After visualization, we notice that indeed Disney has been focusing more on movies in recent years rather than series.**