# Importar Librerías

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

# Cargar Dataset

In [2]:
# Read the IMDb top-movies CSV from the Kaggle input directory into the
# DataFrame `data`, which every later cell works on.
data = pd.read_csv('/kaggle/input/imdb-top-250-movies/imdb_top_movies.csv')

# Analizar Datos

In [None]:
# Display the whole DataFrame.
data

In [None]:
# First rows (pandas default: 5).
data.head()

In [None]:
# Last rows (pandas default: 5).
data.tail()

In [None]:
# Summary statistics of the columns.
data.describe()

In [None]:
# Column names, dtypes and non-null counts.
data.info()

In [None]:
# (number of rows, number of columns)
data.shape

In [None]:
# Number of missing values in each column.
data.isnull().sum()

In [None]:
# Number of rows that are exact duplicates of an earlier row.
data.duplicated().sum()

In [None]:
# Frequency of each distinct value in 'Rank'.
data['Rank'].value_counts()

In [None]:
# Frequency of each distinct value in 'Title'.
data['Title'].value_counts()

In [None]:
# Frequency of each distinct value in 'Year'.
data['Year'].value_counts()

In [None]:
# Frequency of each distinct value in 'Rating'.
data['Rating'].value_counts()

In [None]:
# Highest rating in the dataset.
data['Rating'].max()

In [None]:
# All rows whose rating equals the highest rating.
data[data['Rating'] == data['Rating'].max()]

In [None]:
# Lowest rating in the dataset.
data['Rating'].min()

In [None]:
# All rows whose rating equals the lowest rating.
data[data['Rating'] == data['Rating'].min()]

In [None]:
# Arithmetic mean of the ratings.
data['Rating'].mean()

In [None]:
# Median of the ratings.
data['Rating'].median()

In [None]:
data['Duration'].max()

In [None]:
data[data['Duration'] == data['Duration'].max()]

In [None]:
data['Duration'].min()

In [None]:
data[data['Duration'] == data['Duration'].min()]

In [None]:
def duration_to_minutes(duration_str):
    """Convert a duration label such as ``'2h 22m'`` into total minutes.

    Parameters
    ----------
    duration_str : str
        Text with an optional hours part terminated by ``h`` and an
        optional minutes part terminated by ``m`` (``'2h 22m'``, ``'2h'``,
        ``'45m'``). Text containing neither marker yields 0.

    Returns
    -------
    int or float
        Total minutes as an ``int``. If ``duration_str`` is not a string
        (for example ``None`` or the float NaN pandas uses for an empty
        cell), ``nan`` is returned so the missing value propagates instead
        of crashing ``Series.apply``.

    Raises
    ------
    ValueError
        If the text before ``h`` or ``m`` is not an integer.
    """
    # ``'h' in nan`` raises TypeError, so bail out before any substring test.
    if not isinstance(duration_str, str):
        return float('nan')

    hours = 0
    minutes = 0
    if 'h' in duration_str:
        # Text before the first 'h' is the hour count; the minutes, if any,
        # are looked for only in the piece that follows it.
        hours_text, remainder = duration_str.split('h')[:2]
        hours = int(hours_text.strip())
        if 'm' in remainder:
            minutes = int(remainder.split('m')[0].strip())
    elif 'm' in duration_str:
        # Minutes-only label such as '45m'.
        minutes = int(duration_str.split('m')[0].strip())
    return hours * 60 + minutes

# Add a numeric column with each movie's duration converted to minutes.
data['Duration_minutes'] = data['Duration'].apply(duration_to_minutes)
data['Duration_minutes']

In [None]:
# Summary statistics of the duration in minutes.
duration_stats = data['Duration_minutes'].describe()
duration_stats

In [None]:
# Frequency of each distinct value in 'Certificate'.
data['Certificate'].value_counts()

In [None]:
# Frequency of each distinct value in 'Genres'.
data['Genres'].value_counts()

In [None]:
# Number of rows with no movie URL.
data['Movie URL'].isnull().sum()

In [None]:
# Number of rows with no image URL.
data['Image URL'].isnull().sum()

# Groupby

In [None]:
yearly_ratings = data.groupby('Year')['Rating'].mean()
yearly_ratings

In [None]:
rating_counts = data.groupby('Rating').size().reset_index(name='Count')
rating_counts

In [None]:
yearly_counts = data.groupby('Year').size().reset_index(name='Count')
yearly_counts

In [None]:
yearly_rating_counts = data.groupby(['Year', 'Rating']).size().reset_index(name='Count')
yearly_rating_counts

In [None]:
yearly_certificate_counts = data.groupby(['Year', 'Certificate']).size().reset_index(name='Count')
yearly_certificate_counts

In [None]:
genre_yearly_avg_rating = data.groupby(['Genres', 'Year'])['Rating'].mean().reset_index()
genre_yearly_avg_rating

# Tabla dinámica

In [None]:
pivot_table_avg_rating = data.pivot_table(values='Rating', index='Year', aggfunc='mean')
pivot_table_avg_rating

In [None]:
pivot_table_yearly_counts = data.pivot_table(values='Title', index='Year', aggfunc='count')
pivot_table_yearly_counts

In [None]:
pivot_table_rating_stats = data.pivot_table(values='Rating', index='Year', aggfunc=['min', 'max', 'count'])
pivot_table_rating_stats

In [None]:
pivot_table_certificate_year = data.pivot_table(values='Title', index='Certificate', columns='Year', aggfunc='count')
pivot_table_certificate_year = pivot_table_certificate_year.fillna(0)
pivot_table_certificate_year

In [None]:
pivot_table_avg_duration_genre = data.pivot_table(values='Duration_minutes', index='Genres', aggfunc='mean')
pivot_table_avg_duration_genre

In [None]:
pivot_table_max_rating_cert_genre = data.pivot_table(values='Rating', index='Certificate', columns='Genres', aggfunc='max')
pivot_table_max_rating_cert_genre = pivot_table_max_rating_cert_genre.fillna(0)
display(pivot_table_max_rating_cert_genre)

# Tabla cruzada


In [None]:
crosstab_year_rating = pd.crosstab(data['Year'], data['Rating'])
crosstab_year_rating

In [None]:
crosstab_rating_year = pd.crosstab(data['Rating'], data['Year'])
crosstab_rating_year

In [None]:
crosstab_year_rating_norm = pd.crosstab(data['Year'], data['Rating'], normalize=True)
crosstab_year_rating_norm

In [None]:
crosstab_year_rating = pd.crosstab(data['Year'], data['Rating'])
crosstab_year_rating

In [None]:
crosstab_certificate_year = pd.crosstab(data['Certificate'], data['Year'])
crosstab_certificate_year

In [None]:
crosstab_genres_certificate = pd.crosstab(data['Genres'], data['Certificate'])
crosstab_genres_certificate

In [None]:
crosstab_certificate_year_norm = pd.crosstab(data['Certificate'], data['Year'], normalize=True)
crosstab_certificate_year_norm

# Visualización de Datos

In [None]:
plt.figure(figsize=(10, 6))
sns.barplot(x='Rating', y='Count', data=rating_counts)
plt.title('Distribution of Movie Ratings')
plt.xlabel('Rating')
plt.ylabel('Number of Movies')
plt.show()

In [None]:
genre_counts = data['Genres'].value_counts().nlargest(10)

plt.figure(figsize=(12, 6))
sns.barplot(x=genre_counts.index, y=genre_counts.values)
plt.xticks(rotation=45, ha='right')
plt.title('Top 10 Movie Genre Distribution')
plt.xlabel('Genre')
plt.ylabel('Number of Movies')
plt.show()

In [None]:
certificate_counts = data['Certificate'].value_counts()

plt.figure(figsize=(10, 6))
sns.barplot(x=certificate_counts.index, y=certificate_counts.values)
plt.xticks(rotation=45)
plt.title('Distribution of Movie Certificates')
plt.xlabel('Certificate')
plt.ylabel('Number of Movies')
plt.show()

In [None]:
plt.figure(figsize=(20, 6))
sns.barplot(x='Year', y='Count', data=yearly_counts)
plt.xticks(rotation=45)
plt.title('Number of Movies Released Per Year')
plt.xlabel('Year')
plt.ylabel('Number of Movies')
plt.show()

In [None]:
plt.figure(figsize=(20, 6))
sns.barplot(x=yearly_ratings.index, y=yearly_ratings.values)
plt.xticks(rotation=45)
plt.title('Average Movie Rating Over Time')
plt.xlabel('Year')
plt.ylabel('Average Rating')
plt.show()

In [None]:
plt.figure(figsize=(10, 6))
sns.histplot(data=data, x='Rating', bins=10, kde=True)
plt.title('Distribution of Movie Ratings')
plt.xlabel('Rating')
plt.ylabel('Frequency')
plt.show()

In [None]:
plt.figure(figsize=(10, 6))
sns.histplot(data=data, x='Duration_minutes', bins=20, kde=True)
plt.title('Distribution of Movie Durations (in minutes)')
plt.xlabel('Duration (minutes)')
plt.ylabel('Frequency')
plt.show()

In [None]:
top_5_ratings = rating_counts.nlargest(5, 'Count')

plt.figure(figsize=(10, 6))
plt.pie(top_5_ratings['Count'], labels=top_5_ratings['Rating'], autopct='%1.1f%%')
plt.title('Top 5 Movie Rating Distribution')
plt.show()

In [None]:
genre_counts = data['Genres'].value_counts().nlargest(10)

plt.figure(figsize=(12, 8))
plt.pie(genre_counts, labels=genre_counts.index, autopct='%1.1f%%')
plt.title('Top 10 Movie Genre Distribution')
plt.show()

In [None]:
certificate_counts = data['Certificate'].value_counts()

plt.figure(figsize=(10, 6))
plt.pie(certificate_counts, labels=certificate_counts.index, autopct='%1.1f%%')
plt.title('Distribution of Movie Certificates')
plt.show()

In [None]:
plt.figure(figsize=(12, 6))
sns.lineplot(x=yearly_counts['Year'], y=yearly_counts['Count'])
plt.title('Number of Movies Released Per Year')
plt.xlabel('Year')
plt.ylabel('Number of Movies')
plt.grid(True)
plt.show()

In [None]:
plt.figure(figsize=(12, 6))
sns.lineplot(x=yearly_ratings.index, y=yearly_ratings.values)
plt.title('Average Movie Rating Over Time')
plt.xlabel('Year')
plt.ylabel('Average Rating')
plt.grid(True)
plt.show()

In [None]:
plt.figure(figsize=(12, 6))
sns.scatterplot(x='Year', y='Rating', data=data)
plt.title('Relationship between Year and Rating')
plt.xlabel('Year')
plt.ylabel('Rating')
plt.show()

In [None]:
plt.figure(figsize=(12, 6))
sns.scatterplot(x='Year', y='Duration_minutes', data=data)
plt.title('Relationship between Year and Movie Duration')
plt.xlabel('Year')
plt.ylabel('Duration (minutes)')
plt.show()

In [None]:
plt.figure(figsize=(10, 6))
sns.scatterplot(x='Rating', y='Duration_minutes', data=data)
plt.title('Relationship between Rating and Movie Duration')
plt.xlabel('Rating')
plt.ylabel('Duration (minutes)')
plt.show()