# The Movie Database

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import utils.TmdbHelper as Tmdb

In [2]:
lang = pd.read_html('https://en.wikipedia.org/wiki/List_of_ISO_639-2_codes')
lang_codes = lang[1]
lang_codes.drop(['639-2[1]', '639-3[2]', '639-5[3]', 'Scope', 'Type', 'Native name(s)',
                     'Other name(s)'], axis=1, inplace=True)
lang_codes.dropna(inplace=True)

In [3]:
movies_df = Tmdb.get_movies(start_page=1,end_page=100)

In [4]:
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1980 entries, 0 to 1979
Data columns (total 19 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   adult                 1980 non-null   bool   
 1   backdrop_path         1968 non-null   object 
 2   genre_ids             1980 non-null   object 
 3   id                    1980 non-null   int64  
 4   original_language     1980 non-null   object 
 5   original_title        1980 non-null   object 
 6   overview              1980 non-null   object 
 7   popularity            1980 non-null   float64
 8   poster_path           1980 non-null   object 
 9   release_date          1980 non-null   object 
 10  title                 1980 non-null   object 
 11  video                 1980 non-null   bool   
 12  vote_average          1980 non-null   float64
 13  vote_count            1980 non-null   int64  
 14  budget                1980 non-null   int64  
 15  revenue              

In [5]:
movies_df.head()

Unnamed: 0,adult,backdrop_path,genre_ids,id,original_language,original_title,overview,popularity,poster_path,release_date,title,video,vote_average,vote_count,budget,revenue,production_companies,production_countries,runtime
0,False,/sR0SpCrXamlIkYMdfz83sFn5JS6.jpg,"[28, 878, 12]",823464,en,Godzilla x Kong: The New Empire,"Following their explosive showdown, Godzilla a...",3404.57,/4Hbf0Gw5PD2GC7PmiwjEwf0ROCU.jpg,2024-03-27,Godzilla x Kong: The New Empire,False,6.8,422,135000000,214305986,"[Legendary Pictures, Warner Bros. Pictures]",[United States of America],115
1,False,/1XDDXPXGiI8id7MrUxK36ke7gkX.jpg,"[28, 12, 16, 35, 10751]",1011985,en,Kung Fu Panda 4,Po is gearing up to become the spiritual leade...,2101.694,/kDp1vUBnMpe8ak4rjgl3cLELqjU.jpg,2024-03-02,Kung Fu Panda 4,False,6.773,598,85000000,347255055,[DreamWorks Animation],[United States of America],94
2,False,/oe7mWkvYhK4PLRNAVSvonzyUXNy.jpg,"[28, 53]",359410,en,Road House,Ex-UFC fighter Dalton takes a job as a bouncer...,1808.763,/bXi6IQiQDHD00JFio5ZSZOeRSBh.jpg,2024-03-08,Road House,False,7.138,1196,85000000,0,"[Metro-Goldwyn-Mayer, Silver Pictures]",[United States of America],121
3,False,/9c0lHTXRqDBxeOToVzRu0GArSne.jpg,"[878, 28]",935271,en,After the Pandemic,Set in a post-apocalyptic world where a global...,1244.584,/p1LbrdJ53dGfEhRopG71akfzOVu.jpg,2022-03-01,After the Pandemic,False,5.25,14,0,0,[Andromeda Motion Pictures],[United States of America],84
4,False,/pwGmXVKUgKN13psUjlhC9zBcq1o.jpg,"[28, 14]",634492,en,Madame Web,"Forced to confront revelations about her past,...",1146.356,/rULWuutDcN5NvtiZi4FRPzRYWSh.jpg,2024-02-14,Madame Web,False,5.659,971,80000000,99266032,"[Columbia Pictures, di Bonaventura Pictures]",[United States of America],116


In [6]:
movies_df.describe()

Unnamed: 0,id,popularity,vote_average,vote_count,budget,revenue,runtime
count,1980.0,1980.0,1980.0,1980.0,1980.0,1980.0,1980.0
mean,344558.5,83.521373,6.727189,4689.687374,50257150.0,191380600.0,108.39899
std,364309.0,123.654724,1.187043,5340.699887,64663900.0,289769600.0,27.953137
min,11.0,24.771,0.0,0.0,0.0,0.0,0.0
25%,10398.0,46.803,6.248,615.75,0.0,169742.2,93.0
50%,254224.0,59.5895,6.86,3033.5,22000000.0,81583170.0,107.0
75%,609798.2,83.9735,7.427,6610.5,79000000.0,268800000.0,123.0
max,1262596.0,3404.57,10.0,35500.0,460000000.0,2923706000.0,242.0


## Preprocessing for EDA

In [7]:
from sklearn.base import BaseEstimator, TransformerMixin

In [8]:
class BaseTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    @staticmethod
    def transform(X):
        X['original_language'] = X['original_language']\
            .transform(lambda x: Tmdb.lang_transform(x, lang_codes))
        X['genre'] = X['genre_ids'].transform(lambda x: Tmdb.genre_transform(x))
        X['revenue(mil)'] = X['revenue'].apply(lambda x: x // 1000000)
        X['budget(mil)'] = X['budget'].apply(lambda x: x // 1000000)
        X['release_date'] = pd.to_datetime(X['release_date'])
        X['month'] = X['release_date'].dt.strftime('%b')
        month_label = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
        X['month'] = pd.Categorical(X['month'], categories=month_label, ordered=True)
        return X

In [9]:
class TypeTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    @staticmethod
    def transform(X):
        X['revenue(mil)'] = X['revenue(mil)'].astype('float')
        X['budget(mil)'] = X['budget(mil)'].astype('float')
        X['runtime(min)'] = X['runtime'].astype('float')
        X['adult'] = X['adult'].astype('category')
        X['original_language'] = X['original_language'].astype('category')
        
        
        return X

In [10]:
class FeatureDropper(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    @staticmethod
    def transform(X): 
        X = X.loc[(X['revenue(mil)'] > 0) & (X['budget(mil)'] > 0)]
        return X.drop(['backdrop_path','overview','poster_path','video','genre_ids', 'budget','revenue','runtime'], axis=1, errors="ignore")

In [11]:
from sklearn.pipeline import Pipeline

pipline = Pipeline([('transform', BaseTransformer()),
                    ('type', TypeTransformer()),
                    ('dropper', FeatureDropper())])

In [12]:
movies_df = pipline.fit_transform(movies_df)

TypeError: BaseTransformer.transform() takes 1 positional argument but 2 were given

## Exploratory Data Analysis

In [None]:
movies_df.info()

In [None]:
movies_df.describe()

In [None]:
group_year_sum = movies_df.groupby(movies_df.release_date.dt.year)[['revenue(mil)', 'budget(mil)']].sum().reset_index()
group_year_mean = movies_df.groupby(movies_df.release_date.dt.year)[['revenue(mil)', 'budget(mil)']].mean().reset_index()

In [None]:
group_year_mean

In [None]:
fig, (ax1, ax2) = plt.subplots(2, figsize=(10, 10), sharex=True)

plt.style.use('ggplot')
fig.supxlabel('Year', fontsize=14)
ax1.set_title('Total Revenue and Budget per year', fontsize=16)

group_year_sum.loc[group_year_sum['release_date'] >= 2010].plot(x='release_date', kind='bar', ax=ax1)
ax2.set_title('Average Revenue and Budget per year', fontsize=16)

group_year_mean.loc[group_year_mean['release_date'] >= 2010].plot(x='release_date', kind='bar', ax=ax2)
plt.show()

### Total Revenue and Budget per Year

- This plot displays the total revenue and budget for movies released each year, starting from the year 2010.
- The x-axis represents the years, while the y-axis represents the total revenue and budget in millions of dollars.
- Each bar represents a year, with two bars side by side for each year, one representing the total revenue and the other representing the total budget.
- This plot provides an overview of the overall financial performance of movies released each year, allowing us to identify trends and fluctuations in revenue and budget over time.

### Average Revenue and Budget per Year

- This plot illustrates the average revenue and budget for movies released each year, starting from the year 2010.
- Similar to the previous plot, the x-axis represents the years, while the y-axis represents the average revenue and budget in millions of dollars.
- Each bar represents a year, with two bars side by side for each year, one representing the average revenue and the other representing the average budget.
- Unlike the previous plot, which shows the total financial figures, this plot focuses on the average financial performance per movie released each year.
- By examining the average revenue and budget trends over time, we can gain insights into the profitability and investment patterns in the movie industry.


In [None]:
group_month_sum = movies_df.groupby('month', observed=False)[['revenue(mil)', 'budget(mil)']].sum().reset_index()
group_month_mean = movies_df.groupby('month', observed=False)[['revenue(mil)', 'budget(mil)']].mean().reset_index()

In [None]:
group_month_mean

In [None]:
fig, (ax1, ax2) = plt.subplots(2, figsize=(12, 10), sharex=True)

plt.style.use('ggplot')
fig.supxlabel('Year', fontsize=14)
ax1.set_title('Total Revenue and Budget per Month (mil)', fontsize=16)

group_month_sum.plot(x='month', kind='bar', ax=ax1)
ax2.set_title('Average Revenue and Budget per Month (mil)', fontsize=16)

group_month_mean.plot(x='month', kind='bar', ax=ax2)
plt.tight_layout()
plt.show()

### Total Revenue and Budget per Month

- This plot displays the total revenue and budget for movies released each month.
- The x-axis represents the months of the year, while the y-axis represents the total revenue and budget in millions of dollars.
- Each bar represents a month, with two bars side by side for each month, one representing the total revenue and the other representing the total budget.
- This plot provides an overview of the financial performance of movies released each month, allowing us to identify patterns and trends in revenue and budget allocation throughout the year.

### Average Revenue and Budget per Month

- This plot illustrates the average revenue and budget for movies released each month.
- Similar to the previous plot, the x-axis represents the months of the year, while the y-axis represents the average revenue and budget in millions of dollars.
- Each bar represents a month, with two bars side by side for each month, one representing the average revenue and the other representing the average budget.
- Unlike the previous plot, which shows the total financial figures, this plot focuses on the average financial performance per movie released each month.
- By examining the average revenue and budget trends over each month, we can gain insights into the seasonality and periodic patterns in movie revenues and budget allocations.

### Analysis of Seasonal Trends in Movie Releases

- The provided plot reveals interesting insights into the seasonal patterns of movie releases.
- There are noticeable spikes in movie releases around May, June, and July, which can be attributed to the abundance of holidays during the summer months.
- Another significant spike occurs around November and December, coinciding with the Christmas and year-end holidays.
- This observation suggests that production companies strategically time the release of movies to coincide with seasonal changes and holiday periods, aiming to capitalize on increased leisure time and consumer spending during these periods.
- By aligning movie releases with holidays and seasonal changes, production companies can maximize audience engagement and box office revenue.
- The plot below effectively illustrates the average release of movies per month across multiple years, providing valuable insights into the temporal distribution of movie releases and underlying industry strategies.


In [None]:
movies_df['month'] = movies_df['release_date'].dt.month
movies_df['year'] = movies_df['release_date'].dt.year


monthly_release_avg = movies_df.groupby(['year','month']).size().groupby(['month']).mean()

months = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']

plt.figure(figsize=(10, 6))
plt.plot(months, monthly_release_avg, marker='o', color='skyblue', linestyle='-')
plt.title('Average Movie Releases per Month Across Years')
plt.xlabel('Month')
plt.ylabel('Average Number of Releases')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

In [None]:
sns.heatmap(movies_df[['revenue(mil)', 'budget(mil)','vote_average','popularity', 'vote_count']].corr(), annot=True)

### Correlation Heatmap

- This heatmap illustrates the correlation matrix between various numerical features in the dataset.
- The heatmap color-codes the correlation coefficients, with warmer colors indicating stronger positive correlations and cooler colors indicating stronger negative correlations.
- The annotations on the heatmap represent the correlation coefficients, providing quantitative insights into the strength and direction of relationships between pairs of features.
- By examining the correlations between features such as revenue, budget, vote average, popularity, and vote count, we can identify potential relationships and dependencies within the dataset.
- Understanding these correlations can help inform further analysis and modeling decisions, such as feature selection and model interpretation.
- It shows a strong relationship between our target variable (revenue) with vote count and budget, although budget is expected vote count is expected to show a relationship, 
- it also reveals that the popularity of a movie is linked to its revenue or budget

In [None]:
popularity_over_time = movies_df.groupby('year')['popularity'].mean()

plt.figure(figsize=(12, 6))
plt.plot(popularity_over_time.index, popularity_over_time.values, color='skyblue')
plt.title('Popularity of Movies Over Time')
plt.xlabel('Release Date')
plt.ylabel('Popularity')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()


### Analysis of Popularity Trend Over Time

- The plotted data illustrates the average popularity of movies over time, aggregated on a yearly basis.
- Upon examination, there is a noticeable spike in movie popularity around the year 2024.
- This spike suggests a significant increase in audience interest and engagement with movies released during that period.
- Possible factors contributing to this spike could include the release of highly anticipated movies, breakthrough performances, or innovative marketing strategies.
- The observed spike in movie popularity around 2024 may reflect shifting audience preferences, industry trends, or cultural influences during that time.
- Further analysis, including examining specific movie releases and external events occurring around 2024, could provide additional insights into the drivers behind this popularity surge.
- Overall, this visualization offers valuable insights into the temporal dynamics of movie popularity, highlighting notable trends and fluctuations over time.


In [None]:
plt.figure(figsize=(10, 6))
sns.histplot(movies_df['runtime(min)'], bins=15, color='skyblue', edgecolor='black', kde=True)
plt.title('Distribution of Movie Runtimes')
plt.xlabel('Runtime (minutes)')
plt.ylabel('Frequency')
plt.grid(axis='y', alpha=0.5)
plt.tight_layout()
plt.show()

### Analysis of Movie Runtime Distribution

- The histogram visualizes the distribution of movie runtimes, measured in minutes.
- Upon examination, the distribution appears to be unimodal, with a peak in frequency observed between 100 and 120 minutes.
- This indicates that a significant proportion of movies in the dataset have runtime durations falling within this range.
- Conversely, there is a notable decrease in frequency observed for movies with runtime durations exceeding 160 minutes, suggesting fewer movies have longer durations beyond this threshold.
- The histogram's shape and peaks provide insights into the typical runtime preferences within the dataset, with most movies falling within a certain duration range.
- Understanding the distribution of movie runtimes can inform various aspects of movie production and consumption, such as scheduling, audience preferences, and content pacing.
- Further analysis, such as exploring runtime trends across genres or release years, could provide additional insights into factors influencing movie runtime distribution.


In [None]:
genres = [genre for sublist in movies_df['genre'] for genre in sublist]

genre_counts = pd.Series(genres).value_counts()

plt.figure(figsize=(10, 6))
sns.barplot(x=genre_counts.values, y=genre_counts.index, palette='viridis', hue=genre_counts.index, legend=False)
plt.title('Distribution of Movie Genres')
plt.xlabel('Frequency')
plt.ylabel('Genre')
plt.tight_layout()
plt.show()


### Analysis of Movie Genre Distribution

- The bar plot illustrates the distribution of movie genres based on their frequency of occurrence in the dataset.
- Upon inspection, the most prevalent genre is 'Action', with the highest frequency of occurrence.
- Following 'Action', the next most common genres are 'Adventure' and 'Drama', which also exhibit relatively high frequencies.
- 'Comedy' is another prevalent genre, ranking among the top genres in terms of frequency.
- In contrast, 'Western' emerges as the least common genre, with the lowest frequency of occurrence in the dataset.
- 'Music' and 'History' genres also have relatively low frequencies compared to other genres.
- Understanding the distribution of movie genres can provide insights into audience preferences, industry trends, and market demand for different types of content.
- These insights can inform various aspects of movie production, marketing, and distribution strategies tailored to specific genre preferences and audience segments.
- with access to a more detailed database that shows the countries of release, one can understand the distribution of genres across multiple countries

In [None]:
genres = [genre for sublist in movies_df['genre'] for genre in sublist]

data = {'Genre': genres, 'Revenue (mil)': movies_df['revenue(mil)'].repeat(movies_df['genre'].str.len())}
genre_revenue_df = pd.DataFrame(data).sort_values(by='Revenue (mil)', ascending=False)

plt.figure(figsize=(12, 6))
sns.set_style('darkgrid')
sns.barplot(data=genre_revenue_df, x='Genre', y='Revenue (mil)', palette='viridis', hue='Genre', legend=False)
plt.title('Genre vs. Revenue')
plt.xlabel('Genre')
plt.ylabel('Revenue (millions)')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

### Analysis of Genre vs. Revenue

- The bar plot compares different movie genres based on their corresponding revenue generated, represented in millions of dollars.
- Upon inspection, 'Adventure' emerges as the genre with the highest revenue, indicating that movies categorized under this genre tend to generate significant financial returns.
- Following 'Adventure', 'Sci-Fi' and 'Family' genres also exhibit relatively high revenue figures, suggesting their popularity and profitability in the movie industry.
- 'Animation' and 'Fantasy' genres closely follow, further underscoring the commercial success of movies within these genres.
- Interestingly, although 'Action' was identified as one of the most prevalent genres, it does not rank highest in terms of revenue, highlighting a potential discrepancy between popularity and financial performance.
- This analysis sheds light on the varying revenue potentials associated with different movie genres, providing insights into audience preferences, market demand, and commercial viability within the entertainment industry.
- Understanding the relationship between genre and revenue can inform strategic decisions related to content production, marketing strategies, and investment opportunities in the movie sector.


In [None]:
production_companies = [genre for sublist in movies_df['production_companies'] for genre in sublist]

production_companies_counts = pd.Series(production_companies).value_counts()
top_10_pC = production_companies_counts.sort_values(ascending=False).head(10)

plt.figure(figsize=(10, 6))
sns.barplot(x=top_10_pC.values, y=top_10_pC.index, palette='viridis', hue=top_10_pC.index, legend=False)
plt.title('Top 10 Production Companies by movies produced')
plt.xlabel('NO of Movies Produced')
plt.ylabel('Production Companies')
plt.tight_layout()
plt.show()

### Analysis of Top 10 Production Companies by Movies Produced

- The bar plot displays the top 10 production companies ranked by the number of movies they have produced.
- Upon examination, 'Warner Bros' emerges as the leading production company, with the highest number of movies produced among the top 10.
- Following 'Warner Bros', other prominent production companies include 'Columbia Pictures', 'Universal Pictures', 'Paramount Pictures', and '20th Century Fox', which also have significant numbers of movies in their portfolios.
- Notably, 'Walt Disney Pictures' also ranks among the top production companies, highlighting its substantial contribution to the movie industry.
- The ranking of production companies based on the number of movies produced provides insights into their production output and market influence within the industry.
- Understanding the dominance of certain production companies can inform strategic partnerships, distribution agreements, and investment decisions within the movie industry landscape.


In [None]:
production_companies = [company for sublist in movies_df['production_companies'] for company in sublist]

data = {'Production Company': production_companies, 'Revenue (mil)': movies_df['revenue(mil)'].repeat(movies_df['production_companies'].str.len()), 'Budget (mil)': movies_df['budget(mil)'].repeat(movies_df['production_companies'].str.len()), 'title': movies_df['title'].repeat(movies_df['production_companies'].str.len())}
production_df = pd.DataFrame(data)

production_agg = production_df.groupby('Production Company').agg({'Revenue (mil)': 'sum', 'Budget (mil)': 'sum', 'title':'count'}).reset_index().rename(columns={'title':'total movies'})
top_10_pC_by_movies = production_agg.sort_values(by='total movies', ascending=False).head(10)

plt.figure(figsize=(12, 6))


plt.bar(top_10_pC_by_movies['Production Company'], top_10_pC_by_movies['Revenue (mil)'], color='skyblue', label='Revenue')
plt.bar(top_10_pC_by_movies['Production Company'], top_10_pC_by_movies['Budget (mil)'], color='orange', label='Budget')

plt.title('Revenue and Budget of the top Production Company by movies released')
plt.xlabel('Production Company')
plt.ylabel('Amount (millions)')
plt.xticks(rotation=45, ha='right')
plt.legend()
plt.tight_layout()
plt.show()


### Analysis of Revenue and Budget of Top Production Companies by Movies Released

- The grouped bar plot compares the revenue and budget of the top production companies based on the number of movies they have released.
- The top production companies are determined by the total count of movies they have produced, highlighting their prominence in the industry.
- Each production company is represented by two bars: one indicating the total revenue generated by their movies, and the other representing the total budget allocated for their movie productions.
- Upon inspection, it is evident that certain top production companies, such as 'Warner Bros', 'Columbia Pictures', and 'Universal Pictures', exhibit substantial differences between their total revenue and budget figures.
- 'Warner Bros' stands out as the top production company in terms of both revenue and budget, indicating its significant financial success and investment capacity in the movie industry.
- This analysis offers insights into the financial performance and investment strategies of top production companies, providing valuable information for stakeholders in the movie production and distribution ecosystem.
- Understanding the revenue and budget dynamics of production companies can inform strategic decisions related to partnerships, investment opportunities, and resource allocation within the entertainment industry.


## Conclusion and Insights:

Through the exploratory data analysis (EDA) conducted on the movie dataset, several key insights have been unearthed, shedding light on various aspects of the movie industry. Here's a summary of the findings:

- **Seasonal Trends:** Analysis of movie release dates revealed distinct spikes in certain months, correlating with holidays and seasonal changes. This suggests that production companies strategically time their movie releases to capitalize on heightened audience engagement during these periods.

- **Popularity Over Time:** A notable spike in movie popularity around 2024 was observed, indicating a significant increase in audience interest during that period. Further investigation into external factors influencing this spike could provide valuable insights.

- **Movie Runtimes:** The distribution of movie runtimes highlighted a peak between 100 and 120 minutes, reflecting audience preferences for movies within this duration range.

- **Genre Preferences:** Analysis of movie genres revealed 'Action' as the most prevalent genre, followed by 'Adventure' and 'Drama'. However, the revenue analysis showed that 'Adventure' movies tend to generate the highest revenue, indicating a potential discrepancy between popularity and financial performance.

- **Production Companies:** Top production companies such as 'Warner Bros', 'Columbia Pictures', and 'Universal Pictures' were identified based on the number of movies produced. Further analysis revealed their revenue and budget dynamics, offering insights into their financial performance and investment strategies.


## Future Direction:

Building upon these insights, the next step would involve leveraging machine learning techniques to predict movie revenue. By analyzing various features such as genre, production budget, release date, and more, predictive models can be developed to forecast revenue potential for upcoming movies.

This predictive capability would not only assist production companies in making informed decisions regarding resource allocation and investment but also provide valuable insights into audience preferences and market trends.

In conclusion, the combination of exploratory data analysis and machine learning holds immense potential for empowering stakeholders in the movie industry to navigate the complex landscape of movie production and distribution effectively.

In [None]:
movies_df.to_csv('../data/movies.csv', index=False)