In [None]:
!pip install wordcloud

In [None]:
!pip install plotly

In [None]:
%matplotlib inline
from IPython.display import Image, HTML
import json
import datetime
import ast
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy import stats
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.dummy import DummyClassifier, DummyRegressor
from sklearn.model_selection import train_test_split
from wordcloud import WordCloud, STOPWORDS
import plotly
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import warnings
warnings.filterwarnings('ignore')
# Plotly credentials are read from the environment so that no API key is stored in the notebook.
# NOTE(review): the key that used to be hardcoded on this line has been exposed and should be revoked.
import os

_plotly_username = os.environ.get('PLOTLY_USERNAME')
_plotly_api_key = os.environ.get('PLOTLY_API_KEY')
if _plotly_username and _plotly_api_key:
    # Only write the credentials file when both values are configured.
    plotly.tools.set_credentials_file(username=_plotly_username, api_key=_plotly_api_key)

# sns.set() re-applies its own default style, so the grid style is passed to it directly
# instead of through a preceding sns.set_style() call that it would override.
sns.set(style='whitegrid', font_scale=1.25)
# Truncate long text cells when frames are displayed.
pd.set_option('display.max_colwidth', 50)

In [None]:
# Read the cleaned movie metadata CSV into the frame used by the cells below.
movie_data = pd.read_csv(filepath_or_buffer='dataset/movie_metadata_cleaned.csv')

In [None]:
# Preview the first five rows of the loaded frame.
movie_data.head(n=5)

In [None]:
# Summary statistics of the loaded frame (pandas' default describe() output).
movie_data.describe()

In [None]:
# Several CSV columns hold stringified Python literals (lists of dicts, or a single
# dict). Parse them and keep only the 'name' of each entry.
from ast import literal_eval


def _names_from_literals(column):
    """Turn a column of stringified lists of dicts into lists of their 'name' values.

    Missing cells are parsed as '[]'; a parsed value that is not a list becomes [].
    """
    parsed = column.fillna('[]').apply(literal_eval)
    return parsed.apply(
        lambda value: [entry['name'] for entry in value] if isinstance(value, list) else []
    )


for _list_column in ('genres', 'production_companies', 'production_countries', 'spoken_languages'):
    movie_data[_list_column] = _names_from_literals(movie_data[_list_column])

# belongs_to_collection parses to a single dict (or to [] when missing): keep its 'name', else NaN.
_collection_parsed = movie_data['belongs_to_collection'].fillna('[]').apply(literal_eval)
movie_data['belongs_to_collection'] = _collection_parsed.apply(
    lambda value: value['name'] if isinstance(value, dict) else np.nan
)

In [None]:
# Display the frame to inspect the columns that were just converted from strings to lists/names.
movie_data

In [None]:
# Return on investment = revenue / budget. A budget of 0 is treated as missing so the ratio
# becomes NaN instead of +/-inf; inf is not null and would survive a notnull() filter.
movie_data['return'] = movie_data['revenue'] / movie_data['budget'].replace(0, np.nan)

In [None]:
# WordClouds
# Cast both text columns to str so that every cell (including missing ones) can be joined.
for _text_column in ('title', 'overview'):
    movie_data[_text_column] = movie_data[_text_column].astype(str)

In [None]:
# One space-separated string per text column; these are the inputs of the word clouds.
title_corpus = ' '.join(movie_data['title'].tolist())
overview_corpus = ' '.join(movie_data['overview'].tolist())

In [None]:
# Build a word cloud from the title corpus (wordcloud's STOPWORDS excluded) and show it.
title_wordcloud = WordCloud(
    width=4000,
    height=2000,
    background_color='white',
    stopwords=STOPWORDS,
).generate(title_corpus)

plt.figure(figsize=(16, 8))
plt.imshow(title_wordcloud)
plt.axis('off')  # the image has no meaningful axes
plt.show()

In [None]:
# Build a word cloud from the overview corpus (wordcloud's STOPWORDS excluded) and show it.
overview_wordcloud = WordCloud(
    width=4000,
    height=2000,
    background_color='white',
    stopwords=STOPWORDS,
).generate(overview_corpus)

plt.figure(figsize=(16, 8))
plt.imshow(overview_wordcloud)
plt.axis('off')  # the image has no meaningful axes
plt.show()

In [None]:
# Production Countries
# One row per (movie, country): explode the per-movie lists so every country keeps the
# index label of its movie. Movies with an empty list produce NaN and are dropped.
s = movie_data['production_countries'].explode().dropna()
s.name = 'countries'

In [None]:
# Number of movies per production country. value_counts() returns the counts in
# descending order; the frame is built directly from them, which avoids joining the
# whole movie table and does not depend on how value_counts() names its index.
country_counts = s.value_counts()
con_movie_data = pd.DataFrame({
    'num_movies': country_counts.values,
    'country': country_counts.index,
})
con_movie_data.head(10)

In [None]:
# Franchise Movie
# Keep only the movies that belong to a collection.
fran = movie_data.dropna(subset=['belongs_to_collection'])

In [None]:
# Mean, sum and count of revenue for every collection, with the collection as a regular column.
fran_pivot = fran.pivot_table(
    index='belongs_to_collection',
    values='revenue',
    aggfunc={'revenue': ['mean', 'sum', 'count']},
)
fran_pivot = fran_pivot.reset_index()

In [None]:
# Most Successful Movie Franchises (by Total Gross)
# Ten collections with the largest summed revenue.
fran_pivot.sort_values(by='sum', ascending=False).head(n=10)

In [None]:
# Production Companies
# One row per (movie, company): explode the per-movie lists so every company keeps the
# index label of its movie. Movies with an empty list produce NaN and are dropped.
h = movie_data['production_companies'].explode().dropna()
h.name = 'companies'

In [None]:
# Attach the per-company rows to the movie columns by index label; a movie with several
# companies appears once per company.
com = movie_data.drop(columns='production_companies').join(h)

In [None]:
# Revenue per production company: total, average and number of revenue values,
# each sorted in descending order before being combined column-wise.
com_sum = (
    com.groupby('companies')['revenue']
    .sum()
    .sort_values(ascending=False)
    .to_frame(name='Total')
)
com_mean = (
    com.groupby('companies')['revenue']
    .mean()
    .sort_values(ascending=False)
    .to_frame(name='Average')
)
com_count = (
    com.groupby('companies')['revenue']
    .count()
    .sort_values(ascending=False)
    .to_frame(name='Number')
)

com_pivot = pd.concat((com_sum, com_mean, com_count), axis=1)

In [None]:
# Highest Earning Production Companies: the ten largest totals.
com_pivot.sort_values(by='Total', ascending=False).head(n=10)

In [None]:
# Most Voted Movies
def clean_numeric(x):
    """Convert ``x`` to a float, returning NaN when it cannot be converted.

    Only conversion failures are absorbed. Anything else (for example a
    KeyboardInterrupt raised while a long ``apply`` is running) propagates
    instead of being turned into NaN.
    """
    try:
        return float(x)
    except (TypeError, ValueError, OverflowError):
        # TypeError: unsupported type (None, list, ...); ValueError: unparseable text;
        # OverflowError: an integer too large to be represented as a float.
        return np.nan
    
# Coerce both vote columns to float; entries clean_numeric cannot convert become NaN.
for _vote_column in ('vote_count', 'vote_average'):
    movie_data[_vote_column] = movie_data[_vote_column].apply(clean_numeric).astype(float)

In [None]:
# Ten movies with the highest vote count.
most_voted_columns = ['title', 'vote_count', 'release_date']
movie_data[most_voted_columns].sort_values(by='vote_count', ascending=False).head(10)

In [None]:
# Distribution of the average vote, with missing values replaced by the median.
vote_average_filled = movie_data['vote_average'].fillna(movie_data['vote_average'].median())
sns.distplot(vote_average_filled)

In [None]:
# Ten highest-rated movies among those with more than 2000 votes.
widely_voted = movie_data['vote_count'] > 2000
(
    movie_data.loc[widely_voted, ['title', 'vote_average', 'vote_count', 'release_date']]
    .sort_values(by='vote_average', ascending=False)
    .head(10)
)

In [None]:
# Vote count plotted against vote average. The analysis notes only a very small
# correlation between the two: a large number of votes on a particular movie does
# not necessarily imply that the movie is good.
sns.jointplot(data=movie_data, x='vote_average', y='vote_count')

In [None]:
# Movie Release Dates
# Three-letter labels in calendar order, used both as lookup tables and as plot orderings.
month_order = 'Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec'.split()
day_order = 'Mon Tue Wed Thu Fri Sat Sun'.split()

In [None]:
def get_month(x):
    """Return the three-letter month label of a '/'-separated date, or NaN.

    ``x`` is converted with ``str`` and split on '/'. Two layouts are accepted:

    * year first (e.g. '1995/10/30'): recognised by a leading field greater
      than 31; the month is the second field.
    * month first (e.g. '10/30/1995'): any other leading field is taken as
      the month.

    NaN is returned when fewer than two fields are present, a field is not an
    integer, or the month is outside 1..12.

    NOTE(review): the layout of release_date in the CSV is not visible from
    this notebook. Day-first dates (DD/MM/YYYY) are NOT supported and would be
    misread; confirm the layout against the data.
    """
    try:
        fields = str(x).split('/')
        leading = int(fields[0])
        second = int(fields[1])
    except (IndexError, ValueError):
        return np.nan
    # A leading field above 31 can be neither a month nor a day, so it is the year.
    month_number = second if leading > 31 else leading
    # Check the range explicitly: an index of -1 (month 0) would silently wrap to 'Dec'.
    if not 1 <= month_number <= 12:
        return np.nan
    return month_order[month_number - 1]

In [None]:
def get_day(x):
    """Return the three-letter weekday label of a '/'-separated date string, or NaN.

    ``x`` must split on '/' into exactly three integer fields. Two layouts are
    accepted:

    * year/month/day (e.g. '1995/10/30'): recognised by a leading field
      greater than 31.
    * month/day/year (e.g. '10/30/1995'): any other leading field.

    NaN is returned for non-string input, a wrong number of fields,
    non-integer fields, or a combination that is not a valid calendar date.
    Years are used exactly as written (a two-digit year is not expanded).

    NOTE(review): the layout of release_date in the CSV is not visible from
    this notebook. Day-first dates (DD/MM/YYYY) are NOT supported and would be
    misread; confirm the layout against the data.
    """
    try:
        first, second, third = (int(part) for part in x.split('/'))
        # A leading field above 31 can be neither a month nor a day, so it is the year.
        if first > 31:
            year, month, day = first, second, third
        else:
            month, day, year = first, second, third
        # datetime.date takes its arguments as (year, month, day).
        weekday_index = datetime.date(year, month, day).weekday()
    except (AttributeError, TypeError, ValueError, OverflowError):
        # AttributeError/TypeError: x is not a str (e.g. NaN); ValueError: bad field
        # count, non-integer field or invalid date; OverflowError: absurdly large field.
        return np.nan
    return day_order[weekday_index]

In [None]:
# Derive the weekday and month labels of every release date (NaN where parsing fails).
release_dates = movie_data['release_date']
movie_data['day'] = release_dates.apply(get_day)
movie_data['month'] = release_dates.apply(get_month)

In [None]:
# Count of movies per release month, drawn in calendar order.
fig, ax = plt.subplots(figsize=(12, 6))
ax.set_title("Number of Movies released in a particular month.")
sns.countplot(x='month', data=movie_data, order=month_order, ax=ax)

In [None]:
# Mean revenue per release month, restricted to movies with revenue above 1e8.
blockbusters = movie_data[movie_data['revenue'] > 1e8]
month_mean = blockbusters.groupby('month')['revenue'].mean().to_frame()
month_mean['mon'] = month_mean.index  # expose the month label as a column for seaborn
fig, ax = plt.subplots(figsize=(12, 6))
ax.set_title("Average Gross by the Month for Blockbuster Movies")
sns.barplot(x='mon', y='revenue', data=month_mean, order=month_order, ax=ax)

In [None]:
# Boxplot
# Distribution of the return ratio per release month; the y-axis is clipped to 0..12.
fig, ax = plt.subplots(figsize=(15, 8))
rows_with_return = movie_data.dropna(subset=['return'])
sns.boxplot(data=rows_with_return, x='month', y='return', order=month_order, palette="muted", ax=ax)
ax.set_ylim(0, 12)

In [None]:
# Days
# Count of movies per release weekday, drawn Monday to Sunday.
fig, ax = plt.subplots(figsize=(10, 5))
ax.set_title("Number of Movies released on a particular day.")
sns.countplot(x='day', data=movie_data, order=day_order, ax=ax)