#### MOVIES DATA EDA AND CORRELATION ANALYSIS

In [1]:
#Importing packages
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.style.use('ggplot')

In [2]:
movies = pd.read_csv("movies.csv")

In [3]:
#Getting to know the data
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7668 entries, 0 to 7667
Data columns (total 15 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   name      7668 non-null   object 
 1   rating    7591 non-null   object 
 2   genre     7668 non-null   object 
 3   year      7668 non-null   int64  
 4   released  7666 non-null   object 
 5   score     7665 non-null   float64
 6   votes     7665 non-null   float64
 7   director  7668 non-null   object 
 8   writer    7665 non-null   object 
 9   star      7667 non-null   object 
 10  country   7665 non-null   object 
 11  budget    5497 non-null   float64
 12  gross     7479 non-null   float64
 13  company   7651 non-null   object 
 14  runtime   7664 non-null   float64
dtypes: float64(5), int64(1), object(9)
memory usage: 898.7+ KB


In [4]:
movies.head(20)

Unnamed: 0,name,rating,genre,year,released,score,votes,director,writer,star,country,budget,gross,company,runtime
0,The Shining,R,Drama,1980,"June 13, 1980 (United States)",8.4,927000.0,Stanley Kubrick,Stephen King,Jack Nicholson,United Kingdom,19000000.0,46998772.0,Warner Bros.,146.0
1,The Blue Lagoon,R,Adventure,1980,"July 2, 1980 (United States)",5.8,65000.0,Randal Kleiser,Henry De Vere Stacpoole,Brooke Shields,United States,4500000.0,58853106.0,Columbia Pictures,104.0
2,Star Wars: Episode V - The Empire Strikes Back,PG,Action,1980,"June 20, 1980 (United States)",8.7,1200000.0,Irvin Kershner,Leigh Brackett,Mark Hamill,United States,18000000.0,538375067.0,Lucasfilm,124.0
3,Airplane!,PG,Comedy,1980,"July 2, 1980 (United States)",7.7,221000.0,Jim Abrahams,Jim Abrahams,Robert Hays,United States,3500000.0,83453539.0,Paramount Pictures,88.0
4,Caddyshack,R,Comedy,1980,"July 25, 1980 (United States)",7.3,108000.0,Harold Ramis,Brian Doyle-Murray,Chevy Chase,United States,6000000.0,39846344.0,Orion Pictures,98.0
5,Friday the 13th,R,Horror,1980,"May 9, 1980 (United States)",6.4,123000.0,Sean S. Cunningham,Victor Miller,Betsy Palmer,United States,550000.0,39754601.0,Paramount Pictures,95.0
6,The Blues Brothers,R,Action,1980,"June 20, 1980 (United States)",7.9,188000.0,John Landis,Dan Aykroyd,John Belushi,United States,27000000.0,115229890.0,Universal Pictures,133.0
7,Raging Bull,R,Biography,1980,"December 19, 1980 (United States)",8.2,330000.0,Martin Scorsese,Jake LaMotta,Robert De Niro,United States,18000000.0,23402427.0,Chartoff-Winkler Productions,129.0
8,Superman II,PG,Action,1980,"June 19, 1981 (United States)",6.8,101000.0,Richard Lester,Jerry Siegel,Gene Hackman,United States,54000000.0,108185706.0,Dovemead Films,127.0
9,The Long Riders,R,Biography,1980,"May 16, 1980 (United States)",7.0,10000.0,Walter Hill,Bill Bryden,David Carradine,United States,10000000.0,15795189.0,United Artists,100.0


In [5]:
#Finding percentage of missing values
for col in movies:
    pct_missing = np.mean(movies[col].isnull())
    print('{} - {}%'.format(col, pct_missing*100))

name - 0.0%
rating - 1.004173187271779%
genre - 0.0%
year - 0.0%
released - 0.02608242044861763%
score - 0.03912363067292645%
votes - 0.03912363067292645%
director - 0.0%
writer - 0.03912363067292645%
star - 0.013041210224308816%
country - 0.03912363067292645%
budget - 28.31246739697444%
gross - 2.464788732394366%
company - 0.2217005738132499%
runtime - 0.05216484089723526%


In [6]:
movies.isna().sum()

name           0
rating        77
genre          0
year           0
released       2
score          3
votes          3
director       0
writer         3
star           1
country        3
budget      2171
gross        189
company       17
runtime        4
dtype: int64

In [7]:
#Treating missing values in numeric columns
continuous_cols = ['gross', 'budget', 'votes', 'score', 'runtime']

for col in continuous_cols:
    mean = movies[col].mean()
    movies[col].fillna(mean, inplace = True)

In [8]:
#Treating missing values in categorical columns
movies['rating'].fillna(np.random.choice(['R', 'PG', 'G', 'Not Rated', 'NC-17', 'Approved', 'TV-PG',
       'PG-13', 'Unrated', 'X', 'TV-MA', 'TV-14']), inplace= True)

In [9]:
movies.rating.unique()

array(['R', 'PG', 'G', 'TV-MA', 'Not Rated', 'NC-17', 'Approved', 'TV-PG',
       'PG-13', 'Unrated', 'X', 'TV-14'], dtype=object)

In [10]:
#Correcting the year of release
movies['correct_year_released'] = movies.released.str.extract(r'(\d{4})')

In [11]:
movies.drop(columns= 'year', inplace = True)

In [12]:
#Checking for duplicates
movies.duplicated().sum()

0

In [13]:
#Changing data types
movies["votes"] = movies["votes"].astype('int64')
movies["budget"] = movies["budget"].astype('int64')
movies["gross"] = movies["gross"].astype('int64')
movies["runtime"] = movies["runtime"].astype('int64')

In [14]:
pd.set_option('display.max_rows', None)

#### Exploratory Data Analysis

In [15]:
ratings = movies.rating.value_counts()*100/movies.shape[0]
formatted_ratings = ratings.apply(lambda x: f'{x:0.2f}%')
formatted_ratings

rating
R            48.21%
PG-13        27.54%
PG           16.33%
Not Rated     3.69%
G             2.00%
TV-MA         1.12%
Unrated       0.68%
NC-17         0.30%
TV-PG         0.07%
X             0.04%
Approved      0.01%
TV-14         0.01%
Name: count, dtype: object

In [16]:
px.bar(data_frame= ratings, x= ratings.index, y= ratings*100/sum(ratings), labels = {'index':'Rating', 'y':'Frequency %'},
           title= 'Frequency Distribution of Movie Ratings', text = ratings.apply(lambda x: f"{x / sum(ratings) * 100: 0.2f}%"),
            color=ratings.index, width = 900, height = 500, template= 'plotly')

NameError: name 'px' is not defined

In [None]:
genre = movies.genre.value_counts()*100/movies.shape[0]
genre_pct= genre.apply(lambda x: f'{x: 0.2f}%')
genre_pct

In [None]:
px.bar(data_frame= genre, x= genre.index, y= genre*100/sum(genre), labels = {'index':'Movie Genre', 'y':'Frequency %'},
           title= 'Movie Genre Popularity', text = genre.apply(lambda x: f"{x / sum(genre) * 100: 0.2f}%"),
             width = 900, height = 500, template= 'plotly')

In [None]:
year = movies.correct_year_released.value_counts().sort_index()
year.head(10)

In [None]:
a = px.area(data_frame= year, x= year.index, y=year, labels = {'index':'Year', 'y':'Number of Movies Released'},
           title= 'Movie Counts by Year',
             width = 800, height = 500, template= 'plotly' )
a.update_layout(xaxis=dict(tickvals=[1980, 1985, 1990, 1995, 2000, 2005, 2010, 2015, 2020],
                        ticktext = ['1980', '1985', '1990', '1995', '2000', '2005', '2010', '2015', '2020']))

In [None]:
px.histogram(data_frame= movies.score, width = 700, height = 500, title= 'Frequency Distribution of Scores', template= 'plotly_dark',
            labels= {'value':'Scores'})

In [None]:
top_directors = movies.director.value_counts().head(10)
top_directors

In [None]:
px.bar(data_frame= top_directors, x= top_directors, y=top_directors.index, labels = {'y':'Movie Directors', 'x':'Number of Movies'},
           title= 'Top 10 Directors by Number of Movies',
             width = 800, height = 500, template= 'plotly', text = top_directors, orientation= 'h' )

In [None]:
px.histogram(data_frame= movies.runtime, width = 800, height = 500, template= 'plotly_dark', 
              labels = {'count':'Frequency', 'value':'Movie Runtime'},
           title= 'Frequency Distribution of Movie Runtime', )

##### Top 10 movies based on scores

In [None]:
top_ten_movies = movies.loc[:, ['name', 'score']].sort_values(by = 'score', ascending = False).head(10)
top_ten_movies

##### Top 10 Companies by Average Gross Revenue

In [None]:
top_ten_companies = movies.groupby('company')['gross'].mean().sort_values( ascending = False).head(10).reset_index()
top_ten_companies

In [None]:
px.bar(data_frame= top_ten_companies, x= top_ten_companies.gross, y=top_ten_companies.company, 
       labels = {'y':'Film Production', 'x':'Average Gross Earnings'},
           title= 'Top 10 Film Productions by Average Gross Earnings',
             width = 900, height = 500, template= 'plotly', orientation= 'h', color = 'company', text = top_ten_companies.gross )

In [None]:
b = movies.groupby('correct_year_released', as_index= False)['genre'].value_counts()
filter = b.groupby('correct_year_released')['count'].nlargest(3).droplevel(0).index
b = b.iloc[filter]
b = b.loc[b.correct_year_released >='2017', :]
b

In [None]:
px.bar(data_frame= b, x= 'correct_year_released', y = 'count', color = 'genre', width = 500, height = 600, template ='ggplot2',
                      labels= {'correct_year_released':'Year'}, title= 'Most Popular Genres in Each Year From 2017')

#### Correlation Analysis: What factors influence Gross Earnings of a Film?

In [None]:
#Seeing correlation between numeric columns
correlation = movies.select_dtypes(['int64', 'float64']).corr()
correlation

In [None]:
import plotly.express as px

fig = px.imshow(correlation, text_auto=True)
fig.update_layout(title='Correlation Matrix', width = 800, height =600)

From the correlation matrix, we can tell that gross earnings has a strong positive correlation with budget(0.74) and votes(0.63)

In [None]:
#Seeing the correlation of categorical columns with gross earnings
movies1 = movies
for col in movies1:
    if movies1[col].dtype == 'object':
        movies1[col] = movies[col].astype('category')
        movies1[col] = movies[col].cat.codes
movies1

In [None]:
correlation_matrix2 = movies1.corr()
correlation_matrix2

In [None]:
px.imshow(correlation_matrix2, width = 1100, height = 1100, text_auto= True, title = 'Correlation Matrix for Movies',
         color_continuous_scale = 'viridis')

In [None]:
#Understanding the relationship between gross earnings and budget
px.scatter(data_frame=movies, x= movies.gross, y= movies.budget, trendline = 'ols', color = movies.gross, 
           template = 'seaborn', width = 800, height = 600, title = 'Budget vs Gross Earnings', labels= {'gross':'gross earnings'})

The more the budget increases, the more is the gross earnings expected from a film

In [None]:
#Understanding the relationship between votes and gross earnings
px.scatter(data_frame=movies, x= movies.gross, y= movies.votes, trendline = 'ols', color = movies.gross, 
           template = 'seaborn', width = 800, height = 600, title = 'Votes vs Gross Earnings', labels= {'gross':'gross earnings'})

As expected, if a movies garners more votes(i.e, the number of people who have watched the movie), its gross earnings increase.