In [None]:
# Importing all required libraries
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Setting the working directory.
# The folder holding the IMDB files can be overridden with the IMDB_DATA_DIR
# environment variable, so the notebook also runs on machines where the
# original author's desktop path does not exist; that path stays the default.
DATA_DIR = os.environ.get('IMDB_DATA_DIR', 'C:/Users/Adi Malhotra/Desktop/IMDB')
os.chdir(DATA_DIR)

In [None]:
# Load the IMDB CSV; the bare file name is resolved against the current
# working directory.
csv_name = "imdb_1000.csv"
data = pd.read_csv(csv_name)

In [None]:
# Preview the first five rows of the dataset
data.head(n=5)

In [None]:
# Summary of the data structure: prints column names, non-null counts,
# dtypes and memory usage of the DataFrame
data.info()

In [None]:
# Check the number of rows and columns; .shape is a (rows, columns) tuple
data.shape

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the
# numeric columns; the percentiles listed are pandas' defaults, spelled out.
data.describe(percentiles=[0.25, 0.5, 0.75])


In [None]:
# Check the data type of each variable (one dtype entry per column)
data.dtypes

In [None]:
# Mean of the 'duration' column across all movies
data.loc[:, 'duration'].mean()


In [None]:
# Order the movies by duration, ascending: the shortest movie is the first
# row of the result and the longest is the last
data.sort_values(by='duration', ascending=True)

In [None]:
# Rank the movies from highest to lowest star rating and keep the first five.
# Remove the slice to work with the complete ranking instead.
data_sort = data.sort_values('star_rating', ascending=False).iloc[:5]
data_sort

In [None]:
# Histogram of movie duration using 30 bins
duration_ax = data['duration'].plot.hist(bins=30)
# Label the x axis on the axes object the histogram was drawn on
duration_ax.set_xlabel('Movie Duration')

In [None]:
# Probability density estimate (KDE curve) drawn on top of a density-scaled
# histogram of movie duration.
# sns.distplot is deprecated; histplot with kde=True and stat='density' is the
# replacement seaborn documents (requires seaborn >= 0.11). kde_kws={'cut': 3}
# lets the curve extend past the data range the way distplot's curve did.
sns.histplot(data['duration'], bins=10, kde=True, stat='density', kde_kws={'cut': 3})

In [None]:
# Box plot of the same duration data
data['duration'].plot.box()

In [None]:
# Count how many movies fall into each distinct genre
make_dist = data.groupby(by='genre').size()
make_dist

In [None]:
# Frequency distribution of each genre.
# Genres are unordered categories, so the counts are drawn as bars; the
# default line plot would suggest a trend between unrelated genres.
make_dist.plot(kind='bar', title=" Distribution of genres")

In [None]:
# Keep only the float64 / int64 columns for the numeric analysis below
numeric_kinds = ["float64", "int64"]
data_num = data.select_dtypes(include=numeric_kinds)
data_num.head()

In [None]:
# Histogram of each numeric variable (one subplot per column, 20 bins each)
data_num.hist(bins= 20)

In [None]:
# Pairwise Pearson correlation coefficients between all numeric variables
data_corr = data_num.corr(method='pearson')
data_corr

In [None]:
# Scatter-plot matrix of the numeric variables to spot relationships, if any
sns.pairplot(data=data_num)

In [None]:
# Heatmap of the correlation matrix, with each coefficient written in its cell
sns.heatmap(data=data_corr, annot=True)

In [None]:
# Bar chart of the number of titles per content rating, with a title and
# axis labels
rating_counts = data.groupby('content_rating')[['title']].count()
rating_counts.plot(kind='bar', title='Content Rating Visualization')
plt.xlabel('Content Rating')
plt.ylabel('Title Count')

In [None]:
# Number of missing values per column
data.isna().sum()

In [None]:
# If there are missing values: examine them, then fill them in with
# "reasonable" values.
# NOTE: the expression below is not the last statement of the cell, so a
# notebook will not render it; run it in its own cell to inspect the rows.
data[data['content_rating'].isnull()]
# .at is a scalar accessor; use .loc with a list of row labels to set several
# rows at once.
# NOTE(review): the row labels 187, 649 and 936 are hard-coded and presumably
# refer to the default integer index produced by read_csv -- confirm they still
# point at the rows with a missing content_rating if the CSV changes.
data.loc[[187, 649], 'content_rating'] = 'PG'
data.loc[936, 'content_rating'] = 'PG-13'

In [None]:
# Calculate the average star rating for movies 2 hours (120 minutes) or longer,
# and compare that with the average star rating for movies shorter than 2 hours.
# Each mean is computed once and reused in the report below.
long_movies = data['duration'] >= 120
short_movies = data['duration'] < 120

avg_rating_long = data.loc[long_movies, 'star_rating'].mean()
avg_rating_short = data.loc[short_movies, 'star_rating'].mean()

print('Avg. star rating for movies 2 hours or longer: ', avg_rating_long,
      '\nAvg. star rating for movies shorter than 2 hours: ', avg_rating_short)

In [None]:
# Mean movie duration within each genre
data.groupby('genre')[['duration']].mean()

In [None]:
# One duration box per star-rating value, to check for a relationship between
# duration and star rating
box1 = sns.boxplot(data=data, x='star_rating', y='duration')

In [None]:
# Determine the top rated movie (by star rating) for each genre.
# Rows are sorted by star_rating descending, so .first() picks the highest
# rated row of each genre; groupby sorts its keys, so the result is listed in
# alphabetical genre order. Movies tied on rating are picked arbitrarily.
# The columns are selected with a list: selecting several columns with a bare
# tuple ('title', 'star_rating') raises on pandas >= 2.0.
data.sort_values('star_rating', ascending=False).groupby('genre')[['title', 'star_rating']].first()

In [None]:
# List every movie whose title occurs more than once, sorted by title so that
# same-named movies sit next to each other and can be compared to decide
# whether they are real duplicates
result = data[data['title'].duplicated(keep=False)]
result.sort_values('title')

In [None]:
# Calculate the average star rating for each genre, but only include genres
# with at least 10 movies (>= 10, so a genre with exactly 10 movies is kept).
genre_counts = data['genre'].value_counts()
genres = genre_counts[genre_counts >= 10].index
data[data['genre'].isin(genres)].groupby('genre')['star_rating'].mean()

In [None]:
# Number of movies done by each actor
def repp(string):
    """Convert one movie's stringified actor list into a comma-separated string.

    The input is expected to look like a Python list literal, e.g.
    "[u'Tim Robbins', u'Morgan Freeman']", and is turned into
    "Tim Robbins, Morgan Freeman".

    The text is parsed with ast.literal_eval instead of chained str.replace
    calls: blindly removing every "u'" also cut the final letter off names
    ending in "u" ("Andy Lau" became "Andy La") and left double-quoted names
    such as u"Peter O'Toole" unparsed.

    Parameters
    ----------
    string : str
        Textual representation of a list of actor names.

    Returns
    -------
    str
        The actor names joined by ", ".
    """
    import ast  # function-scope import keeps this definition self-contained

    try:
        names = ast.literal_eval(string)
    except (ValueError, SyntaxError):
        names = None

    if isinstance(names, (list, tuple)):
        return ", ".join(str(name) for name in names)

    # Input is not a list literal: fall back to the original text clean-up so
    # behaviour for such values is unchanged.
    return string.replace("[","").replace("]","").replace("u'","").replace("',",",")[:-1]
    
# Cleaned, comma-separated cast string for every movie
movies_series = data['actors_list'].apply(repp)

# One list of stripped actor names per movie
actors_list = [
    [name.strip() for name in cast.split(',')]
    for cast in movies_series
]

# Tally the number of movies in which each actor name appears
actor_dict = {}
for cast in actors_list:
    for name in cast:
        actor_dict[name] = actor_dict.get(name, 0) + 1

actor_dict


In [None]:
# Number of movies done by all actors, sorted by the count of movies
# (most prolific actors first). Built from the per-actor tally in actor_dict.
pd.Series(actor_dict, name='movie_count').sort_values(ascending=False)

In [None]:
# Regression plot: scatter of duration against star rating with a fitted
# linear regression line.
# x and y are passed by keyword; seaborn >= 0.12 no longer accepts them
# positionally.
sns.regplot(x="star_rating", y="duration", data=data)