In [1]:
# Cell 1: Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [46]:
df = pd.read_csv('movies.csv')
print("Data Loaded Successfully!")
df.head()

Data Loaded Successfully!


Unnamed: 0,imdbID,title,year,rating,runtime,genre,released,director,writer,cast,...,imdbRating,imdbVotes,poster,plot,fullplot,language,country,awards,lastupdated,type
0,1,Carmencita,1894,NOT RATED,1 min,"Documentary, Short",,William K.L. Dickson,,Carmencita,...,5.9,1032.0,https://m.media-amazon.com/images/M/MV5BMjAzND...,Performing on what looks like a small wooden s...,Performing on what looks like a small wooden s...,,USA,,2015-08-26 00:03:45.040000000,movie
1,5,Blacksmith Scene,1893,UNRATED,1 min,Short,1893-05-09,William K.L. Dickson,,"Charles Kayser, John Ott",...,6.2,1189.0,,Three men hammer on an anvil and pass a bottle...,A stationary camera looks at a large anvil wit...,,USA,1 win.,2015-08-26 00:03:50.133000000,movie
2,3,Pauvre Pierrot,1892,,4 min,"Animation, Comedy, Short",1892-10-28,�mile Reynaud,,,...,6.7,566.0,,"One night, Arlequin come to see his lover Colo...","One night, Arlequin come to see his lover Colo...",,France,,2015-08-12 00:06:02.720000000,movie
3,8,Edison Kinetoscopic Record of a Sneeze,1894,,1 min,"Documentary, Short",1894-01-09,William K.L. Dickson,,Fred Ott,...,5.9,988.0,,A man (Thomas Edison's assistant) takes a pinc...,A man (Edison's assistant) takes a pinch of sn...,,USA,,2015-08-10 00:21:07.127000000,movie
4,10,Employees Leaving the Lumi�re Factory,1895,,1 min,"Documentary, Short",1895-03-22,Louis Lumi�re,,,...,6.9,3469.0,,A man opens the big gates to the Lumi�re facto...,A man opens the big gates to the Lumi�re facto...,,France,,2015-08-26 00:03:56.603000000,movie


In [8]:
print("\nDataset Info:")
(df.info())


Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46014 entries, 0 to 46013
Data columns (total 21 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   imdbID       46014 non-null  int64  
 1   title        46014 non-null  object 
 2   year         46014 non-null  object 
 3   rating       24629 non-null  object 
 4   runtime      43377 non-null  object 
 5   genre        45615 non-null  object 
 6   released     43002 non-null  object 
 7   director     45409 non-null  object 
 8   writer       42942 non-null  object 
 9   cast         44928 non-null  object 
 10  metacritic   9401 non-null   float64
 11  imdbRating   44300 non-null  float64
 12  imdbVotes    44299 non-null  float64
 13  poster       35798 non-null  object 
 14  plot         42161 non-null  object 
 15  fullplot     41421 non-null  object 
 16  language     44899 non-null  object 
 17  country      45830 non-null  object 
 18  awards       23542 non-null  ob

In [24]:
df.shape

(46014, 21)

In [9]:
print("\nStatistical Summary:")
(df.describe())


Statistical Summary:


Unnamed: 0,imdbID,metacritic,imdbRating,imdbVotes
count,46014.0,9401.0,44300.0,44299.0
mean,855206.2,56.896288,6.381454,11800.55
std,1141979.0,17.715373,1.183113,49344.99
min,1.0,1.0,1.1,5.0
25%,81657.5,45.0,5.8,326.0
50%,252487.5,58.0,6.6,999.0
75%,1370868.0,70.0,7.2,3829.0
max,5023260.0,100.0,9.6,1521105.0


In [25]:
df.isna().sum()

imdbID             0
title              0
year               0
rating         21385
runtime         2637
genre            399
released        3012
director         605
writer          3072
cast            1086
metacritic     36613
imdbRating      1714
imdbVotes       1715
poster         10216
plot            3853
fullplot        4593
language        1115
country          184
awards         22472
lastupdated        0
type              38
dtype: int64

In [40]:
df['runtime'] = df['runtime'].astype(str).str.extract(r'(\d+)').astype(float)
df['rating'] = df['rating'].fillna(df['rating'].mode()[0])
df['runtime'] = df['runtime'].fillna(df['runtime'].mean())
df['genre'] = df['genre'].fillna('Unknown')
df['director'] = df['director'].fillna('Unknown')
df['writer'] = df['writer'].fillna('Unknown')
df['cast'] = df['cast'].fillna('Unknown')
df['imdbRating'] = df['imdbRating'].fillna(df['imdbRating'].mean())
df['year'] = pd.to_numeric(df['year'], errors='coerce')
df.dropna(subset=['imdbRating'], inplace=True)
print("\nData after handling missing values and incorrect data types:")
df.info()


Data after handling missing values and incorrect data types:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46014 entries, 0 to 46013
Data columns (total 21 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   imdbID       46014 non-null  int64  
 1   title        46014 non-null  object 
 2   year         45938 non-null  float64
 3   rating       46014 non-null  object 
 4   runtime      46014 non-null  float64
 5   genre        46014 non-null  object 
 6   released     43002 non-null  object 
 7   director     46014 non-null  object 
 8   writer       46014 non-null  object 
 9   cast         46014 non-null  object 
 10  metacritic   9401 non-null   float64
 11  imdbRating   46014 non-null  float64
 12  imdbVotes    44299 non-null  float64
 13  poster       35798 non-null  object 
 14  plot         42161 non-null  object 
 15  fullplot     41421 non-null  object 
 16  language     44899 non-null  object 
 17  country      45830 non-nul

In [45]:
df=pd.concat([df],axis=1)
df.dropna()
df.head()

Unnamed: 0,imdbID,title,year,rating,runtime,genre,released,director,writer,cast,...,imdbRating,imdbVotes,poster,plot,fullplot,language,country,awards,lastupdated,type
432,17136,Metropolis,1927,NOT RATED,153 min,"Drama, Sci-Fi",1927-03-13,Fritz Lang,"Thea von Harbou (screenplay), Thea von Harbou ...","Alfred Abel, Gustav Fr�hlich, Rudolf Klein-Rog...",...,8.3,99845.0,https://m.media-amazon.com/images/M/MV5BNDAzNT...,In a futuristic city sharply divided between t...,"Sometime in the future, the city of Metropolis...",German,Germany,3 wins & 4 nominations.,2015-09-04 00:09:30.553000000,movie
1458,27977,Modern Times,1936,G,87 min,"Comedy, Drama",1936-02-25,Charles Chaplin,Charles Chaplin,"Charles Chaplin, Paulette Goddard, Henry Bergm...",...,8.6,123603.0,https://m.media-amazon.com/images/M/MV5BMjAyMT...,The Tramp struggles to live in modern industri...,"Chaplins last 'silent' film, filled with sound...",English,USA,3 wins & 1 nomination.,2015-09-04 00:06:38.867000000,movie
1624,29453,P�p� le Moko,1937,NOT RATED,94 min,"Crime, Drama, Romance",1941-03-03,Julien Duvivier,"Henri La Barthe (novel), Henri La Barthe (scen...","Jean Gabin, Gabriel Gabrio, Saturnin Fabre, Fe...",...,7.8,4398.0,https://m.media-amazon.com/images/M/MV5BMTY5NT...,P�p� le Moko is a gangster from Paris that hid...,P�p� le Moko is a gangster from Paris that hid...,"French, Arabic",France,2 wins.,2015-09-10 17:45:12.290000000,movie
1675,29843,The Adventures of Robin Hood,1938,PG,102 min,"Action, Adventure, Romance",1938-05-14,"Michael Curtiz, William Keighley",Norman Reilly Raine (original screenplay: base...,"Errol Flynn, Olivia de Havilland, Basil Rathbo...",...,8.0,36789.0,https://m.media-amazon.com/images/M/MV5BMTUxMz...,When Prince John and the Norman Lords begin op...,"Sir Robin of Locksley, defender of downtrodden...",English,USA,Won 3 Oscars. Another 2 wins & 2 nominations.,2015-08-14 00:11:51.833000000,movie
1929,32138,The Wizard of Oz,1939,PASSED,102 min,"Adventure, Family, Fantasy",1939-08-25,"Victor Fleming, George Cukor, Mervyn LeRoy, No...","Noel Langley (screenplay), Florence Ryerson (s...","Judy Garland, Frank Morgan, Ray Bolger, Bert Lahr",...,8.1,262132.0,https://m.media-amazon.com/images/M/MV5BMTU0MT...,Dorothy Gale is swept away to a magical land i...,In this charming film based on the popular L. ...,English,USA,Won 2 Oscars. Another 7 wins & 13 nominations.,2015-09-01 00:29:05.490000000,movie
