In [2]:
pip install pandas

Note: you may need to restart the kernel to use updated packages.


In [5]:
import pandas as pd

df = pd.read_csv('imdb.csv')

df.head()


Unnamed: 0,Rank,Movie_name,Year,Certificate,Runtime_in_min,Genre,Metascore,Gross_in_$_M,Rating_from_10
0,1,The Shawshank Redemption,1994,R,142,Drama,81.0,28.34,9.3
1,2,The Godfather,1972,R,175,"Crime, Drama",100.0,134.97,9.2
2,3,The Dark Knight,2008,PG-13,152,"Action, Crime, Drama",84.0,534.86,9.0
3,4,The Lord of the Rings: The Return of the King,2003,PG-13,201,"Action, Adventure, Drama",94.0,377.85,9.0
4,5,Schindler's List,1993,R,195,"Biography, Drama, History",94.0,96.9,9.0


In [7]:
df.rename(columns={'Rating_from_10': 'Rating'}, inplace=True)

In [8]:
df.head()

Unnamed: 0,Rank,Movie_name,Year,Certificate,Runtime_in_min,Genre,Metascore,Gross_in_$_M,Rating
0,1,The Shawshank Redemption,1994,R,142,Drama,81.0,28.34,9.3
1,2,The Godfather,1972,R,175,"Crime, Drama",100.0,134.97,9.2
2,3,The Dark Knight,2008,PG-13,152,"Action, Crime, Drama",84.0,534.86,9.0
3,4,The Lord of the Rings: The Return of the King,2003,PG-13,201,"Action, Adventure, Drama",94.0,377.85,9.0
4,5,Schindler's List,1993,R,195,"Biography, Drama, History",94.0,96.9,9.0


In [9]:
print(df.isnull().sum())

# Drop rows with missing important columns (like Genre, Rating, etc.)
df.dropna(subset=['Genre', 'Rating'], inplace=True)

# Alternatively, you can fill missing numeric values with the median or mean
df['Metascore'].fillna(df['Metascore'].median(), inplace=True)

Rank                0
Movie_name          0
Year                0
Certificate         7
Runtime_in_min      0
Genre               0
Metascore         160
Gross_in_$_M      159
Rating              0
dtype: int64


In [10]:
# Check which entries in the 'Year' column are not numeric
non_numeric_years = df[~df['Year'].str.isnumeric()]
print(non_numeric_years[['Movie_name', 'Year']])


             Movie_name      Year
37                   96   II 2018
64                Joker    I 2019
69                 Coco    I 2017
149          Inside Out    I 2015
159          The Father    I 2020
213                Pink  III 2016
223                Room    I 2015
235                Rush    I 2013
237           Spotlight    I 2015
345               Mommy    I 2014
384      The Sea Inside    I 2004
477                Baby    I 2015
481              Wonder    I 2017
482             Arrival   II 2016
491          The Artist    I 2011
503             Boyhood    I 2014
524       No Man's Land    I 2001
617               Pride    I 2014
624          About Time    I 2013
650         The Fighter    I 2010
651               Taken    I 2008
652                Once    I 2007
656               Drive    I 2011
660            The Fall    I 2006
669               Crash    I 2004
758             Get Out    I 2017
801             Flipped    I 2010
817                 Ray    I 2004
854           

In [11]:
df['Year'] = df['Year'].str.extract('(\d+)')

# Convert the 'Year' column to integer, coercing errors to NaN (in case of invalid values)
df['Year'] = pd.to_numeric(df['Year'], errors='coerce')

# Check for any remaining NaN values and decide how to handle them
print(df[df['Year'].isna()])

Empty DataFrame
Columns: [Rank, Movie_name, Year, Certificate, Runtime_in_min, Genre, Metascore, Gross_in_$_M, Rating]
Index: []


In [12]:
df.dropna(subset=['Year'], inplace=True)

In [13]:
# Convert 'Rating' to numeric if it's in string format
df['Rating'] = pd.to_numeric(df['Rating'], errors='coerce')

# Convert 'Year' to integer
df['Year'] = df['Year'].astype(int)

# Check data types after conversion
print(df.dtypes)

Rank               object
Movie_name         object
Year                int32
Certificate        object
Runtime_in_min      int64
Genre              object
Metascore         float64
Gross_in_$_M       object
Rating            float64
dtype: object


In [14]:
genre_df = df['Genre'].str.get_dummies(sep=',')
df = pd.concat([df, genre_df], axis=1)

# One-hot encode the Certificate column (if applicable)
certificate_df = pd.get_dummies(df['Certificate'], prefix='Certificate')
df = pd.concat([df, certificate_df], axis=1)

print(df.head())

  Rank                                     Movie_name  Year Certificate  \
0    1                       The Shawshank Redemption  1994           R   
1    2                                  The Godfather  1972           R   
2    3                                The Dark Knight  2008       PG-13   
3    4  The Lord of the Rings: The Return of the King  2003       PG-13   
4    5                               Schindler's List  1993           R   

   Runtime_in_min                      Genre  Metascore Gross_in_$_M  Rating  \
0             142                      Drama       81.0        28.34     9.3   
1             175               Crime, Drama      100.0       134.97     9.2   
2             152       Action, Crime, Drama       84.0       534.86     9.0   
3             201   Action, Adventure, Drama       94.0       377.85     9.0   
4             195  Biography, Drama, History       94.0        96.90     9.0   

    Action  ...  Certificate_PG  Certificate_PG-13  Certificate_Pass

In [15]:
# Select important columns for similarity
content_features = ['Metascore', 'Gross_in_$_M', 'Rating'] + genre_df.columns.tolist()

# Create a matrix with these features
content_matrix = df[content_features]

# Check the resulting matrix
print(content_matrix.head())

   Metascore Gross_in_$_M  Rating   Action   Adventure   Biography   Comedy  \
0       81.0        28.34     9.3        0           0           0        0   
1      100.0       134.97     9.2        0           0           0        0   
2       84.0       534.86     9.0        0           0           0        0   
3       94.0       377.85     9.0        0           1           0        0   
4       94.0        96.90     9.0        0           0           0        0   

    Crime   Drama   Family  ...  Comedy  Crime  Drama  Family  Fantasy  \
0       0       0        0  ...       0      0      1       0        0   
1       0       1        0  ...       0      1      0       0        0   
2       1       1        0  ...       0      0      0       0        0   
3       0       1        0  ...       0      0      0       0        0   
4       0       1        0  ...       0      0      0       0        0   

   Film-Noir  Horror  Mystery  Thriller  Western  
0          0       0        0

In [16]:
# Check for rows that contain non-numeric values in the 'Metascore', 'Gross_in_$_M', and 'Rating' columns
non_numeric_rows = df[~df['Metascore'].apply(pd.to_numeric, errors='coerce').notnull()]
print(non_numeric_rows[['Movie_name', 'Metascore']])

Empty DataFrame
Columns: [Movie_name, Metascore]
Index: []


In [17]:
# Remove non-numeric characters from 'Gross_in_$_M' and convert to numeric
df['Gross_in_$_M'] = df['Gross_in_$_M'].replace(r'[^\d.]', '', regex=True).astype(float)

# If 'Metascore' has any invalid values (like strings), convert to NaN and handle them
df['Metascore'] = pd.to_numeric(df['Metascore'], errors='coerce')

# Similarly, handle invalid values in 'Rating' (if applicable)
df['Rating'] = pd.to_numeric(df['Rating'], errors='coerce')

# After conversion, check for any rows with NaN values
print(df.isnull().sum())


Rank                   0
Movie_name             0
Year                   0
Certificate            7
Runtime_in_min         0
                      ..
Certificate_TV-MA      0
Certificate_TV-PG      0
Certificate_U          0
Certificate_UA         0
Certificate_Unrated    0
Length: 62, dtype: int64


In [18]:
# Drop rows with NaN values in critical columns
df.dropna(subset=['Metascore', 'Gross_in_$_M', 'Rating'], inplace=True)

# Or, fill NaN values with the column mean (or median)
df['Metascore'].fillna(df['Metascore'].mean(), inplace=True)
df['Gross_in_$_M'].fillna(df['Gross_in_$_M'].mean(), inplace=True)
df['Rating'].fillna(df['Rating'].mean(), inplace=True)

# Check if NaN values are handled
print(df.isnull().sum())


Rank                   0
Movie_name             0
Year                   0
Certificate            1
Runtime_in_min         0
                      ..
Certificate_TV-MA      0
Certificate_TV-PG      0
Certificate_U          0
Certificate_UA         0
Certificate_Unrated    0
Length: 62, dtype: int64


In [19]:
from sklearn.preprocessing import StandardScaler

# Normalize the numeric features
scaler = StandardScaler()
normalized_content_matrix = scaler.fit_transform(df[['Metascore', 'Gross_in_$_M', 'Rating']])

# Check the normalized matrix
print(normalized_content_matrix[:5])


[[ 0.22291447 -0.39046842  4.67866247]
 [ 1.88040647  0.55836582  4.32690535]
 [ 0.48462373  4.11673895  3.62339111]
 [ 1.35698794  2.71960433  3.62339111]
 [ 1.35698794  0.2196045   3.62339111]]


In [20]:
from sklearn.metrics.pairwise import cosine_similarity

# Calculate cosine similarity between movies based on the content features
movie_similarity = cosine_similarity(normalized_content_matrix)

# Create a DataFrame to store the similarity matrix
similarity_df = pd.DataFrame(movie_similarity, index=df['Movie_name'], columns=df['Movie_name'])

# Check the similarity matrix
print(similarity_df.head())


Movie_name                                     The Shawshank Redemption  \
Movie_name                                                                
The Shawshank Redemption                                       1.000000   
The Godfather                                                  0.915611   
The Dark Knight                                                0.597170   
The Lord of the Rings: The Return of the King                  0.728475   
Schindler's List                                               0.942587   

Movie_name                                     The Godfather  The Dark Knight  \
Movie_name                                                                      
The Shawshank Redemption                            0.915611         0.597170   
The Godfather                                       1.000000         0.722137   
The Dark Knight                                     0.722137         1.000000   
The Lord of the Rings: The Return of the King       0.878952         

In [21]:
def recommend_movies(movie_name, similarity_df, top_n=5):
    # Get similarity scores for the given movie
    similar_movies = similarity_df[movie_name]
    
    # Sort the movies by similarity in descending order
    similar_movies = similar_movies.sort_values(ascending=False)
    
    # Get the top N most similar movies (excluding the input movie)
    top_similar_movies = similar_movies.iloc[1:top_n+1]
    
    print(f"Top {top_n} recommendations for '{movie_name}':")
    print(top_similar_movies)

# Call the function with a movie name
recommend_movies("The Godfather", similarity_df, top_n=5)


Top 5 recommendations for 'The Godfather':
Movie_name
Pulp Fiction        0.999459
Schindler's List    0.996813
Alien               0.994843
Aliens              0.994024
Hamilton            0.989338
Name: The Godfather, dtype: float64
