In [124]:
import pandas as pd
import numpy as np

In [125]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [126]:
df= pd.read_csv('movie_data.csv')
df['movie_title'] = df['movie_title'].str.strip()



In [127]:
df.head()


Unnamed: 0,index,actor_2_name,genres,actor_1_name,movie_title,actor_3_name,imdb_score
0,0,Joel David Moore,Action|Adventure|Fantasy|Sci-Fi,CCH Pounder,Avatar,Wes Studi,7.9
1,1,Orlando Bloom,Action|Adventure|Fantasy,Johnny Depp,Pirates of the Caribbean: At World's End,Jack Davenport,7.1
2,2,Rory Kinnear,Action|Adventure|Thriller,Christoph Waltz,Spectre,Stephanie Sigman,6.8
3,3,Christian Bale,Action|Thriller,Tom Hardy,The Dark Knight Rises,Joseph Gordon-Levitt,8.5
4,4,Rob Walker,Documentary,Doug Walker,Star Wars: Episode VII - The Force Awakens,,7.1


In [128]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5043 entries, 0 to 5042
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   index         5043 non-null   int64  
 1   actor_2_name  5030 non-null   object 
 2   genres        5043 non-null   object 
 3   actor_1_name  5036 non-null   object 
 4   movie_title   5043 non-null   object 
 5   actor_3_name  5020 non-null   object 
 6   imdb_score    5043 non-null   float64
dtypes: float64(1), int64(1), object(5)
memory usage: 275.9+ KB


In [129]:
print(df.isnull().sum())

index            0
actor_2_name    13
genres           0
actor_1_name     7
movie_title      0
actor_3_name    23
imdb_score       0
dtype: int64


In [130]:
missing_all_actors = df[['actor_1_name', 'actor_2_name', 'actor_3_name']].isnull().all(axis=1)


In [131]:
print(df[missing_all_actors])

      index actor_2_name              genres actor_1_name  \
4502   4502          NaN         Documentary          NaN   
4519   4519          NaN   Documentary|Drama          NaN   
4720   4720          NaN         Documentary          NaN   
4837   4837          NaN         Documentary          NaN   
4945   4945          NaN  Documentary|Family          NaN   
4946   4946          NaN     Documentary|War          NaN   
4990   4990          NaN         Documentary          NaN   

                  movie_title actor_3_name  imdb_score  
4502       Pink Ribbons, Inc.          NaN         7.4  
4519       Sex with Strangers          NaN         4.7  
4720   The Harvest/La Cosecha          NaN         7.2  
4837   Ayurveda: Art of Being          NaN         7.6  
4945     The Brain That Sings          NaN         8.2  
4946  The Blood of My Brother          NaN         6.6  
4990                 Counting          NaN         6.0  


In [132]:
df.fillna('Unknown',inplace=True)

In [133]:
print(df[missing_all_actors])

      index actor_2_name              genres actor_1_name  \
4502   4502      Unknown         Documentary      Unknown   
4519   4519      Unknown   Documentary|Drama      Unknown   
4720   4720      Unknown         Documentary      Unknown   
4837   4837      Unknown         Documentary      Unknown   
4945   4945      Unknown  Documentary|Family      Unknown   
4946   4946      Unknown     Documentary|War      Unknown   
4990   4990      Unknown         Documentary      Unknown   

                  movie_title actor_3_name  imdb_score  
4502       Pink Ribbons, Inc.      Unknown         7.4  
4519       Sex with Strangers      Unknown         4.7  
4720   The Harvest/La Cosecha      Unknown         7.2  
4837   Ayurveda: Art of Being      Unknown         7.6  
4945     The Brain That Sings      Unknown         8.2  
4946  The Blood of My Brother      Unknown         6.6  
4990                 Counting      Unknown         6.0  


In [134]:
df['actors']=df[['actor_1_name','actor_2_name','actor_3_name']].apply(lambda x:','.join(x.dropna()),axis=1)

In [135]:
df.head()

Unnamed: 0,index,actor_2_name,genres,actor_1_name,movie_title,actor_3_name,imdb_score,actors
0,0,Joel David Moore,Action|Adventure|Fantasy|Sci-Fi,CCH Pounder,Avatar,Wes Studi,7.9,"CCH Pounder,Joel David Moore,Wes Studi"
1,1,Orlando Bloom,Action|Adventure|Fantasy,Johnny Depp,Pirates of the Caribbean: At World's End,Jack Davenport,7.1,"Johnny Depp,Orlando Bloom,Jack Davenport"
2,2,Rory Kinnear,Action|Adventure|Thriller,Christoph Waltz,Spectre,Stephanie Sigman,6.8,"Christoph Waltz,Rory Kinnear,Stephanie Sigman"
3,3,Christian Bale,Action|Thriller,Tom Hardy,The Dark Knight Rises,Joseph Gordon-Levitt,8.5,"Tom Hardy,Christian Bale,Joseph Gordon-Levitt"
4,4,Rob Walker,Documentary,Doug Walker,Star Wars: Episode VII - The Force Awakens,Unknown,7.1,"Doug Walker,Rob Walker,Unknown"


In [136]:
genresnum = pd.get_dummies(df['genres'])

In [137]:
df = pd.concat([df, genresnum], axis=1)

In [138]:
df.head()

Unnamed: 0,index,actor_2_name,genres,actor_1_name,movie_title,actor_3_name,imdb_score,actors,Action,Action|Adventure,...,Mystery|Western,Romance,Romance|Sci-Fi|Thriller,Romance|Short,Sci-Fi,Sci-Fi|Thriller,Thriller,Thriller|War,Thriller|Western,Western
0,0,Joel David Moore,Action|Adventure|Fantasy|Sci-Fi,CCH Pounder,Avatar,Wes Studi,7.9,"CCH Pounder,Joel David Moore,Wes Studi",False,False,...,False,False,False,False,False,False,False,False,False,False
1,1,Orlando Bloom,Action|Adventure|Fantasy,Johnny Depp,Pirates of the Caribbean: At World's End,Jack Davenport,7.1,"Johnny Depp,Orlando Bloom,Jack Davenport",False,False,...,False,False,False,False,False,False,False,False,False,False
2,2,Rory Kinnear,Action|Adventure|Thriller,Christoph Waltz,Spectre,Stephanie Sigman,6.8,"Christoph Waltz,Rory Kinnear,Stephanie Sigman",False,False,...,False,False,False,False,False,False,False,False,False,False
3,3,Christian Bale,Action|Thriller,Tom Hardy,The Dark Knight Rises,Joseph Gordon-Levitt,8.5,"Tom Hardy,Christian Bale,Joseph Gordon-Levitt",False,False,...,False,False,False,False,False,False,False,False,False,False
4,4,Rob Walker,Documentary,Doug Walker,Star Wars: Episode VII - The Force Awakens,Unknown,7.1,"Doug Walker,Rob Walker,Unknown",False,False,...,False,False,False,False,False,False,False,False,False,False


In [139]:
# TF-IDF Vectorization
tfidf = TfidfVectorizer(stop_words='english')

# Fit and transform on the entire dataset
tfidf_matrix = tfidf.fit_transform(df['actors'])


Recommendation Fucntion

In [140]:
# Function to get movie recommendations based on actors and genre
def get_recommendations(actors, genre, top_n=10):
    # Transform the input actors using the same TF-IDF vectorizer
    input_vector = tfidf.transform([actors])
    
    # Compute cosine similarity between input vector and entire dataset
    cosine_sim = cosine_similarity(input_vector, tfidf_matrix).flatten()
    
    # Filter movies by genre
    genre_filtered_df = df[df['genres'].str.contains(genre)]
    
    if genre_filtered_df.empty:
        return []

    # Get similarity scores for the filtered genre movies
    genre_cosine_sim = cosine_sim[genre_filtered_df.index]
    
    # Combine cosine similarity score and IMDb rating
    ratings = genre_filtered_df['imdb_score'].values
    combined_score = genre_cosine_sim + (ratings / 10)  # Normalize ratings to 0-1 scale
    
    # Sort the movies based on combined score
    sorted_indices = genre_filtered_df.index[np.argsort(combined_score)[::-1]]
    
    # Get top N similar movies
    top_movies = df.loc[sorted_indices[:top_n]]
    
    # Return movie titles as a list
    return top_movies['movie_title'].tolist()


In [141]:
# Example usage
actors_input = "Bradley Cooper,Keir O'Donnell"
genre_input = 'Thriller'

recommendations = get_recommendations(actors_input, genre_input)
print(recommendations)

['American Sniper', 'The Words', '10 Cloverfield Lane', 'High Noon', 'The Night Visitor', 'Fargo', 'The Dark Knight', 'Case 39', 'Inception', 'Daredevil']
