In [22]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import ast

In [27]:
# Load the TMDB credits data (relative path: the CSV must be in the working directory)

df = pd.read_csv("tmdb_5000_credits.csv")

# Preview the first rows

# Keep the 'crew' column: it is read later (df['crew']) to extract each director.
df.head()


Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


Extract the names of the top 3 actors from each movie's cast list (stored as the string form of a list of dictionaries), and get the director's name from the crew list.

In [25]:
def get_top_cast(cast_str, n=3):
    """Return the names of the first ``n`` cast entries as one space-separated string.

    Parameters
    ----------
    cast_str : str
        Text holding a Python-literal list of dictionaries, each with a
        ``'name'`` key, e.g. ``"[{'name': 'A'}, {'name': 'B'}]"``.
    n : int, default 3
        Number of leading entries of the list to keep.

    Returns
    -------
    str
        The selected names joined by single spaces, or ``''`` when
        ``cast_str`` cannot be parsed or lacks the expected structure
        (missing value, malformed text, entry without a ``'name'`` key, ...).
    """
    try:
        cast = ast.literal_eval(cast_str)
        return ' '.join(actor['name'] for actor in cast[:n])
    except (ValueError, SyntaxError, TypeError, LookupError,
            MemoryError, RecursionError):
        # Best-effort extraction: bad rows yield an empty feature instead of
        # aborting the whole column. Only parsing/structure errors are caught,
        # so KeyboardInterrupt and SystemExit still propagate.
        return ''

def get_director(crew_str):
    """Return the name of the first crew entry whose job is ``'Director'``.

    Parameters
    ----------
    crew_str : str
        Text holding a Python-literal list of dictionaries with ``'job'`` and
        ``'name'`` keys, e.g. ``"[{'job': 'Director', 'name': 'X'}]"``.

    Returns
    -------
    str
        The ``'name'`` of the first director entry, or ``''`` when
        ``crew_str`` cannot be parsed, is not a list, or has no usable
        director entry.
    """
    try:
        crew = ast.literal_eval(crew_str)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Unparseable or missing value: best-effort empty result. Only parsing
        # errors are caught, so KeyboardInterrupt/SystemExit still propagate.
        return ''

    if not isinstance(crew, (list, tuple)):
        return ''

    for member in crew:
        # Skip malformed entries instead of aborting the search, so one bad
        # record cannot hide a valid director that appears later in the list.
        if not isinstance(member, dict):
            continue
        if member.get('job') == 'Director' and 'name' in member:
            return member['name']
    return ''



In [29]:
# Derive the two text features from the stringified credit columns:
# the leading cast names and the director's name, one value per movie.
df['top_cast'] = df['cast'].map(get_top_cast)
df['director'] = df['crew'].map(get_director)


In [30]:
# Create a new combined feature: top-cast names followed by the director's name
df['tags'] = df['top_cast'] + ' ' + df['director']

# Vectorize the text into token counts.
# NOTE(review): the default tokenizer splits every name into separate
# first-/last-name tokens, so people who merely share a first name add to the
# similarity score; stop_words='english' is aimed at prose and may also drop
# name tokens that happen to be English words -- confirm both are intended.
cv = CountVectorizer(max_features=5000, stop_words='english')
# Keep the document-term matrix sparse: cosine_similarity accepts sparse input
# and still returns a dense (n_movies x n_movies) array, so densifying the
# mostly-zero count matrix with .toarray() only costs memory.
vectors = cv.fit_transform(df['tags'])

# Compute cosine similarity between every pair of movies
similarity = cosine_similarity(vectors)

In [31]:
# Show the pairwise similarity matrix (the printed output is abbreviated with ellipses)
print(similarity)

[[1.         0.         0.13363062 ... 0.         0.         0.        ]
 [0.         1.         0.         ... 0.         0.         0.        ]
 [0.13363062 0.         1.         ... 0.         0.26726124 0.        ]
 ...
 [0.         0.         0.         ... 1.         0.         0.        ]
 [0.         0.         0.26726124 ... 0.         1.         0.        ]
 [0.         0.         0.         ... 0.         0.         1.        ]]


In [32]:
# Recommendation function
def recommend(movie_title, top_n=9):
    """Print the titles of the movies most similar to ``movie_title``.

    Reads the notebook-level ``df`` (needs a ``'title'`` column) and
    ``similarity`` (square matrix whose row/column order matches ``df``).

    Parameters
    ----------
    movie_title : str
        Exact title to look up; the first matching row is used.
    top_n : int, default 9
        Number of recommendations to print (expected to be non-negative).

    Returns
    -------
    str or None
        ``"Movie not found!"`` when the title is absent; otherwise ``None``
        after printing a header line followed by one title per line.
    """
    title_matches = (df['title'] == movie_title).to_numpy().nonzero()[0]
    if len(title_matches) == 0:
        return "Movie not found!"

    # Positional row of the first match. `similarity` is indexed by position,
    # so this stays correct even if df's index is not 0..n-1.
    idx = int(title_matches[0])

    # Exclude the queried movie explicitly rather than assuming it sorts first:
    # with tied scores (identical tags) or an all-zero tag vector it may not be
    # at position 0, and dropping "the first item" would discard another movie
    # while listing the query itself as a recommendation.
    candidates = [
        (pos, score)
        for pos, score in enumerate(similarity[idx])
        if pos != idx
    ]
    # sorted() is stable, so equally similar movies keep their original order.
    sorted_movies = sorted(candidates, key=lambda pair: pair[1], reverse=True)[:top_n]

    print(f"\nMovies similar to '{movie_title}':")
    titles = df['title']
    for pos, _score in sorted_movies:
        print(titles.iloc[pos])


In [35]:
# Example query: print the movies ranked most similar to "Toy Story"
recommend("Toy Story")


Movies similar to 'Toy Story':
Toy Story 2
Toy Story 3
That Thing You Do!
Larry Crowne
The Santa Clause
Animal House
Wild Hogs
Joe Somebody
Saving Mr. Banks
