In [93]:
import numpy as np
import pandas as pd
import ast
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
import pickle

# Load the data
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

# Merge movies and credits data
movies = movies.merge(credits, on='title')
movies = movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']]

# Drop rows with missing values
movies.dropna(inplace=True)

# Convert JSON fields to list of strings
def convert(text):
    if isinstance(text, str):
        try:
            return [i['name'] for i in ast.literal_eval(text)]
        except ValueError:
            return text  # Return as is if conversion fails
    return text  # Return as is if it's already a list

movies['genres'] = movies['genres'].apply(convert)
movies['keywords'] = movies['keywords'].apply(convert)
movies['cast'] = movies['cast'].apply(lambda x: convert(x)[:3])
movies['crew'] = movies['crew'].apply(lambda x: [i['name'] for i in ast.literal_eval(x) if i['job'] == 'Director'])

# Combine relevant features into a single 'tags' column
movies['overview'] = movies['overview'].apply(lambda x: x.split())
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']
new = movies[['movie_id', 'title', 'tags']]

# Convert 'tags' to a single string
new['tags'] = new['tags'].apply(lambda x: " ".join(x))

# Vectorize the 'tags' using CountVectorizer
cv = CountVectorizer(max_features=5000, stop_words='english')
vector = cv.fit_transform(new['tags'])

# Apply Truncated SVD for dimensionality reduction
svd = TruncatedSVD(n_components=300)  # Adjust n_components as needed
vector_reduced = svd.fit_transform(vector)

# Calculate cosine similarity on the reduced vectors
similarity = cosine_similarity(vector_reduced)

# Define number of nearest neighbors
k = 15

# Create lists to store top k neighbors and their distances
neighbors = []
distances = []

for i in range(len(new)):
    # Get the indices of the top k similar movies
    top_k_indices = np.argsort(similarity[i])[::-1][1:k+1]
    neighbors.append(top_k_indices.tolist())
    distances.append(similarity[i][top_k_indices].tolist())

# Combine neighbors and distances into a single data structure
similarity_data = {
    'neighbors': neighbors,
    'distances': distances
}

# Recommendation function
def recommend(movie):
    index = new[new['title'] == movie].index[0]
    top_indices = similarity_data['neighbors'][index]
    top_movies = new.iloc[top_indices]
    for title in top_movies['title']:
        print(title)

# Example usage
recommend('Spider-Man 3')

# Save the new data and similarity data
pickle.dump(new, open('movie_list.pkl', 'wb'))
pickle.dump(similarity_data, open('similarity.pkl', 'wb'))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new['tags'] = new['tags'].apply(lambda x: " ".join(x))


Spider-Man 2
The Amazing Spider-Man 2
Spider-Man
Hancock
Iron Man 2
In the Name of the King: A Dungeon Siege Tale
The Velocity of Gary
Two Lovers
The Amazing Spider-Man
Kites
Deadpool
Love in the Time of Cholera
Memoirs of an Invisible Man
Meet Joe Black
Megamind
