In [2]:
import numpy as np
import pandas as pd
import ast
import nltk
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

# Load the datasets
movies = pd.read_csv('data/tmdb_5000_movies.csv')
credits = pd.read_csv('data/tmdb_5000_credits.csv')

# Merge movies and credits on the 'title' column
movies = movies.merge(credits, on='title')

# Keep only the important columns
movies = movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']]

# Drop rows with missing values
movies.dropna(inplace=True)

# Remove duplicates if any
movies.drop_duplicates(inplace=True)

# Convert stringified lists into actual lists using 'ast.literal_eval'
def convert(text):
    return [i['name'] for i in ast.literal_eval(text)]

# Apply the conversion function to the 'genres' and 'keywords' columns
movies['genres'] = movies['genres'].apply(convert)
movies['keywords'] = movies['keywords'].apply(convert)

# Handle 'cast' column by keeping only the top 3 cast members
def convert_cast(text):
    return [i['name'] for i in ast.literal_eval(text)[:3]]

movies['cast'] = movies['cast'].apply(convert_cast)

# Extract the director's name from the 'crew' column
def fetch_director(text):
    for i in ast.literal_eval(text):
        if i['job'] == 'Director':
            return [i['name']]
    return []

movies['crew'] = movies['crew'].apply(fetch_director)

# Convert 'overview' column into a list of words
movies['overview'] = movies['overview'].apply(lambda x: x.split())

# Remove spaces from names in 'cast', 'crew', 'genres', and 'keywords'
def remove_space(L):
    return [i.replace(" ", "") for i in L]

movies['cast'] = movies['cast'].apply(remove_space)
movies['crew'] = movies['crew'].apply(remove_space)
movies['genres'] = movies['genres'].apply(remove_space)
movies['keywords'] = movies['keywords'].apply(remove_space)

# Concatenate all relevant columns into a single 'tags' column
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

# Create a new dataframe with 'movie_id', 'title', and 'tags' columns
new_df = movies[['movie_id', 'title', 'tags']]

# Convert the list of tags into a single string for each movie
new_df['tags'] = new_df['tags'].apply(lambda x: " ".join(x))

# Convert all tags to lowercase
new_df['tags'] = new_df['tags'].apply(lambda x: x.lower())

# Initialize the Porter Stemmer for stemming
ps = PorterStemmer()

# Define a function to apply stemming to the 'tags' column
def stems(text):
    return " ".join([ps.stem(word) for word in text.split()])

# Apply stemming to the 'tags' column
new_df['tags'] = new_df['tags'].apply(stems)

# Vectorize the 'tags' column using CountVectorizer
cv = CountVectorizer(max_features=5000, stop_words='english')
vector = cv.fit_transform(new_df['tags']).toarray()

# Compute the cosine similarity between the vectors
similarity = cosine_similarity(vector)

# Function to recommend movies based on similarity
def recommend(movie):
    index = new_df[new_df['title'] == movie].index[0]
    distances = sorted(list(enumerate(similarity[index])), reverse=True, key=lambda x: x[1])
    for i in distances[1:6]:
        print(new_df.iloc[i[0]].title)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x: " ".join(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x: x.lower())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(stems)


In [3]:
# Example usage
recommend('Spider-Man 2')

# Save the data and model using pickle
pickle.dump(new_df, open('artifacts/movie_list.pkl', 'wb'))
pickle.dump(similarity, open('artifacts/similarity.pkl', 'wb'))

Spider-Man 3
Spider-Man
The Amazing Spider-Man
Iron Man 2
Superman


██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████

In [1]:
import requests
import json

# List of API keys and their usage count
api_keys = ['7058f3e1', '38bad860', 'fd237622', 'e189e68f', 'b4d0b532', '4bede092', 'a9e2d1c']  # Add more keys as needed
api_key_usage = {key: 0 for key in api_keys}  # Dictionary to track API key usage
request_limit = 1000  # Set to OMDb's free daily limit

# Function to fetch movie details from the OMDB API using multiple keys
def movie_detail(movie_name):
    global api_keys, api_key_usage
    
    # Loop through each API key until one works or limit is hit
    for api_key in api_keys:
        # Check if the current API key has reached its limit
        if api_key_usage[api_key] < request_limit:
            url = f"http://www.omdbapi.com/?t={movie_name}&apikey={api_key}"
            
            try:
                response = requests.get(url)
                
                # If request is successful, check the status code
                if response.status_code == 200:
                    movie_data = response.json()
                    
                    # Increment the usage count for the current API key
                    api_key_usage[api_key] += 1
                    
                    # Check if the 'Title' key exists in the response data
                    if 'Title' in movie_data:
                        return movie_data
                    else:
                        # Log if there is a problem with the API response
                        print(f"API key {api_key} returned no 'Title' in the response. Response: {movie_data}")
                else:
                    print(f"API key {api_key} failed with status code: {response.status_code}")
                    
            except requests.exceptions.RequestException as e:
                print(f"Error fetching data with API key {api_key}: {e}")
    
    # If no keys worked or all hit the limit, return default values
    return {
        'Title': 'N/A',
        'Year': 'N/A',
        'Runtime': 'N/A',
        'imdbRating': 'N/A',
        'Poster': 'N/A'
    }

# Example usage
movie_data = movie_detail("Inception")
print(movie_data)


{'Title': 'Inception', 'Year': '2010', 'Rated': 'PG-13', 'Released': '16 Jul 2010', 'Runtime': '148 min', 'Genre': 'Action, Adventure, Sci-Fi', 'Director': 'Christopher Nolan', 'Writer': 'Christopher Nolan', 'Actors': 'Leonardo DiCaprio, Joseph Gordon-Levitt, Elliot Page', 'Plot': 'A thief who steals corporate secrets through the use of dream-sharing technology is given the inverse task of planting an idea into the mind of a C.E.O., but his tragic past may doom the project and his team to disaster.', 'Language': 'English, Japanese, French', 'Country': 'United States, United Kingdom', 'Awards': 'Won 4 Oscars. 159 wins & 220 nominations total', 'Poster': 'https://m.media-amazon.com/images/M/MV5BMjAxMzY3NjcxNF5BMl5BanBnXkFtZTcwNTI5OTM0Mw@@._V1_SX300.jpg', 'Ratings': [{'Source': 'Internet Movie Database', 'Value': '8.8/10'}, {'Source': 'Rotten Tomatoes', 'Value': '87%'}, {'Source': 'Metacritic', 'Value': '74/100'}], 'Metascore': '74', 'imdbRating': '8.8', 'imdbVotes': '2,592,712', 'imdbID'

In [1]:
import requests

# Function to fetch movie details from the OMDB API
api_keys = ['7058f3e1', '38bad860', 'fd237622', 'e189e68f', 'b4d0b532', '4bede092', 'a9e2d1c']  # Add more keys as needed

# Function to fetch movie details from the OMDB API using multiple keys
def movie_detail(movie_name):
    global api_keys
    # Loop through each API key until one works
    for api_key in api_keys:
        url = f"http://www.omdbapi.com/?t={movie_name}&apikey={api_key}"
        try:
            response = requests.get(url)
            # If request is successful, check the status code
            if response.status_code == 200:
                movie_data = response.json()
                # Check if the 'Title' key exists in the response data
                if 'Title' in movie_data:
                    print(f"API key {api_key}")
                    return movie_data
                else:
                    # Log if there is a problem with the API response
                    print(f"API key {api_key} returned no 'Title' in the response. Response: {movie_data}")
            else:
                print(f"API key {api_key} failed with status code: {response.status_code}")
        except requests.exceptions.RequestException as e:
            print(f"Error fetching data with API key {api_key}: {e}")
    # If no keys worked, return default values
    return {
        'Title': 'N/A',
        'Year': 'N/A',
        'Runtime': 'N/A',
        'imdbRating': 'N/A',
        'Poster': 'N/A'
    }

In [2]:
# Example usage
movie_data = movie_detail("Inception")
print(movie_data)

API key 7058f3e1 failed with status code: 401
API key 38bad860 failed with status code: 401
API key fd237622 failed with status code: 401
API key e189e68f
{'Title': 'Inception', 'Year': '2010', 'Rated': 'PG-13', 'Released': '16 Jul 2010', 'Runtime': '148 min', 'Genre': 'Action, Adventure, Sci-Fi', 'Director': 'Christopher Nolan', 'Writer': 'Christopher Nolan', 'Actors': 'Leonardo DiCaprio, Joseph Gordon-Levitt, Elliot Page', 'Plot': 'A thief who steals corporate secrets through the use of dream-sharing technology is given the inverse task of planting an idea into the mind of a C.E.O., but his tragic past may doom the project and his team to disaster.', 'Language': 'English, Japanese, French', 'Country': 'United States, United Kingdom', 'Awards': 'Won 4 Oscars. 159 wins & 220 nominations total', 'Poster': 'https://m.media-amazon.com/images/M/MV5BMjAxMzY3NjcxNF5BMl5BanBnXkFtZTcwNTI5OTM0Mw@@._V1_SX300.jpg', 'Ratings': [{'Source': 'Internet Movie Database', 'Value': '8.8/10'}, {'Source': '

In [3]:
import requests

# List of OMDB API keys
api_keys = ['7058f3e1', '38bad860', 'fd237622', 'e189e68f', 'b4d0b532', '4bede092', 'a9e2d1c']

# Function to check which API key is valid (status code 200)
def check_api_key():
    test_movie = 'Inception'  # Using a test movie to check API key functionality
    for api_key in api_keys:
        url = f"http://www.omdbapi.com/?t={test_movie}&apikey={api_key}"
        try:
            response = requests.get(url)
            if response.status_code == 200:
                movie_data = response.json()
                # Ensure the API key returns valid movie data
                if 'Title' in movie_data:
                    print(f"Valid API key found: {api_key}")
                    return api_key
        except requests.exceptions.RequestException as e:
            print(f"Error with API key {api_key}: {e}")
    return None  # If no valid API key is found

# Function to fetch movie details using the first valid API key
def movie_detail(movie_name):
    api_key = check_api_key()
    if api_key:
        url = f"http://www.omdbapi.com/?t={movie_name}&apikey={api_key}"
        try:
            response = requests.get(url)
            if response.status_code == 200:
                movie_data = response.json()
                if 'Title' in movie_data:
                    return movie_data
        except requests.exceptions.RequestException as e:
            print(f"Error fetching data with API key {api_key}: {e}")
    
    # Return default values if no valid response is obtained
    return {
        'Title': 'N/A',
        'Year': 'N/A',
        'Runtime': 'N/A',
        'imdbRating': 'N/A',
        'Poster': 'N/A'
    }

# Example usage:
movie_name = 'The Matrix'
movie_data = movie_detail(movie_name)
print(movie_data)


Valid API key found: e189e68f
{'Title': 'The Matrix', 'Year': '1999', 'Rated': 'R', 'Released': '31 Mar 1999', 'Runtime': '136 min', 'Genre': 'Action, Sci-Fi', 'Director': 'Lana Wachowski, Lilly Wachowski', 'Writer': 'Lilly Wachowski, Lana Wachowski', 'Actors': 'Keanu Reeves, Laurence Fishburne, Carrie-Anne Moss', 'Plot': 'When a beautiful stranger leads computer hacker Neo to a forbidding underworld, he discovers the shocking truth--the life he knows is the elaborate deception of an evil cyber-intelligence.', 'Language': 'English', 'Country': 'United States, Australia', 'Awards': 'Won 4 Oscars. 42 wins & 52 nominations total', 'Poster': 'https://m.media-amazon.com/images/M/MV5BNzQzOTk3OTAtNDQ0Zi00ZTVkLWI0MTEtMDllZjNkYzNjNTc4L2ltYWdlXkEyXkFqcGdeQXVyNjU0OTQ0OTY@._V1_SX300.jpg', 'Ratings': [{'Source': 'Internet Movie Database', 'Value': '8.7/10'}, {'Source': 'Rotten Tomatoes', 'Value': '83%'}, {'Source': 'Metacritic', 'Value': '73/100'}], 'Metascore': '73', 'imdbRating': '8.7', 'imdbVote

In [1]:
import gdown
import pickle
import os

# Function to download a file from Google Drive
def download_file(url, output_path):
    gdown.download(url, output_path, quiet=False)

# Paths for the pickle files
artifacts_folder = 'artifacts'
if not os.path.exists(artifacts_folder):
    os.makedirs(artifacts_folder)

movie_list_path = os.path.join(artifacts_folder, 'movie_list.pkl')
similarity_path = os.path.join(artifacts_folder, 'similarity.pkl')

# URLs for the pickle files
url_movie_list = 'https://drive.google.com/uc?export=download&id=1bzmDYhHCOCI0dRLF72-6rt3DstLGbXT7'
url_similarity = 'https://drive.google.com/uc?id=1xrVHcbqtvdX5J435kvJ1FkQP8bT_Hj0e'

# Check if movie_list.pkl exists, if not, download it
if not os.path.exists(movie_list_path):
    print(f"{movie_list_path} not found. Downloading...")
    download_file(url_movie_list, movie_list_path)
else:
    print(f"{movie_list_path} found. No need to download.")

# Load movie_list
with open(movie_list_path, 'rb') as f:
    movie_list = pickle.load(f)

# Check if similarity.pkl exists, if not, download it
if not os.path.exists(similarity_path):
    print(f"{similarity_path} not found. Downloading...")
    download_file(url_similarity, similarity_path)
else:
    print(f"{similarity_path} found. No need to download.")

# Load similarity
with open(similarity_path, 'rb') as f:
    similarity = pickle.load(f)

print("Files checked, downloaded if necessary, and loaded successfully.")


artifacts\movie_list.pkl found. No need to download.
artifacts\similarity.pkl found. No need to download.
Files checked, downloaded if necessary, and loaded successfully.
