# Simple Content-Based Recommendation

In [22]:
# Import Required Libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from IPython.display import display, clear_output

#### 1. Dataset

    Dataset used: Top 250 Indian Movies (IMDb)

Link to download - *https://www.kaggle.com/datasets/khushipitroda/top-250-indian-movies-imdb*

In [23]:
# Read the movie CSV into a DataFrame and preview its first five rows
file_path = "Dataset/Indian_movies.csv"
movies = pd.read_csv(filepath_or_buffer=file_path)
movies.head(n=5)

Unnamed: 0.1,Unnamed: 0,Rank,Movie Names,Links,Rating,Year,Duration_of_movie,Genere,Description
0,0,1,Ramayana: The Legend of Prince Rama,https://www.imdb.com//title/tt0259534/?ref_=fe...,9.2,1993,PG,"Animation,Action,Adventure,Back to top",An anime adaptation of the Hindu epic the Rama...
1,1,2,Rocketry: The Nambi Effect,https://www.imdb.com//title/tt9263550/?ref_=fe...,8.7,2022,2h 37m,"Biography,Drama,Back to top",Based on the life of Indian Space Research Org...
2,2,3,Nayakan,https://www.imdb.com//title/tt0093603/?ref_=fe...,8.6,1987,Not Rated,"Crime,Drama,Back to top",A common man's struggles against a corrupt pol...
3,3,4,Gol Maal,https://www.imdb.com//title/tt0079221/?ref_=fe...,8.5,1979,Not Rated,"Comedy,Romance,Back to top",A man's simple lie to secure his job escalates...
4,4,5,Anbe Sivam,https://www.imdb.com//title/tt0367495/?ref_=fe...,8.6,2003,Not Rated,"Adventure,Comedy,Drama,Back to top","Two men, one young and arrogant, the other dam..."


#### Data Cleaning

In [24]:
# Keep only the columns the recommender needs; the leftover index columns,
# rank, link and duration fields are discarded here.
relevant_columns = ['Movie Names', 'Rating', 'Year', 'Genere', 'Description']
movies = movies[relevant_columns]

In [25]:
# Count the missing values in each column
movies.isna().sum()

Movie Names    0
Rating         0
Year           0
Genere         0
Description    0
dtype: int64

In [28]:
# Flag every row whose title already appeared earlier, then report how many there are
duplicate_mask = movies.duplicated(subset=['Movie Names'])
duplicate_count = duplicate_mask.sum()
print(f"Number of duplicate movie entries: {duplicate_count}")

Number of duplicate movie entries: 2


In [29]:
# Remove duplicates based on 'Movie Names' (the first occurrence of each title is kept)
# NOTE(review): matching on title alone also drops distinct films that happen to
# share a name (e.g. remakes released in different years) — confirm this is intended.
# .copy() detaches the result from the frame it was filtered from, so the column
# assignments in later cells do not raise pandas' SettingWithCopyWarning.
movies = movies.drop_duplicates(subset=['Movie Names']).copy()

In [30]:
# Remove ",Back to top" from the 'Genere' column
# assign() builds a new DataFrame instead of writing into `movies`, which at this
# point is the result of column selection and row filtering; this avoids pandas'
# SettingWithCopyWarning. Column order is unchanged.
movies = movies.assign(
    Genere=movies['Genere'].str.replace(',Back to top', '', regex=False)
)

#### 2. Approach

In [31]:
# Build one text field per movie (genre, year, rating, description) so a single
# TF-IDF vector can represent everything the query is matched against.
year_text = movies['Year'].astype(str)
rating_text = movies['Rating'].astype(str)
movies['combined_features'] = (
    movies['Genere'] + ' ' + year_text + ' ' + rating_text + ' ' + movies['Description']
)

In [32]:
# Function to recommend movies based on a text query
def recommend_movies(query, movies_df, top_n=5):
    """
    Rank movies by the TF-IDF cosine similarity between a free-text query and
    each movie's 'combined_features' text.

    The vectorizer is fitted on `movies_df` on every call, so the vocabulary
    always reflects the frame that was passed in.

    Parameters:
    - query (str): The user's movie preference.
    - movies_df (DataFrame): Must contain the columns 'Movie Names', 'Genere',
      'Year', 'Rating', 'Description' and 'combined_features'.
    - top_n (int): Maximum number of recommendations to return. 0 returns an
      empty result; values larger than the number of movies return all of them.

    Returns:
    - DataFrame: Up to `top_n` movies ordered from most to least similar, with
      their names, genres, years, ratings, descriptions, and similarity scores.
      The index is reset to 0..k-1.

    Raises:
    - ValueError: If `top_n` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be a non-negative integer")

    # TF-IDF vectorization
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(movies_df['combined_features'])
    query_vector = vectorizer.transform([query])

    # Calculate cosine similarity between query and movie combined features
    cosine_sim = cosine_similarity(query_vector, tfidf_matrix).flatten()

    # Get top N similar movies. Reverse first and slice second: the previous
    # `[-top_n:]` form degenerates to `[-0:]` (the whole array) when top_n == 0,
    # returning every movie instead of none. For top_n > 0 the order is identical.
    top_indices = cosine_sim.argsort()[::-1][:top_n]
    top_scores = cosine_sim[top_indices]

    # Create a DataFrame for recommendations with similarity scores
    # (.copy() so adding the score column never touches the caller's frame)
    recommendations = movies_df.iloc[top_indices].copy()
    recommendations['similarity_score'] = top_scores

    return recommendations[['Movie Names', 'Genere', 'Year', 'Rating', 'Description', 'similarity_score']].reset_index(drop=True)


#### 3. Output

The while loop repeatedly prompts the user for a movie preference and displays recommendations based on that input. After showing the results, it asks whether the user wants to edit the query. If the user answers "yes" or "y", the loop clears the previous output and prompts for a new query. Any other answer (for example "no" or "n") exits the loop and ends the process.

In [33]:
# Prompt / recommend / display cycle; repeats for as long as the user asks to edit the query
keep_asking = True
while keep_asking:
    # Read the free-text preference
    query = input("Enter your movie preference: ")

    # Compute the recommendations first, then replace the previous output with the new results
    recommendations_df = recommend_movies(query, movies)
    clear_output(wait=True)
    print("Query:", query)
    print("Top Movie Recommendations:")
    display(recommendations_df)

    # Only "yes" / "y" (case-insensitive, surrounding whitespace ignored) continues the loop
    edit_query = input("Do you want to edit the query? (yes/y or no/n): ").strip().lower()
    keep_asking = edit_query in ['yes', 'y']

Query: Can I have thriller with some drama for family
Top Movie Recommendations:


Unnamed: 0,Movie Names,Genere,Year,Rating,Description,similarity_score
0,Drishyam 2,"Crime,Drama,Thriller",2021,8.4,A gripping tale of an investigation and a fami...,0.400691
1,Drishyam,"Crime,Drama,Thriller",2013,8.3,A man goes to extreme lengths to save his fami...,0.393829
2,Joseph,"Crime,Drama,Thriller",2018,8.0,The story develops through the life of four re...,0.290516
3,Papanasam,"Crime,Drama,Thriller",2015,8.4,Desperate measures are taken by a man who trie...,0.247727
4,Nil Battey Sannata,"Drama,Family",2015,8.2,A story about a single woman (a Mom) and dream...,0.221899
