In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from fuzzywuzzy import process

# Load datasets
movies = pd.read_csv("data/ml-latest-small/movies.csv")
ratings = pd.read_csv("data/ml-latest-small/ratings.csv")

# Combine relevant features
movies['combined_features'] = movies['title'].fillna('') + ' ' + movies['genres'].fillna('')

# TF-IDF Vectorization
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movies['combined_features'])
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# Title to index mapping
indices = pd.Series(movies.index, index=movies['title']).drop_duplicates()

# Closest match finder
def get_closest_title(input_title):
    match = process.extractOne(input_title, movies['title'])
    return match[0] if match and match[1] > 60 else None

# Content-based recommender
def recommend_content_based(title, num_recommendations=10):
    idx = indices.get(title)
    if idx is None:
        print("❌ Movie not found.")
        return
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:num_recommendations+1]
    movie_indices = [i[0] for i in sim_scores]
    return movies[['title', 'genres']].iloc[movie_indices]

# Input + Genre-based filtering
def recommend_by_input_and_genre(input_title, genre):
    matched_title = get_closest_title(input_title)
    if not matched_title:
        print("❌ No close match found. Try another title.")
        return
    print(f"✅ Showing recommendations for: {matched_title} | Genre: {genre}")
    recommendations = recommend_content_based(matched_title, 20)
    if recommendations is not None:
        filtered = recommendations[recommendations['genres'].str.contains(genre, case=False, na=False)]
        if filtered.empty:
            print("⚠️ No recommendations match that genre. Showing all:")
            print(recommendations.head(5))
        else:
            print(filtered.head(5))

# Trending movies (by popularity)
def get_trending_movies(n=5):
    trending_ids = ratings['movieId'].value_counts().head(n).index
    trending_movies = movies[movies['movieId'].isin(trending_ids)][['title', 'genres']]
    print("🔥 Trending Movies:")
    print(trending_movies.reset_index(drop=True))

# 🔄 Example calls:
# get_trending_movies()
# recommend_by_input_and_genre("Batman", "Action")


In [2]:
# 🔍 Interactive input for movie + genre
movie_input = input("Enter a movie title you like: ")
genre_input = input("Enter a genre you want (or press Enter to skip): ")

# Get closest match
matched_title = get_closest_title(movie_input)

if matched_title:
    print(f"\n✅ Closest match found: {matched_title}")
    print(f"📽️ Showing recommendations in genre: {genre_input if genre_input else 'Any'}")
    print("=" * 60)
    recommend_by_input_and_genre(matched_title, genre_input if genre_input else None)
else:
    print("\n❌ No close match found. Try another movie title.")



✅ Closest match found: Avengers, The (1998)
📽️ Showing recommendations in genre: action
✅ Showing recommendations for: Avengers, The (1998) | Genre: action
                                       title                        genres
7693                    Avengers, The (2012)  Action|Adventure|Sci-Fi|IMAX
8693  Avengers: Infinity War - Part I (2018)       Action|Adventure|Sci-Fi
9153                  Masked Avengers (1981)                        Action
9488              Ultimate Avengers 2 (2006)       Action|Animation|Sci-Fi
8686          Avengers: Age of Ultron (2015)       Action|Adventure|Sci-Fi


In [3]:
import pandas as pd

# Load data
movies = pd.read_csv('data/ml-latest-small/movies.csv')
ratings = pd.read_csv('data/ml-latest-small/ratings.csv')

# Merge movies and ratings
movie_data = pd.merge(movies, ratings, on='movieId')

# Function to get Top-N movies in a genre
def get_top_n_by_genre(genre, n=10, min_ratings=50):
    # Filter movies of the given genre
    genre_movies = movie_data[movie_data['genres'].str.contains(genre, case=False, na=False)]

    # Group by title and calculate average rating and count
    grouped = genre_movies.groupby('title').agg(
        average_rating=('rating', 'mean'),
        rating_count=('rating', 'count')
    ).reset_index()

    # Filter by minimum ratings to avoid niche flukes
    filtered = grouped[grouped['rating_count'] >= min_ratings]

    # Sort by average rating and vote count
    top_n = filtered.sort_values(['average_rating', 'rating_count'], ascending=False).head(n)

    return top_n

# Example usage
genre_input = input("Enter a genre (e.g., Comedy, Action, Drama): ")
n = int(input("How many top movies do you want? "))
top_movies = get_top_n_by_genre(genre_input, n)

if not top_movies.empty:
    print(f"\n🎯 Top {n} movies in '{genre_input}' genre:")
    print(top_movies[['title', 'average_rating', 'rating_count']])
else:
    print("❌ No movies found for this genre or not enough ratings.")



🎯 Top 10 movies in 'action' genre:
                                                  title  average_rating  \
551                                   Fight Club (1999)        4.272936   
374                             Dark Knight, The (2008)        4.238255   
1255                         Princess Bride, The (1987)        4.232394   
1505          Star Wars: Episode IV - A New Hope (1977)        4.231076   
92                                Apocalypse Now (1979)        4.219626   
1506  Star Wars: Episode V - The Empire Strikes Back...        4.215640   
1279  Raiders of the Lost Ark (Indiana Jones and the...        4.207500   
1038                                 Matrix, The (1999)        4.192446   
1151                          North by Northwest (1959)        4.184211   
311                 City of God (Cidade de Deus) (2002)        4.146667   

      rating_count  
551            218  
374            149  
1255           142  
1505           251  
92             107  
1506        

In [4]:
# Merge movies and ratings
movie_data = pd.merge(movies, ratings, on='movieId')

# Calculate average rating and count of ratings
movie_stats = movie_data.groupby('title').agg({
    'rating': ['mean', 'count']
}).reset_index()

# Rename columns for clarity
movie_stats.columns = ['title', 'avg_rating', 'rating_count']

# Filter: Only show movies with at least 100 ratings
filtered_stats = movie_stats[movie_stats['rating_count'] >= 100]

# Get Top 10
top_n_movies = filtered_stats.sort_values(by='avg_rating', ascending=False).head(10)

print("🎬 Top 10 Highest Rated Movies (100+ ratings):")
print(top_n_movies[['title', 'avg_rating', 'rating_count']])


🎬 Top 10 Highest Rated Movies (100+ ratings):
                                 title  avg_rating  rating_count
7593  Shawshank Redemption, The (1994)    4.429022           317
3499             Godfather, The (1972)    4.289062           192
3011                 Fight Club (1999)    4.272936           218
3500    Godfather: Part II, The (1974)    4.259690           129
2334              Departed, The (2006)    4.252336           107
3564                 Goodfellas (1990)    4.250000           126
1593                 Casablanca (1942)    4.240000           100
2163           Dark Knight, The (2008)    4.238255           149
9119        Usual Suspects, The (1995)    4.237745           204
6808        Princess Bride, The (1987)    4.232394           142


In [5]:
import pandas as pd
import re

# Merge movies and ratings
movie_data = pd.merge(movies, ratings, on='movieId')

# --- Extract release year from title ---
def extract_year(title):
    match = re.search(r'\((\d{4})\)', title)
    return int(match.group(1)) if match else None

movie_data['year'] = movie_data['title'].apply(extract_year)

# Drop rows where year is missing
movie_data = movie_data.dropna(subset=['year'])

# Filter for movies from year 2000 onwards
filtered_data = movie_data[movie_data['year'] >= 2000]

# Group and compute stats
movie_stats = filtered_data.groupby('title').agg(
    avg_rating=('rating', 'mean'),
    rating_count=('rating', 'count'),
    year=('year', 'first')
).reset_index()

# Filter movies with at least 100 ratings
filtered_stats = movie_stats[movie_stats['rating_count'] >= 100]

# Sort by average rating
top_n_movies = filtered_stats.sort_values(by='avg_rating', ascending=False).head(10)

print("🎬 Top 10 Highest Rated Movies (2000 onward, min 100 ratings):")
print(top_n_movies[['title', 'avg_rating', 'rating_count', 'year']])


🎬 Top 10 Highest Rated Movies (2000 onward, min 100 ratings):
                                                  title  avg_rating  \
1111                               Departed, The (2006)    4.252336   
1026                            Dark Knight, The (2008)    4.238255   
204   Amelie (Fabuleux destin d'Amélie Poulain, Le) ...    4.183333   
1342       Eternal Sunshine of the Spotless Mind (2004)    4.160305   
2725                                     Memento (2000)    4.122642   
2547  Lord of the Rings: The Return of the King, The...    4.118919   
2546  Lord of the Rings: The Fellowship of the Ring,...    4.106061   
2096                                   Inception (2010)    4.066434   
4510                                      WALL·E (2008)    4.057692   
2548      Lord of the Rings: The Two Towers, The (2002)    4.021277   

      rating_count    year  
1111           107  2006.0  
1026           149  2008.0  
204            120  2001.0  
1342           131  2004.0  
2725       