In [None]:
# Install compatible numpy and scikit-surprise without changing other libraries too much
!pip install numpy==1.23.5
!pip install --prefer-binary scikit-surprise


# Download and install precompiled .whl file directly
!pip install https://github.com/NicolasHug/Surprise/releases/download/v1.1.3/scikit_surprise-1.1.3-cp310-cp310-manylinux_2_17_x86_64.whl
!pip install rapidfuzz

Collecting scikit-surprise
  Using cached scikit_surprise-1.1.4.tar.gz (154 kB)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp311-cp311-linux_x86_64.whl size=2463297 sha256=8f104df42d4fcc48eeb1c7e54a0049097d185eb34f1426913a69b00114acd9a7
  Stored in directory: /root/.cache/pip/wheels/2a/8f/6e/7e2899163e2d85d8266daab4aa1cdabec7a6c56f83c015b5af
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.4
[31mERROR: scikit_surprise-1.1.3-cp310-cp310-manylinux_2_17_x86_64.whl is not a supported wheel on this platform.[0m[31m
[0mCollecting rapidfuzz
  Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_

In [None]:
# Optionally upload dataset files from the local machine into the Colab session.
# NOTE(review): the datasets loaded further down are read from Google Drive
# paths, and `uploaded` is not referenced again in the visible code — confirm
# whether this upload step is still needed.
from google.colab import files
uploaded = files.upload()

# Imports for Drive access, data handling, modelling and fuzzy title matching
from google.colab import drive
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from surprise.model_selection import train_test_split
import pickle
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from surprise import Dataset, Reader, SVD
from rapidfuzz import process

# Mount Google Drive so the /content/drive/... dataset paths used below resolve
drive.mount('/content/drive')

# Load datasets
movies = pd.read_csv('/content/drive/MyDrive/test/movie_metadata.csv')
data = pd.read_csv('/content/drive/MyDrive/test/data.csv')

# Ensure required columns exist (checked first so a bad file fails before any
# downloading, sentiment scoring or model fitting happens)
if 'plot_keywords' not in movies.columns or 'genres' not in movies.columns:
    raise KeyError("The 'plot_keywords' or 'genres' column is missing from the movie dataset.")

# Standardize movie titles
movies['movie_title'] = movies['movie_title'].str.strip()
if 'movieId' not in movies.columns:
    # No IDs supplied: number the movies 1..N in file order.
    movies['movieId'] = range(1, len(movies) + 1)

# Load and process reviews
with open('/content/drive/MyDrive/test/reviews.txt', 'r', encoding='utf-8') as file:
    reviews = file.readlines()

nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()

# Every review becomes a rating from a brand-new synthetic user. Start numbering
# above the largest userId already in `data` so these ratings are never
# attributed to an existing user.
max_existing_user_id = data['userId'].max()
user_id = 1 if pd.isna(max_existing_user_id) else int(max_existing_user_id) + 1

review_data = []
for line in reviews:
    # Tab-separated line: field 1 is used as the title, fields 2+ as the review
    # text; field 0 is not used here.
    parts = line.strip().split('\t')
    if len(parts) < 2:
        continue
    movie_title = parts[1].strip()
    review_text = ' '.join(parts[2:]).strip()
    if not review_text:
        # Without text there is no sentiment to score; skip the line instead of
        # inventing a neutral rating.
        continue
    sentiment_score = sia.polarity_scores(review_text)['compound']
    # Linear map of the compound score (-1 -> 1, +1 -> 5), clamped to the 1-5 scale.
    rating = max(1, min(5, round((sentiment_score + 1) * 2 + 1)))
    review_data.append([user_id, movie_title, rating])
    user_id += 1

reviews_df = pd.DataFrame(review_data, columns=['userId', 'movie_title', 'rating'])
# Keep one movieId per title so a review is not duplicated when the metadata
# lists the same title more than once.
title_to_id = movies[['movie_title', 'movieId']].drop_duplicates(subset='movie_title')
reviews_df = reviews_df.merge(title_to_id, on='movie_title', how='inner')
data = pd.concat([data, reviews_df[['userId', 'movieId', 'rating']]], ignore_index=True)

# Content-Based Filtering
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
tfidf_matrix = vectorizer.fit_transform(movies['plot_keywords'].fillna(""))
# Row/column i of cosine_sim corresponds to row position i of `movies`.
cosine_sim = cosine_similarity(tfidf_matrix)
with open("cosine_sim.pkl", "wb") as sim_file:
    pickle.dump(cosine_sim, sim_file)

# Collaborative Filtering
reader = Reader(rating_scale=(1, 5))
data_surprise = Dataset.load_from_df(data[['userId', 'movieId', 'rating']], reader)
# NOTE(review): the model is fitted on the 80% train split only; `testset` is
# not evaluated anywhere in the visible code.
trainset, testset = train_test_split(data_surprise, test_size=0.2, random_state=42)
model = SVD(n_factors=50, random_state=42)
model.fit(trainset)
with open("collab_model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

def get_content_recommendations(title=None, genre=None, top_n=10, min_score=60):
    """Recommend movies by plot-keyword similarity to a title, or by genre.

    Reads the module-level ``movies`` DataFrame, the ``cosine_sim`` similarity
    matrix (indexed by row position in ``movies``) and rapidfuzz's ``process``.

    Args:
        title: Free-text movie title; fuzzy-matched case-insensitively against
            ``movies['movie_title']``. Takes precedence over ``genre``.
        genre: Text searched (case-insensitive, literal substring) in
            ``movies['genres']``. Used only when no title is given.
        top_n: Maximum number of movie titles to return.
        min_score: Minimum fuzzy-match score for the title to count as a match.

    Returns:
        For a matched title: a list whose first element is a
        ``"Matched with: ..."`` header followed by up to ``top_n`` titles.
        For a genre: a list of up to ``top_n`` titles.
        Otherwise a ``str`` message explaining why nothing was returned.
    """
    title = title.strip() if title else ''
    genre = genre.strip().lower() if genre else ''

    if title:
        # processor=str.lower makes the comparison case-insensitive; the
        # returned choice is still the original, un-lowered title.
        match = process.extractOne(title, movies['movie_title'], processor=str.lower)
        if match is None or match[1] < min_score:
            return f"No close match found for '{title}'. Try a different title."
        matched_title, score, _ = match

        # Row *position* of the first movie with that title (cosine_sim is positional).
        idx = int((movies['movie_title'] == matched_title).to_numpy().nonzero()[0][0])
        ranked = sorted(enumerate(cosine_sim[idx]), key=lambda pair: pair[1], reverse=True)
        # Drop the query movie explicitly: it is not guaranteed to sort first
        # (e.g. a movie with no keywords has similarity 0 with everything).
        movie_indices = [pos for pos, _ in ranked if pos != idx][:top_n]
        return [f"Matched with: {matched_title} (Score: {score})"] + movies.iloc[movie_indices]['movie_title'].tolist()

    if genre:
        # regex=False: user text such as "sci-fi (" must be matched literally,
        # not compiled as a regular expression.
        genre_mask = movies['genres'].str.lower().str.contains(genre, na=False, regex=False)
        genre_movies = movies[genre_mask]
        if genre_movies.empty:
            return f"No movies found for genre '{genre}'."
        return genre_movies['movie_title'].head(top_n).tolist()

    return "Please provide a movie title or genre."

def get_collab_recommendations(user_id, model, movies, data, top_n=10):
    """Recommend the unseen movies with the highest predicted rating for a user.

    Args:
        user_id: ID looked up in ``data['userId']`` and passed to ``model.predict``.
        model: Object exposing ``predict(user_id, movie_id)`` whose result has
            ``est`` (predicted rating) and ``iid`` (movie ID) attributes.
        movies: DataFrame with ``movieId`` and ``movie_title`` columns.
        data: Ratings DataFrame with ``userId`` and ``movieId`` columns.
        top_n: Maximum number of titles to return.

    Returns:
        A list of up to ``top_n`` movie titles ordered from highest to lowest
        predicted rating, or a ``str`` message when the user has already rated
        every movie in ``data``.
    """
    seen_ids = set(data.loc[data['userId'] == user_id, 'movieId'])
    # Iterate unique() instead of a set difference so tie order is deterministic.
    unseen_movies = [movie_id for movie_id in data['movieId'].unique() if movie_id not in seen_ids]
    if not unseen_movies:
        return "No unseen movies for this user."

    title_by_id = movies.drop_duplicates(subset='movieId').set_index('movieId')['movie_title']
    # Only score movies that have a title in the catalogue; otherwise the top-N
    # slots can be taken by IDs that are dropped afterwards, leaving few or no results.
    candidates = [movie_id for movie_id in unseen_movies if movie_id in title_by_id.index]

    predictions = [model.predict(user_id, movie_id) for movie_id in candidates]
    predictions.sort(key=lambda pred: pred.est, reverse=True)
    # Look titles up one by one so the ranking order is preserved.
    return [title_by_id.loc[pred.iid] for pred in predictions[:top_n]]

def recommend_movies():
    """Prompt for a title, genre or user ID and print the recommendations.

    A purely numeric entry is treated as a user ID and routed to
    ``get_collab_recommendations`` (using the module-level ``model``,
    ``movies`` and ``data``). Any other non-empty entry is tried as a title
    first and, if that yields a message instead of a list, as a genre.
    Messages and empty results are printed on their own, without the
    "Top 10 Recommended Movies:" header.
    """
    user_input = input("Enter a movie title, genre, or user ID: ").strip()

    # isdecimal() only accepts characters that int() can parse
    # (isdigit() also accepts e.g. superscript digits, which int() rejects).
    if user_input.isdecimal():
        user_id = int(user_input)
        recommendations = get_collab_recommendations(user_id, model, movies, data)
    elif user_input:
        recommendations = get_content_recommendations(title=user_input)
        if isinstance(recommendations, str):
            # No usable title match: fall back to interpreting the text as a genre.
            recommendations = get_content_recommendations(genre=user_input)
    else:
        recommendations = "Invalid input."

    if isinstance(recommendations, str):
        # A string is a status/error message, not a list of movies.
        print(f"\n{recommendations}")
        return
    if not recommendations:
        print("\nNo recommendations found.")
        return

    print("\nTop 10 Recommended Movies:")
    print(recommendations)

# Run the recommendation system in a loop until the user declines to continue.
# The loop ends via a flag rather than exit(): inside a notebook, exit() asks
# the kernel to shut down, which would discard the trained model and loaded data.
keep_running = True
while keep_running:
    recommend_movies()
    # Re-prompt until we get an unambiguous yes/no answer.
    while True:
        choice = input("\nWould you like to search again? (yes/no): ").strip().lower()
        if choice in ['yes', 'y']:
            break
        if choice in ['no', 'n']:
            print("Thank you for using CineSync. Goodbye!")
            keep_running = False
            break
        print("Invalid input. Please type 'yes' or 'no'.")


Saving data.csv to data.csv
Saving movie_metadata.csv to movie_metadata.csv
Saving movie_score.csv to movie_score.csv
Saving reviews(1).txt to reviews(1).txt
Mounted at /content/drive


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


Enter a movie title, genre, or user ID: conjuring

Top 10 Recommended Movies:
['Matched with: The Conjuring 2 (Score: 80.0)', 'The Da Vinci Code', 'We Have Your Husband', 'The Conjuring', 'Mulan', 'Blood Done Sign My Name', 'Silmido', 'Black Mass', 'World Trade Center', 'Rosewater', 'Million Dollar Arm']

Would you like to search again? (yes/no): yutninam
Invalid input. Please type 'yes' or 'no'.

Would you like to search again? (yes/no): yes
Enter a movie title, genre, or user ID: 100

Top 10 Recommended Movies:
[]

Would you like to search again? (yes/no): yes
Enter a movie title, genre, or user ID: aslkfahgjan,ms;o'gjisl;akgm/,

Top 10 Recommended Movies:
['Matched with: Cars (Score: 60.00000000000001)', 'Big Trouble in Little China', 'Transformers: Age of Extinction', 'The Dukes of Hazzard', 'Sorcerer', 'A Better Life', 'Joy Ride', 'Out of the Blue', 'Terminator 3: Rise of the Machines', 'The Woman Chaser', 'The R.M.']

Would you like to search again? (yes/no): yes
Enter a movie ti