In [7]:
import pandas as pd
import numpy as np

# Load data
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')

# Merge movies and ratings data
movie_ratings = pd.merge(ratings, movies, on='movieId')

# User-item matrix
user_item_matrix = movie_ratings.pivot_table(index='userId', columns='movieId', values='rating')

# User-based collaborative filtering
def user_based_cf(user_id, item_id):
    user_ratings = user_item_matrix.loc[user_id].dropna()
    if user_ratings.empty:
        return 0
    similar_users = user_item_matrix.corrwith(user_ratings, axis=1)
    similar_users = similar_users.dropna().sort_values(ascending=False)
    if similar_users.empty:
        return 0
    user_item_matrix_filtered = user_item_matrix.loc[similar_users.index]
    weighted_sum = (user_item_matrix_filtered.loc[:, item_id] * similar_users).sum()
    sum_of_weights = similar_users.abs().sum()
    if sum_of_weights == 0:
        return 0
    else:
        return weighted_sum / sum_of_weights



In [8]:

# Example usage
user_id = 1
item_id = 1# Import libraries
import pandas as pd
from surprise import Reader, Dataset, SVD

# Load data from CSV files
movies_df = pd.read_csv("movies.csv")
ratings_df = pd.read_csv("ratings.csv")


# Reader object for Surprise library
reader = Reader(rating_scale=(1, 5))

# Create a Surprise dataset from the ratings DataFrame
data = Dataset.load_from_df(ratings_df[['userId', 'movieId', 'rating']], reader)

# Train a recommendation model (SVD in this example)
algo = SVD()
trainset = data.build_full_trainset()
algo.fit(trainset)

# Get user ID for recommendations (replace with actual user input)
user_id = 10

# Get top 10 movie recommendations for the user
user_ratings = ratings_df[ratings_df['userId'] == user_id]
recommended_movies = [algo.predict(user_id, movieId) for movieId in movies_df['movieId'] 
                      if movieId not in user_ratings['movieId'].tolist()]

recommended_movies.sort(key=lambda x: x.est, reverse=True)
top_10_recommendations = recommended_movies[:10]

# Print recommendations
print("Top 10 Movie Recommendations for User", user_id)
for recommendation in top_10_recommendations:
    movie_title = movies_df[movies_df['movieId'] == recommendation.iid]['title'].values[0]
    print(f"- {movie_title}")





Top 10 Movie Recommendations for User 10
- Harry Potter and the Prisoner of Azkaban (2004)
- North by Northwest (1959)
- It's a Wonderful Life (1946)
- Lawrence of Arabia (1962)
- 12 Angry Men (1957)
- Cinema Paradiso (Nuovo cinema Paradiso) (1989)
- Dogville (2003)
- Legend of Drunken Master, The (Jui kuen II) (1994)
- Doctor Zhivago (1965)
- Dead Poets Society (1989)


In [9]:
# Import libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Load data from CSV files
movies_df = pd.read_csv("movies.csv")
ratings_df = pd.read_csv("ratings.csv")

# Merge movie genres into a single string
movies_df['genres'] = movies_df['genres'].fillna('')
movies_df['genres'] = movies_df['genres'].apply(lambda x: ' '.join(x.split('|')))

# Initialize TF-IDF Vectorizer
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movies_df['genres'])

# Compute cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# Function to get recommendations based on content similarity
def get_content_based_recommendations(movie_title, cosine_sim=cosine_sim):
    idx = movies_df[movies_df['title'] == movie_title].index[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]
    movie_indices = [i[0] for i in sim_scores]
    return movies_df['title'].iloc[movie_indices]

# Get user ID for recommendations (replace with actual user input)
user_id = 10

# Get top 10 movie recommendations for the user
user_ratings = ratings_df[ratings_df['userId'] == user_id]
watched_movies = set(user_ratings['movieId'])
recommended_movies = []

for movie_id in watched_movies:
    movie_title = movies_df[movies_df['movieId'] == movie_id]['title'].values[0]
    similar_movies = get_content_based_recommendations(movie_title)
    recommended_movies.extend(similar_movies)

recommended_movies = [movie for movie in recommended_movies if movie not in watched_movies]
top_10_recommendations = recommended_movies[:10]

# Print recommendations
print("Top 10 Content-Based Movie Recommendations for User", user_id)
for recommendation in top_10_recommendations:
    print(f"- {recommendation}")


Top 10 Content-Based Movie Recommendations for User 10
- Sabrina (1995)
- Clueless (1995)
- Two if by Sea (1996)
- French Twist (Gazon maudit) (1995)
- If Lucy Fell (1996)
- Boomerang (1992)
- Pie in the Sky (1996)
- Mallrats (1995)
- Nine Months (1995)
- Forget Paris (1995)


In [10]:
# Import libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Load data from CSV files
movies_df = pd.read_csv("movies.csv")
ratings_df = pd.read_csv("ratings.csv")

# Merge movie genres into a single string
movies_df['genres'] = movies_df['genres'].fillna('')
movies_df['genres'] = movies_df['genres'].apply(lambda x: ' '.join(x.split('|')))

# Initialize TF-IDF Vectorizer
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movies_df['genres'])

# Compute cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# Function to get recommendations based on content similarity
def get_content_based_recommendations(movie_id, cosine_sim=cosine_sim):
    idx = movies_df[movies_df['movieId'] == movie_id].index[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]
    movie_indices = [i[0] for i in sim_scores]
    return movies_df['title'].iloc[movie_indices]

# Specify movie ID for which recommendations are needed
movie_id = 1

# Get similar movies based on content similarity
similar_movies = get_content_based_recommendations(movie_id)

# Print recommendations
print("Movies Similar to Movie with ID:", movie_id)
for movie_title in similar_movies:
    print("- ", movie_title)


Movies Similar to Movie with ID: 1
-  Antz (1998)
-  Toy Story 2 (1999)
-  Adventures of Rocky and Bullwinkle, The (2000)
-  Emperor's New Groove, The (2000)
-  Monsters, Inc. (2001)
-  Wild, The (2006)
-  Shrek the Third (2007)
-  Tale of Despereaux, The (2008)
-  Asterix and the Vikings (Astérix et les Vikings) (2006)
-  Turbo (2013)


In [11]:
from surprise import SVD, KNNBasic, CoClustering, BaselineOnly
from surprise import Dataset, Reader
import pandas as pd
import numpy as np

# Load data from CSV files
movies_df = pd.read_csv("movies.csv")
ratings_df = pd.read_csv("ratings.csv")

# Create Surprise Reader object
reader = Reader(rating_scale=(1, 5))

# Create Surprise Dataset
data = Dataset.load_from_df(ratings_df[['userId', 'movieId', 'rating']], reader)

# Train-test split
trainset = data.build_full_trainset()

# Initialize models
svd = SVD()
knn = KNNBasic()
baseline = BaselineOnly()
co_clustering = CoClustering()

# Train models
svd.fit(trainset)
knn.fit(trainset)
baseline.fit(trainset)
co_clustering.fit(trainset)

# Get the list of movies already watched by User 1
user_id = 1
watched_movies = set(ratings_df[ratings_df['userId'] == user_id]['movieId'])

# Generate recommendations excluding watched movies
all_movie_ids = set(movies_df['movieId'])
unwatched_movies = list(all_movie_ids - watched_movies)

# Predict ratings for unwatched movies using each model
predictions = []
for model in [svd, knn, baseline, co_clustering]:
    model_predictions = [(user_id, movie_id, model.predict(user_id, movie_id).est) for movie_id in unwatched_movies]
    predictions.extend(model_predictions)

# Aggregate predictions
combined_preds = {}
for user_id, movie_id, est in predictions:
    if movie_id not in combined_preds:
        combined_preds[movie_id] = [est]
    else:
        combined_preds[movie_id].append(est)

# Take the average of predictions
for movie_id in combined_preds:
    combined_preds[movie_id] = np.mean(combined_preds[movie_id])

# Sort recommendations by the average estimated rating
sorted_recommendations = sorted(combined_preds.items(), key=lambda x: x[1], reverse=True)

# Get top 10 recommendations
top_10_recommendations = sorted_recommendations[:10]

# Print recommendations
print(f"Top 10 Movie Recommendations for User {user_id}:")
for movie_id, _ in top_10_recommendations:
    movie_title = movies_df[movies_df['movieId'] == movie_id]['title'].values[0]
    print(f"- {movie_title}")


Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Top 10 Movie Recommendations for User 1:
- Shawshank Redemption, The (1994)
- Godfather, The (1972)
- Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1964)
- Godfather: Part II, The (1974)
- Lawrence of Arabia (1962)
- Streetcar Named Desire, A (1951)
- Departed, The (2006)
- Three Billboards Outside Ebbing, Missouri (2017)
- Secrets & Lies (1996)
- Guess Who's Coming to Dinner (1967)


In [12]:
import pandas as pd

# Assuming ratings_df contains the ratings data
# Filter ratings for User 10
user_10_ratings = ratings_df[ratings_df['userId'] == 1]

# Merge with movies_df to get movie titles
movies_watched_by_user_10 = pd.merge(user_10_ratings, movies_df, on='movieId', how='inner')

# Print movie titles
print("Movies watched by User 1:")
for movie_title in movies_watched_by_user_10['title']:
    print("- " + movie_title)


Movies watched by User 1:
- Toy Story (1995)
- Grumpier Old Men (1995)
- Heat (1995)
- Seven (a.k.a. Se7en) (1995)
- Usual Suspects, The (1995)
- From Dusk Till Dawn (1996)
- Bottle Rocket (1996)
- Braveheart (1995)
- Rob Roy (1995)
- Canadian Bacon (1995)
- Desperado (1995)
- Billy Madison (1995)
- Clerks (1994)
- Dumb & Dumber (Dumb and Dumber) (1994)
- Ed Wood (1994)
- Star Wars: Episode IV - A New Hope (1977)
- Pulp Fiction (1994)
- Stargate (1994)
- Tommy Boy (1995)
- Clear and Present Danger (1994)
- Forrest Gump (1994)
- Jungle Book, The (1994)
- Mask, The (1994)
- Blown Away (1994)
- Dazed and Confused (1993)
- Fugitive, The (1993)
- Jurassic Park (1993)
- Mrs. Doubtfire (1993)
- Schindler's List (1993)
- So I Married an Axe Murderer (1993)
- Three Musketeers, The (1993)
- Tombstone (1993)
- Dances with Wolves (1990)
- Batman (1989)
- Silence of the Lambs, The (1991)
- Pinocchio (1940)
- Fargo (1996)
- Mission: Impossible (1996)
- James and the Giant Peach (1996)
- Space Jam (1

In [13]:
import pandas as pd
from surprise.model_selection import cross_validate
from surprise.prediction_algorithms import KNNBasic, SVD, BaselineOnly, CoClustering
from surprise.accuracy import rmse, mae
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split

# Load data from CSV files
movies_df = pd.read_csv("movies.csv")
ratings_df = pd.read_csv("ratings.csv")

# Create Surprise Reader object
reader = Reader(rating_scale=(1, 5))

# Create Surprise Dataset
data = Dataset.load_from_df(ratings_df[['userId', 'movieId', 'rating']], reader)

# Train-test split
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# Define models
models = {
    "KNNBasic": KNNBasic(),
    "SVD": SVD(),
    "BaselineOnly": BaselineOnly(),
    "CoClustering": CoClustering()
}

# Train and evaluate models
results = {}
for model_name, model in models.items():
    print(f"Evaluating {model_name}...")
    model.fit(trainset)
    predictions = model.test(testset)
    results[model_name] = {
        "RMSE": rmse(predictions),
        "MAE": mae(predictions)
    }

# Convert results to DataFrame
results_df = pd.DataFrame(results)

# Display results
print("\nPerformance Results:")
print(results_df)


Evaluating KNNBasic...
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9561
MAE:  0.7325
Evaluating SVD...
RMSE: 0.8808
MAE:  0.6784
Evaluating BaselineOnly...
Estimating biases using als...
RMSE: 0.8785
MAE:  0.6778
Evaluating CoClustering...
RMSE: 0.9468
MAE:  0.7331

Performance Results:
      KNNBasic       SVD  BaselineOnly  CoClustering
RMSE  0.956073  0.880797      0.878510      0.946841
MAE   0.732520  0.678382      0.677786      0.733073


In [16]:
# Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import TfidfVectorizer

# Load data from CSV files
movies_df = pd.read_csv("movies.csv")
ratings_df = pd.read_csv("ratings.csv")

# Merge movie genres into a single string
movies_df['genres'] = movies_df['genres'].fillna('')
movies_df['genres'] = movies_df['genres'].apply(lambda x: ' '.join(x.split('|')))

# Merge ratings with movies
data = pd.merge(ratings_df, movies_df, on='movieId')

# Feature Engineering
X = data[['movieId', 'title', 'genres']]  # Use 'data' instead of 'movies_df'
y = data['rating']

# TF-IDF Vectorization
tfidf = TfidfVectorizer(stop_words='english')
X_tfidf = tfidf.fit_transform(X['genres'])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

# Train a Random Forest Regressor
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)
rf_regressor.fit(X_train, y_train)

# Predict ratings
y_pred = rf_regressor.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)



Mean Squared Error: 0.9986157839587124


In [36]:
import numpy as np

# Assuming 'user_id' column in ratings_df represents user IDs
# Let's filter ratings_df to get ratings by user 1
user_1_ratings = ratings_df[ratings_df['userId'] == 1]

# Let's merge user 1's ratings with movie data to get information about the movies
user_1_data = pd.merge(user_1_ratings, movies_df, on='movieId')

# Let's filter the movies that user 1 rated highly (you can define a threshold for what's considered highly rated)
highly_rated_movies = user_1_data[user_1_data['rating'] >= 4]

# Now, let's use these highly rated movies to get recommendations
recommendations = pd.DataFrame(columns=['movieId', 'title', 'predicted_rating'])

for movie_id, title in zip(highly_rated_movies['movieId'], highly_rated_movies['title']):
    # Get the TF-IDF vector for the movie's genres
    movie_idx = movies_df[movies_df['movieId'] == movie_id].index[0]
    tfidf_vector = X_tfidf[movie_idx]
    
    # Predict the rating for this movie using the trained model
    predicted_rating = rf_regressor.predict(tfidf_vector.reshape(1, -1))[0]
    
    recommendations = pd.concat([recommendations, pd.DataFrame({'movieId': [movie_id], 'title': [title], 'predicted_rating': [predicted_rating]})], ignore_index=True)

# Sort the recommendations by predicted rating in descending order
recommendations = recommendations.sort_values(by='predicted_rating', ascending=False)

# Get the top 10 recommendations
top_10_recommendations = recommendations.head(10)

print("Top 10 Recommendations for User 1:")
print(top_10_recommendations[['title']].to_string(index=False, justify='left'))



  recommendations = pd.concat([recommendations, pd.DataFrame({'movieId': [movie_id], 'title': [title], 'predicted_rating': [predicted_rating]})], ignore_index=True)


Top 10 Recommendations for User 1:
title                                                         
                       Teenage Mutant Ninja Turtles III (1993)
                                              Ladyhawke (1985)
                                          Wayne's World (1992)
                                               Scream 3 (2000)
                                                    JFK (1991)
Teenage Mutant Ninja Turtles II: The Secret of the Ooze (1991)
                                               Red Dawn (1984)
                                  Good Morning, Vietnam (1987)
                                         Grumpy Old Men (1993)
                                                   Hook (1991)


In [37]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge

# Define a dictionary to store the regressors
regressors = {
    "Random Forest Regressor": RandomForestRegressor(n_estimators=100, random_state=42),
    "Gradient Boosting Regressor": GradientBoostingRegressor(random_state=42),
    "Support Vector Regressor": SVR(),
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge()
}

# Dictionary to store MSE values for each regressor
mse_scores = {}

# Iterate over each regressor
for name, regressor in regressors.items():
    # Train the regressor
    regressor.fit(X_train, y_train)
    
    # Predict ratings
    y_pred = regressor.predict(X_test)
    
    # Compute mean squared error
    mse = mean_squared_error(y_test, y_pred)
    
    # Store MSE in dictionary
    mse_scores[name] = mse

# Print MSE values for each regressor
for name, mse in mse_scores.items():
    print(f"{name}: MSE = {mse}")


Random Forest Regressor: MSE = 0.9986157839587124
Gradient Boosting Regressor: MSE = 1.013223073678727
Support Vector Regressor: MSE = 1.0195395514788514
Linear Regression: MSE = 1.0317016766436247
Ridge Regression: MSE = 1.0316982615903076


In [45]:
# Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, BaggingRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import TfidfVectorizer

# Load data from CSV files
movies_df = pd.read_csv("movies.csv")
ratings_df = pd.read_csv("ratings.csv")

# Merge movie genres into a single string
movies_df['genres'] = movies_df['genres'].fillna('')
movies_df['genres'] = movies_df['genres'].apply(lambda x: ' '.join(x.split('|')))

# Merge ratings with movies
data = pd.merge(ratings_df, movies_df, on='movieId')

# Feature Engineering
X = data[['movieId', 'title', 'genres']]  # Use 'data' instead of 'movies_df'
y = data['rating']

# TF-IDF Vectorization
tfidf = TfidfVectorizer(stop_words='english')
X_tfidf = tfidf.fit_transform(X['genres'])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

# Initialize base regressor models
base_regressor_1 = RandomForestRegressor(n_estimators=100, random_state=42)
base_regressor_2 = LinearRegression()
base_regressor_3 = GradientBoostingRegressor(random_state=42)
base_regressor_4 = SVR()
base_regressor_5 = Ridge()

# BaggingRegressor
bagged_regressor_1 = BaggingRegressor(base_regressor_1, n_estimators=10, random_state=42)
bagged_regressor_2 = BaggingRegressor(base_regressor_2, n_estimators=10, random_state=42)
bagged_regressor_3 = BaggingRegressor(base_regressor_3, n_estimators=10, random_state=42)
bagged_regressor_4 = BaggingRegressor(base_regressor_4, n_estimators=10, random_state=42)
bagged_regressor_5 = BaggingRegressor(base_regressor_5, n_estimators=10, random_state=42)

# Train the BaggingRegressors
bagged_regressor_1.fit(X_train, y_train)
bagged_regressor_2.fit(X_train, y_train)
bagged_regressor_3.fit(X_train, y_train)
bagged_regressor_4.fit(X_train, y_train)
bagged_regressor_5.fit(X_train, y_train)

# Predict ratings
y_pred_1 = bagged_regressor_1.predict(X_test)
y_pred_2 = bagged_regressor_2.predict(X_test)
y_pred_3 = bagged_regressor_3.predict(X_test)
y_pred_4 = bagged_regressor_4.predict(X_test)
y_pred_5 = bagged_regressor_5.predict(X_test)

# Average the predictions
y_pred_avg = (y_pred_1 + y_pred_2 + y_pred_3 + y_pred_4 + y_pred_5) / 5

# Evaluate the model
mse = mean_squared_error(y_test, y_pred_avg)
print("Mean Squared Error:", mse)


In [38]:
# Filter ratings by user 1
user_1_ratings = ratings_df[ratings_df['userId'] == 1]

# Merge user 1's ratings with movie data
user_1_data = pd.merge(user_1_ratings, movies_df, on='movieId')

# Filter highly rated movies by user 1
highly_rated_movies = user_1_data[user_1_data['rating'] >= 4]

# Initialize DataFrame to store predicted ratings
predictions = pd.DataFrame(columns=['movieId', 'title', 'predicted_rating'])

# Iterate over highly rated movies
for movie_id, title in zip(highly_rated_movies['movieId'], highly_rated_movies['title']):
    # Get TF-IDF vector for movie genres
    movie_idx = movies_df[movies_df['movieId'] == movie_id].index[0]
    tfidf_vector = X_tfidf[movie_idx]
    
    # Predict rating using bagged regressor
    predicted_rating = bagged_regressor.predict(tfidf_vector.reshape(1, -1))[0]
    
    # Store prediction in DataFrame
    predictions = predictions.append({'movieId': movie_id, 'title': title, 'predicted_rating': predicted_rating}, ignore_index=True)

# Sort predictions by predicted rating
predictions = predictions.sort_values(by='predicted_rating', ascending=False)

# Print top recommendations for user 1
print("Top Recommendations for User 1 (Bagged Ensemble Model):")
print(predictions[['title', 'predicted_rating']].head(10))


NameError: name 'bagged_regressor' is not defined