#### Import necessary libraries

In [158]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns

#### Load Data

In [159]:
# Read the four CSV files into DataFrames; the names on the left are bound
# in the same order as the paths listed below.
df_links, df_movies, df_ratings, df_tags = (
    pd.read_csv(csv_path)
    for csv_path in (
        'Data/links.csv',
        'Data/movies.csv',
        'Data/ratings.csv',
        'Data/tags.csv',
    )
)

#### Understanding the Data

In [None]:
# Explore the links dataset: preview the first rows (head() defaults to 5)
# to see which columns the file provides.
df_links.head()  # last expression in the cell, so the notebook renders it

In [None]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
df_links.info()                 # data types and non-null counts
print(df_links.describe())      # summary statistics for numerical columns
print(df_links.isnull().sum())  # number of missing values per column


In [None]:
# Explore the movies dataset: preview the first rows (head() defaults to 5)
# to see which columns the file provides.
df_movies.head()  # last expression in the cell, so the notebook renders it

In [None]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
df_movies.info()                 # data types and non-null counts
print(df_movies.describe())      # summary statistics for numerical columns
print(df_movies.isnull().sum())  # number of missing values per column

In [None]:
# Explore the ratings dataset: preview the first rows (head() defaults to 5)
# to see which columns the file provides.
df_ratings.head()  # last expression in the cell, so the notebook renders it

In [None]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
df_ratings.info()                 # data types and non-null counts
print(df_ratings.describe())      # summary statistics for numerical columns
print(df_ratings.isnull().sum())  # number of missing values per column

In [None]:
# Explore the tags dataset: preview the first rows (head() defaults to 5)
# to see which columns the file provides.
df_tags.head()  # last expression in the cell, so the notebook renders it

In [None]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
df_tags.info()                 # data types and non-null counts
print(df_tags.describe())      # summary statistics for numerical columns
print(df_tags.isnull().sum())  # number of missing values per column

#### Handle missing data

In [168]:
# Keep only the rows of links that have a value in every column
# (row-wise drop, triggered by any missing value).
df_links = df_links.dropna(axis=0, how='any')

In [None]:
# Re-check after the drop: per-column count of missing values in links.
df_links.isnull().sum() # every count should now be 0

In [None]:
# Summary statistics of the numerical columns of links after the rows with
# missing values were removed.
df_links.describe()

#### Check for duplicates

In [None]:
# Duplicates in links.csv: duplicated() marks rows that repeat an earlier row
# in every column, so the sum is the number of fully duplicated rows.
df_links.duplicated().sum()

In [None]:
# Duplicates in movies.csv: duplicated() marks rows that repeat an earlier row
# in every column, so the sum is the number of fully duplicated rows.
df_movies.duplicated().sum()

In [None]:
# Duplicates in ratings.csv: duplicated() marks rows that repeat an earlier row
# in every column, so the sum is the number of fully duplicated rows.
df_ratings.duplicated().sum()

In [None]:
# Duplicates in tags.csv: duplicated() marks rows that repeat an earlier row
# in every column, so the sum is the number of fully duplicated rows.
df_tags.duplicated().sum()

#### Convert data to correct data types

In [175]:
# Convert the raw timestamp columns to datetime64.
# pd.to_datetime interprets bare integers as *nanoseconds* since the Unix
# epoch unless a unit is given, which would place every row in early 1970.
# NOTE(review): unit='s' assumes the CSVs store integer Unix seconds (the
# MovieLens convention) — confirm with df_ratings['timestamp'].head().
df_ratings['timestamp'] = pd.to_datetime(df_ratings['timestamp'], unit='s')
df_tags['timestamp'] = pd.to_datetime(df_tags['timestamp'], unit='s')

In [None]:
# Preview tags again to inspect the converted 'timestamp' column.
df_tags.head()

#### Handling outliers

In [None]:
from scipy import stats

# A rating counts as an outlier when it lies more than `threshold`
# standard deviations away from the mean rating.
threshold = 3

# Absolute z-score of every rating
z_scores = np.abs(stats.zscore(df_ratings['rating']))

# Select and show the rows whose z-score exceeds the threshold
outliers = df_ratings.loc[z_scores > threshold]
print(outliers)

No outliers detected

#### Deal with Movies.csv genre column

In [None]:
# Turn each movie's pipe-delimited genres string into a list of genre names.
# NOTE(review): this overwrites df_movies['genres'] in place, so re-running the
# cell applies .str.split to lists instead of strings — reload the data first.
df_movies['genres'] = df_movies['genres'].str.split('|')
# One row per (movie, genre) pair; the other columns are repeated on each row.
df_exploded_movies = df_movies.explode('genres')
df_exploded_movies  # display the long-format table

#### One-hot encoding

In [None]:
# Pivot to create a one-hot encoded DataFrame for genres.
# The aggfunc ignores its input and returns the constant 1, so every
# (movieId, genre) pair present in the exploded table becomes 1; pairs that
# never occur are filled with 0.
genres_encoding = df_exploded_movies.pivot_table(index='movieId', columns='genres', aggfunc=lambda x: 1, fill_value=0)

# Flatten the multi-level column index resulting from pivot to get simple column names.
# No `values=` was passed, so level 0 is the aggregated column and level 1 the genre.
# NOTE(review): this assumes exactly one aggregated column (presumably 'title');
# with more than one, the flattened genre names would repeat — confirm.
genres_encoding.columns = genres_encoding.columns.get_level_values(1)
genres_encoding = genres_encoding.add_prefix('genre_')

# Merge the one-hot encoded genres with the original movies DataFrame.
# 'genres' is dropped before drop_duplicates(); the left join keeps every
# movie, including any that has no row in genres_encoding.
df_movies_encoded = pd.merge(df_movies.drop(columns='genres').drop_duplicates(), genres_encoding, on='movieId', how='left')

df_movies_encoded  # display the encoded movies table

#### Merge the data

In [None]:
# Attach every rating to its movie's metadata and genre flags (movies.csv
# joined with ratings.csv); the inner join keeps only movieIds present in both.
df_ratings_movies_merged = df_movies_encoded.merge(df_ratings, how='inner', on='movieId')
df_ratings_movies_merged

In [None]:
# Add the tags a user gave to a movie. The left join keeps every rating row;
# rows without a matching (movieId, userId) pair in tags get NaN tag columns.
df_merged = df_ratings_movies_merged.merge(df_tags, how='left', on=['movieId', 'userId'])
df_merged

#### Deal with the NaN values in tags

In [None]:
# Binary indicator: 1 when the row has a tag, 0 when 'tag' is missing
df_merged['tag_present'] = (~df_merged['tag'].isna()).astype(int)

df_merged.head()

##### Refine the User-Item Matrix

In [None]:
# Users as rows, movies as columns, ratings as cells. pivot_table's default
# aggregation (mean) collapses repeated (userId, movieId) rows into one value.
user_item_matrix = df_merged.pivot_table(values='rating', index='userId', columns='movieId')
# Zero stands in for "no rating" in the dense copy used by the models
user_item_matrix_filled = user_item_matrix.fillna(value=0)
user_item_matrix_filled

In [None]:
from sklearn.metrics import mean_squared_error
from scipy.sparse.linalg import svds
import scipy.sparse as sparse


# Float-valued sparse (CSR) copy of the zero-filled user-item matrix
user_item_matrix_sparse = sparse.csr_matrix(user_item_matrix_filled.to_numpy().astype(float))

# Truncated SVD keeping 50 singular values; svds returns them as a 1-D
# array, which is expanded to a diagonal matrix for the product below
U, sigma, Vt = svds(user_item_matrix_sparse, k=50)
sigma = np.diag(sigma)

# Low-rank reconstruction of the ratings matrix: (U · Σ) · Vᵀ
predicted_ratings = (U @ sigma) @ Vt

# Re-attach the user and movie labels of the original matrix
predicted_ratings_df = pd.DataFrame(
    data=predicted_ratings,
    index=user_item_matrix.index,
    columns=user_item_matrix.columns,
)

# Quick look at the reconstructed ratings
print(predicted_ratings_df.head())

### Visualizations

In [None]:
# Histogram (10 bins) of all ratings with a KDE curve overlaid
plt.figure(figsize=(10, 6))
sns.histplot(df_ratings['rating'], bins=10, kde=True).set(
    title='Distribution of Movie Ratings',
    xlabel='Rating',
    ylabel='Frequency',
)
plt.show()

In [None]:
# Number of ratings per movie, ten largest; nlargest returns them sorted by
# count in descending order.
most_rated = df_ratings.groupby('movieId').size().nlargest(10)

# Join the counts to the movie rows BY movieId so each title is paired with
# its own count. Filtering df_movies with isin() returns rows in df_movies
# order, which does not match the count order of `most_rated`; plotting
# most_rated.values against that would label bars with the wrong titles.
most_rated_movies = (
    most_rated.rename('num_ratings')
    .reset_index()
    .merge(df_movies, on='movieId', how='left')
)

plt.figure(figsize=(12, 6))
sns.barplot(x='title', y='num_ratings', data=most_rated_movies)
plt.title('Top 10 Most Rated Movies')
plt.xlabel('Movie Title')
plt.ylabel('Number of Ratings')
plt.xticks(rotation=45, ha='right')  # long titles: tilt and right-align
plt.show()

In [None]:
# Mean rating per genre, sorted ascending. A movie with several genres
# contributes its ratings to each of them (one exploded row per genre).
avg_genre_ratings = (
    df_exploded_movies.merge(df_ratings, on='movieId')
    .groupby('genres')['rating']
    .mean()
    .sort_values()
)

plt.figure(figsize=(12, 6))
avg_genre_ratings.plot.bar()
plt.title('Average Rating by Genre')
plt.xlabel('Genre')
plt.ylabel('Average Rating')
plt.xticks(rotation=45, ha='right')
plt.show()

In [None]:
# How many ratings each user submitted (one entry per userId)
ratings_count_by_user = df_ratings['userId'].value_counts()

# Histogram (30 bins) of those per-user counts with a KDE curve overlaid
plt.figure(figsize=(10, 6))
sns.histplot(ratings_count_by_user, bins=30, kde=True).set(
    title='Distribution of Ratings Count by User',
    xlabel='Number of Ratings',
    ylabel='Number of Users',
)
plt.show()

#### Build the Recommendation Model with KNN

In [202]:
from sklearn.neighbors import KNeighborsRegressor

# k-NN needs a fully numeric matrix, so missing ratings are replaced by 0
user_item_matrix_filled = user_item_matrix.fillna(value=0)


In [None]:
# Imported here so the cell is self-contained.
from sklearn.neighbors import NearestNeighbors

# Number of neighbors returned per query. A user who is part of the fitted
# matrix is always their own closest match, so a query for such a user
# yields at most k - 1 *other* users.
k = 5

# Only the neighbor lookup (kneighbors) is needed downstream, so an
# unsupervised NearestNeighbors index is fitted on the rating vectors.
# A regressor would require a target, and user ids are not a meaningful one.
# The estimator is also built once instead of twice.
knn = NearestNeighbors(n_neighbors=k, metric='cosine')
knn.fit(user_item_matrix_filled.values)

In [207]:
def recommend_movies_knn(user_id, user_item_matrix_filled, movies_df, knn_model, num_recommendations=5):
    """Recommend movies to a user from the ratings of their nearest neighbors.

    Parameters:
        user_id: label of the user in ``user_item_matrix_filled.index``.
            A label that is not in the index raises ``KeyError``.
        user_item_matrix_filled: users x movies DataFrame of ratings in which
            a missing rating is stored as 0 (NaN is also treated as missing).
        movies_df: DataFrame with at least ``movieId`` and ``title`` columns.
        knn_model: fitted object exposing ``kneighbors(X)`` that returns
            ``(distances, indices)``, where ``indices`` are row positions in
            ``user_item_matrix_filled``.
        num_recommendations: maximum number of movies to return.

    Returns:
        DataFrame with columns ``movieId`` and ``title``, ordered from the
        highest to the lowest mean neighbor rating. It may hold fewer than
        ``num_recommendations`` rows, or be empty.
    """
    # Retrieve the ratings for the specified user
    user_ratings = user_item_matrix_filled.loc[user_id]
    user_vector = user_ratings.values.reshape(1, -1)

    # Find the k-nearest neighbors (row positions in the matrix)
    _, indices = knn_model.kneighbors(user_vector)
    neighbor_positions = indices.flatten()

    # The queried user is normally part of the fitted data and therefore their
    # own nearest neighbor; averaging them in would only echo their own ratings.
    is_other_user = user_item_matrix_filled.index[neighbor_positions] != user_id
    similar_users = user_item_matrix_filled.iloc[neighbor_positions[is_other_user]]

    # Mean rating per movie across the neighbors (unrated cells count as 0)
    mean_ratings = similar_users.mean(axis=0)

    # The matrix is zero-filled, so isna() alone would never flag anything:
    # a movie is unrated by this user when its cell is 0 (or NaN).
    unrated_movies = user_ratings.isna() | (user_ratings == 0)
    candidates = mean_ratings[unrated_movies].dropna()
    # A mean of 0 means no neighbor rated the movie: nothing to recommend.
    candidates = candidates[candidates > 0]
    recommendations = candidates.sort_values(ascending=False).head(num_recommendations)

    # Merge from the ranked ids so the titles come back in ranking order
    # rather than in movies_df row order.
    ranked = pd.DataFrame({'movieId': recommendations.index})
    return ranked.merge(movies_df[['movieId', 'title']], on='movieId', how='inner')

#### Apply the KNN Recommendation Function

In [None]:
# Request five k-NN based recommendations for one user and print the result.
user_id = 2  # Replace with any user ID from your dataset (must be a row label of the user-item matrix)
recommendations = recommend_movies_knn(user_id, user_item_matrix_filled, df_movies, knn, num_recommendations=5)
print(recommendations)

#### Evaluate the Model
Let's use regression metrics such as RMSE and MAE.

In [194]:
# drop the NaNs for comparison
# Indexing with a boolean DataFrame keeps the values where the mask is True
# and puts NaN elsewhere: predictions survive only where a real rating exists.
predicted_ratings_filled = predicted_ratings_df[user_item_matrix.notna()]
# Same mask applied to the matrix it came from, so the values are unchanged.
actual_ratings = user_item_matrix[user_item_matrix.notna()]

# NOTE(review): dropna() with default arguments removes every ROW that has at
# least one NaN, i.e. every user who has not rated all movies — this
# presumably leaves few or no rows; check the shapes printed in the next cell.
actual_ratings_no_nan = actual_ratings.dropna()
predicted_ratings_no_nan = predicted_ratings_filled.dropna()

# If dropping leads to mismatched indices, align the matrices before evaluating
# (inner join on the row index keeps only users present in both frames)
actual_ratings_aligned, predicted_ratings_aligned = actual_ratings_no_nan.align(predicted_ratings_no_nan, join='inner', axis=0)

In [None]:
# Show (rows, columns) of both aligned frames; the two shapes should match,
# and a row count of 0 means no user survived the NaN handling.
print(actual_ratings_aligned.shape)
print(predicted_ratings_aligned.shape)

In [197]:
# Verify Alignment
# Re-align the masked (not dropped) frames on their shared row labels.
# NOTE(review): this overwrites the *_no_nan variables with frames that still
# contain NaN, so the names no longer describe the content.
actual_ratings_no_nan, predicted_ratings_no_nan = actual_ratings.align(predicted_ratings_filled, join='inner', axis=0)

In [198]:
# Instead of dropping NaNs, fill them with a specific value
# NOTE(review): after filling, every unrated cell is 0 in BOTH frames, so any
# error metric computed over the full matrices counts those cells as perfect
# predictions.
actual_ratings_filled = actual_ratings.fillna(0)
predicted_ratings_filled = predicted_ratings_filled.fillna(0)

# Inner join on the row index keeps only users present in both frames
actual_ratings_aligned, predicted_ratings_aligned = actual_ratings_filled.align(predicted_ratings_filled, join='inner', axis=0)

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Score only the cells that hold a real rating. Filling the unrated cells
# with 0 in both matrices and scoring everything would count each of them as
# a perfect prediction (0 vs 0) and deflate RMSE/MAE.
# reindex_like guards against the two frames being ordered differently.
observed_mask = user_item_matrix.notna().to_numpy()
actual_ratings_aligned = user_item_matrix.to_numpy()[observed_mask]
predicted_ratings_aligned = predicted_ratings_df.reindex_like(user_item_matrix).to_numpy()[observed_mask]

# Both are 1-D arrays with one entry per observed rating
print(actual_ratings_aligned.shape, predicted_ratings_aligned.shape)

# Calculate RMSE and MAE only if there is at least one observed rating.
# NOTE: the predictions come from a factorization of this same matrix, so
# these are in-sample (training) errors, not held-out performance.
if actual_ratings_aligned.shape[0] > 0:
    rmse = np.sqrt(mean_squared_error(actual_ratings_aligned, predicted_ratings_aligned))
    mae = mean_absolute_error(actual_ratings_aligned, predicted_ratings_aligned)

    print(f"RMSE: {rmse}")
    print(f"MAE: {mae}")
else:
    print("Error: No overlapping ratings between actual and predicted data. Ensure proper alignment or handling of missing values.")

#### Apply the SVD Recommendation Function

In [None]:
# `recommend_movies` is not defined anywhere else in this notebook, so it is
# defined here; without it the call below raises NameError.
def recommend_movies(predicted_ratings_df, user_id, movies_df, num_recommendations=5, rated_matrix=None):
    """Return the movies with the highest predicted rating for one user.

    Parameters:
        predicted_ratings_df: users x movies DataFrame of predicted ratings.
        user_id: row label of the user in ``predicted_ratings_df``; a label
            that is not present raises ``KeyError``.
        movies_df: DataFrame with at least ``movieId`` and ``title`` columns.
        num_recommendations: maximum number of movies to return.
        rated_matrix: optional users x movies DataFrame in which unrated
            cells are NaN; when given, movies the user already rated are
            excluded from the result.

    Returns:
        DataFrame with columns ``movieId``, ``title`` and
        ``predicted_rating``, ordered from highest to lowest prediction.
    """
    scores = predicted_ratings_df.loc[user_id]
    if rated_matrix is not None:
        # Boolean indexing aligns on the movieId labels of both Series
        already_rated = rated_matrix.loc[user_id].notna()
        scores = scores[~already_rated]
    top = scores.sort_values(ascending=False).head(num_recommendations)

    # Merge from the ranked ids so rows stay in ranking order
    ranked = pd.DataFrame({'movieId': top.index, 'predicted_rating': top.values})
    ranked = ranked.merge(movies_df[['movieId', 'title']], on='movieId', how='left')
    return ranked[['movieId', 'title', 'predicted_rating']]


user_id = 5  # Replace with any user ID from your dataset
# user_item_matrix holds NaN for unrated movies, so it marks what to exclude
recommendations = recommend_movies(predicted_ratings_df, user_id, df_movies, num_recommendations=5, rated_matrix=user_item_matrix)
print(recommendations)