In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested per read while streaming a download to disk.
CHUNK_SIZE = 40960
# Comma-separated list of `<directory>:<percent-encoded download URL>` entries.
# Each entry is split on ':' and the URL part is percent-decoded by the download
# loop below; <directory> becomes the sub-folder created under KAGGLE_INPUT_PATH.
# NOTE(review): the URL embeds X-Goog-Date / X-Goog-Expires / X-Goog-Signature
# query parameters, so it is presumably a time-limited signed link - confirm it
# is still valid before relying on this cell.
DATA_SOURCE_MAPPING = 'movie-recommendation-system:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F3375918%2F5872805%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240610%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240610T184202Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D39eb49bbed1e9627e2b13438039a8726b1e0bc96f8a3f1b1a45b6e1fea25715bebb48219090bf158af90d0bdce916516e7a21dfe6f15b40c962edfc2c155b8c2f472a6e35f77129cbb09030edaaf2826430ff1a67a25f2ab8050ebc86138f0c34061a1aa845131c4f7f0d2b51e9e35010e85a4d365a4cc78bfbea1d4105525833fef2bdc339417eee55801977405e1bb99bb6fdd6738ea97a175b6542ac1b3ae3a35a10e29c98b33f07296f8a97f3c93918ecb14b593da01869eca3709385129c81adaa0c3ac6104a417cc6d68c6d8518356c302454998748c29e67b74c672b5dfd353da4211685346bcfb5ba43e9abf25d0cb430ff1d4366fedf09f62cf9e46'

# Directory that receives the extracted data sources (one sub-folder per entry).
KAGGLE_INPUT_PATH='/kaggle/input'
# Directory created alongside the input directory for notebook outputs.
KAGGLE_WORKING_PATH='/kaggle/working'
# NOTE(review): not referenced anywhere in the visible code.
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every data source listed in DATA_SOURCE_MAPPING into a temporary
# file and unpack it under KAGGLE_INPUT_PATH/<directory>. A failed download is
# reported and skipped so the remaining sources are still attempted.
# NOTE(review): extractall() trusts the member paths stored in the archive;
# only use this with archives from a trusted source.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only, so an unexpected literal ':' in the URL part
    # cannot break the unpacking into exactly two fields.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            # The Content-Length header is optional; without it the progress
            # bar stays empty instead of crashing on int(None).
            total_bytes = int(total_length) if total_length else 0
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = min(50, int(50 * dl / total_bytes)) if total_bytes > 0 else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # The tar branch reopens the file by name, so buffered bytes must
            # reach the disk before extraction starts.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name: the original `as tarfile`
                # rebound (shadowed) the imported `tarfile` module.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


Downloading movie-recommendation-system, 172958815 bytes compressed
Downloaded and uncompressed: movie-recommendation-system
Data source import complete.


In [None]:
!pip install scikit-surprise tensorflow wurlitzer



In [None]:
import numpy as np
import scipy
import pandas as pd
import math
import random
import sklearn
import scipy.sparse
import tensorflow as tf

from nltk.corpus import stopwords
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split as sk_train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel
from sklearn.preprocessing import normalize
from scipy.sparse.linalg import svds
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import os
from surprise import Dataset, Reader, SVD, accuracy
from surprise.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from collections import defaultdict
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Flatten, Conv1D, MaxPooling1D, ZeroPadding1D
from math import sqrt

In [None]:
# Print the full path of every file found below the Kaggle input directory.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))


/kaggle/input/movie-recommendation-system/ratings.csv
/kaggle/input/movie-recommendation-system/movies.csv


In [None]:
# Read the movie catalogue (movieId, title, genres) and preview its first rows.
movies_file = '/kaggle/input/movie-recommendation-system/movies.csv'
movies = pd.read_csv(filepath_or_buffer=movies_file)
movies.head(n=10)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [None]:
# Read the ratings table (userId, movieId, rating, timestamp) and preview it.
# Note that `users` holds one row per rating, not one row per user.
users_file = '/kaggle/input/movie-recommendation-system/ratings.csv'
users = pd.read_csv(filepath_or_buffer=users_file)
users.head(n=10)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
5,1,1088,4.0,1147868495
6,1,1175,3.5,1147868826
7,1,1217,3.5,1147878326
8,1,1237,5.0,1147868839
9,1,1250,4.0,1147868414


In [None]:
# Attach title and genres to every rating (default inner join on movieId).
movie_dataframe = users.merge(movies, on='movieId')

movie_dataframe.head(n=10)

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,296,5.0,1147880044,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
1,3,296,5.0,1439474476,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
2,4,296,4.0,1573938898,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
3,5,296,4.0,830786155,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
4,7,296,4.0,835444730,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
5,8,296,5.0,890489713,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
6,10,296,4.5,1227571308,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
7,12,296,5.0,1119354604,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
8,13,296,5.0,1238029599,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
9,14,296,5.0,1506208897,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller


In [None]:
# Fraction of the ratings kept from every sampling period, and the seed that
# makes both the sampling and the train/test split reproducible.
SAMPLE_FRACTION = 0.05
RANDOM_SEED = 42

print('#Total dataset: %d' % len(movie_dataframe))
# The raw timestamps are Unix epoch seconds (e.g. 1147880044). Without
# unit='s' pandas reads integers as nanoseconds, which places every rating
# within seconds of 1970-01-01 and collapses all of them into a single period.
movie_dataframe['timestamp'] = pd.to_datetime(movie_dataframe['timestamp'], unit='s')

# Sort the data by the timestamp column
movie_dataframe = movie_dataframe.sort_values(by='timestamp')

# Sampling interval: 'W' for week, 'M' for month, 'D' for day, etc.
intervalo_de_amostragem = 'W'

# Draw the same fraction of ratings from every period. GroupBy.sample returns
# plain rows of the original frame (original index, all columns), so no extra
# index level is added and empty periods are simply absent.
sampling_period = (
    movie_dataframe['timestamp']
    .dt.to_period(intervalo_de_amostragem)
    .rename('sampling_period')
)
sample_movie_df = movie_dataframe.groupby(sampling_period).sample(
    frac=SAMPLE_FRACTION, random_state=RANDOM_SEED
)
# Store the timestamp as integer epoch seconds again (the unit of the raw data).
sample_movie_df = sample_movie_df.assign(
    timestamp=(sample_movie_df['timestamp'] - pd.Timestamp('1970-01-01')) // pd.Timedelta(seconds=1)
)
min_samples_per_user = 2

# Keep only the users that have at least the minimum number of sampled ratings;
# the stratified split below needs at least two rows per user.
ratings_per_user = sample_movie_df['userId'].value_counts()
valid_users = ratings_per_user[ratings_per_user >= min_samples_per_user].index

# Restrict the sample to those users
sample_movie_df_filtered = sample_movie_df[sample_movie_df['userId'].isin(valid_users)]
print('#Sample dataset: %d' % len(sample_movie_df_filtered))

movie_train_df, movie_test_df = sk_train_test_split(sample_movie_df_filtered,
                                   stratify=sample_movie_df_filtered['userId'],
                                   test_size=0.20,
                                   random_state=RANDOM_SEED)
print('#Train set: %d' % len(movie_train_df))
print('#Test set: %d' % len(movie_test_df))

#Total dataset: 25000095
#Sample dataset: 1225939
#Train set: 980751
#Test set: 245188


In [None]:
class ColaborativeFilteringRecommender:
    """Collaborative-filtering recommender backed by Surprise's SVD algorithm.

    Attributes:
        train_data: DataFrame with at least the columns 'userId', 'movieId'
            and 'rating'.
        algo: The fitted SVD model, or None until train() has been called.
    """

    def __init__(self, train_data):
        self.train_data = train_data
        self.algo = None

    def prepare_data(self):
        """Wrap train_data in a Surprise Dataset.

        The rating scale passed to the Reader is the min/max rating observed
        in train_data.
        """
        reader = Reader(rating_scale=(self.train_data['rating'].min(), self.train_data['rating'].max()))
        data = Dataset.load_from_df(self.train_data[['userId', 'movieId', 'rating']], reader)
        return data

    def train(self):
        """Fit SVD on 80% of the data and print metrics for the other 20%."""
        data = self.prepare_data()
        trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

        self.algo = SVD()
        self.algo.fit(trainset)

        self.evaluate(testset)

    def evaluate(self, testset):
        """Print RMSE, MAE and mean precision/recall@5 for testset.

        Returns:
            dict with the keys 'rmse', 'mae', 'precision' and 'recall'.
            Precision and recall are 0.0 when testset yields no predictions.
        """
        predictions = self.algo.test(testset)

        # accuracy.rmse / accuracy.mae also print their own summary line.
        rmse = accuracy.rmse(predictions)
        mae = accuracy.mae(predictions)

        precisions, recalls = self.precision_recall_at_k(predictions, k=5, threshold=3.5)

        # Average the per-user values; an empty test set must not divide by zero.
        mean_precision = sum(precisions.values()) / len(precisions) if precisions else 0.0
        mean_recall = sum(recalls.values()) / len(recalls) if recalls else 0.0

        print(f'EQMR: {rmse}')
        print(f'EMA: {mae}')
        print(f'Precisão: {mean_precision}')
        print(f'Revocação: {mean_recall}')

        return {'rmse': rmse, 'mae': mae, 'precision': mean_precision, 'recall': mean_recall}

    def precision_recall_at_k(self, predictions, k=10, threshold=3.5):
        """Compute per-user precision@k and recall@k.

        Args:
            predictions: Iterable of 5-tuples
                (uid, iid, true_rating, estimated_rating, details).
            k: Number of top-estimated items considered per user.
            threshold: Ratings >= threshold count as relevant (true rating)
                or recommended (estimated rating).

        Returns:
            (precisions, recalls): two dicts keyed by uid. A value is 0 when
            its denominator (recommended / relevant item count) is zero.
        """
        user_est_true = defaultdict(list)
        for uid, _, true_r, est, _ in predictions:
            user_est_true[uid].append((est, true_r))

        precisions = {}
        recalls = {}

        for uid, user_ratings in user_est_true.items():
            # Highest estimated ratings first, so [:k] is the user's top-k.
            user_ratings.sort(key=lambda x: x[0], reverse=True)
            # Relevant items are counted over ALL of the user's ratings.
            n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)
            n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])
            n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold)) for (est, true_r) in user_ratings[:k])

            precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0
            recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 0

        return precisions, recalls

    def predict(self, userId, movieId):
        """Return the estimated rating of userId for movieId."""
        return self.algo.predict(userId, movieId).est

    def recommend(self, userId, n=10):
        """Return the n best-scored movies the user has not rated yet.

        Returns:
            List of (movieId, estimated_rating) tuples, best first.
        """
        # A set gives O(1) membership tests; the list used previously made the
        # filter below quadratic in the number of movies.
        seen_movies = set(self.train_data.loc[self.train_data['userId'] == userId, 'movieId'])
        all_movies = self.train_data['movieId'].unique()
        predictions = [
            (movieId, self.predict(userId, movieId))
            for movieId in all_movies
            if movieId not in seen_movies
        ]
        predictions.sort(key=lambda x: x[1], reverse=True)
        return predictions[:n]

In [None]:
# Fit the SVD recommender on the filtered sample; train() holds out 20% of it
# internally and prints the evaluation metrics.
# NOTE(review): movie_train_df / movie_test_df from the earlier split are not
# used here - confirm that training on the full filtered sample is intended.
cf_recommender = ColaborativeFilteringRecommender(train_data=sample_movie_df_filtered)
cf_recommender.train()

RMSE: 0.9114
MAE:  0.7000
EQMR: 0.9113767522809759
EMA: 0.700020664313897
Precisão: 0.6088707154143318
Revocação: 0.5852822616909387


In [None]:
class CNNRecommender:
    """Rating predictor: shared embedding + 1-D convolution over (user, movie).

    Every sample is the two-token sequence [user_token, movie_token]. Both
    tokens are looked up in ONE embedding table, so movie tokens are shifted
    by the number of users to keep the two id spaces disjoint.

    Attributes:
        cf_model: Object exposing `train_data` (DataFrame with 'userId',
            'movieId', 'rating') and `prepare_data()` returning a Surprise
            Dataset.
        cnn_model: The Keras model, or None until train() has been called.
        user_map: raw userId -> contiguous index (filled by prepare_data()).
        movie_map: raw movieId -> contiguous index (filled by prepare_data()).
    """

    def __init__(self, cf_model):
        self.cf_model = cf_model
        self.cnn_model = None
        self.user_map = {}
        self.movie_map = {}

    def _encode_pairs(self, user_indices, movie_indices):
        """Build the (n, 2) model input from contiguous user/movie indices."""
        user_tokens = np.asarray(user_indices, dtype=np.int64)
        # Offset movies past the user range so a user and a movie never share
        # the same embedding row.
        movie_tokens = np.asarray(movie_indices, dtype=np.int64) + len(self.user_map)
        return np.column_stack([user_tokens, movie_tokens])

    def _rows_to_arrays(self, rows):
        """Turn [(user_idx, movie_idx, rating), ...] into (X, y) arrays."""
        if rows:
            user_indices, movie_indices, ratings = zip(*rows)
        else:
            user_indices, movie_indices, ratings = (), (), ()
        return self._encode_pairs(user_indices, movie_indices), np.asarray(ratings, dtype=float)

    def prepare_data(self):
        """Create disjoint train/test arrays from the collaborative model's data.

        The 80/20 split is the Surprise split with random_state=42. Training
        samples come from the train part only; building them from the whole
        table (as before) put every test row into the training set as well.

        Returns:
            X_train, y_train, X_test, y_test - X arrays have shape (n, 2) and
            hold [user_token, movie_token]; y arrays hold the ratings.
        """
        data = self.cf_model.prepare_data()
        trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

        ratings = self.cf_model.train_data
        # Index every user/movie of the full table so test rows are always
        # encodable, even when they do not occur in the train part.
        self.user_map = {user_id: idx for idx, user_id in enumerate(ratings['userId'].unique())}
        self.movie_map = {movie_id: idx for idx, movie_id in enumerate(ratings['movieId'].unique())}

        # Trainset stores Surprise's inner ids; convert back to raw ids first.
        train_rows = [
            (
                self.user_map[trainset.to_raw_uid(inner_uid)],
                self.movie_map[trainset.to_raw_iid(inner_iid)],
                rating,
            )
            for inner_uid, inner_iid, rating in trainset.all_ratings()
        ]
        test_rows = [
            (self.user_map[uid], self.movie_map[mid], true_r)
            for (uid, mid, true_r) in testset
            if uid in self.user_map and mid in self.movie_map
        ]

        X_train, y_train = self._rows_to_arrays(train_rows)
        X_test, y_test = self._rows_to_arrays(test_rows)

        return X_train, y_train, X_test, y_test

    def train(self):
        """Build, fit (3 epochs) and evaluate the network."""
        X_train, y_train, X_test, y_test = self.prepare_data()

        # One embedding row per user plus one per movie (see _encode_pairs).
        vocabulary_size = len(self.user_map) + len(self.movie_map)

        self.cnn_model = Sequential()
        self.cnn_model.add(Embedding(input_dim=vocabulary_size, output_dim=50, input_length=2))
        # Pad the 2-token sequence to 4 so a width-3 convolution fits.
        self.cnn_model.add(ZeroPadding1D(padding=1))
        self.cnn_model.add(Conv1D(32, 3, activation='relu'))
        self.cnn_model.add(MaxPooling1D(2))
        self.cnn_model.add(Flatten())
        self.cnn_model.add(Dense(64, activation='relu'))
        self.cnn_model.add(Dense(1))

        self.cnn_model.compile(optimizer='adam', loss='mean_squared_error')

        self.cnn_model.fit(X_train, y_train, epochs=3, batch_size=32)

        self.evaluate(X_test, y_test)

    def evaluate(self, X_test, y_test):
        """Print RMSE, MAE, precision and recall (relevance cut-off 3.5).

        Returns:
            dict with the keys 'rmse', 'mae', 'precision' and 'recall'.
            Precision / recall are 0.0 when nothing is predicted / actually
            relevant (previously a ZeroDivisionError).
        """
        y_true = np.asarray(y_test, dtype=float)
        y_pred = np.asarray(self.cnn_model.predict(X_test)).flatten()

        rmse = np.sqrt(np.mean((y_true - y_pred) ** 2))
        mae = np.mean(np.abs(y_true - y_pred))

        print(f'EQMR: {rmse}')
        print(f'EMA: {mae}')

        predicted_relevant = y_pred >= 3.5
        actually_relevant = y_true >= 3.5
        true_positives = int(np.sum(predicted_relevant & actually_relevant))
        n_predicted = int(np.sum(predicted_relevant))
        n_relevant = int(np.sum(actually_relevant))

        precision = true_positives / n_predicted if n_predicted else 0.0
        recall = true_positives / n_relevant if n_relevant else 0.0

        print(f'Precisão: {precision}')
        print(f'Revocação: {recall}')

        return {'rmse': float(rmse), 'mae': float(mae), 'precision': precision, 'recall': recall}

    def predict(self, userId, movieId):
        """Return the predicted rating of raw userId for raw movieId.

        Raises:
            KeyError: if the user or the movie is not in the id maps.
        """
        pair = self._encode_pairs([self.user_map[userId]], [self.movie_map[movieId]])
        return np.asarray(self.cnn_model.predict(pair, verbose=0)).flatten()[0]

    def recommend(self, userId, n=10):
        """Return the n best-scored movies the user has not rated yet.

        Returns:
            List of (movieId, predicted_rating) tuples, best first.

        Raises:
            KeyError: if userId is not in the id maps.
        """
        user_idx = self.user_map[userId]
        ratings = self.cf_model.train_data
        # Computed once; the previous version re-filtered the whole table for
        # every candidate movie.
        seen_movies = set(ratings.loc[ratings['userId'] == userId, 'movieId'])
        candidates = [movie_id for movie_id in ratings['movieId'].unique() if movie_id not in seen_movies]
        if not candidates:
            return []

        # Score all candidates in one batch. The previous version called
        # predict() per movie with the already-mapped user index, which then
        # got mapped a second time.
        pairs = self._encode_pairs(
            [user_idx] * len(candidates),
            [self.movie_map[movie_id] for movie_id in candidates],
        )
        scores = np.asarray(self.cnn_model.predict(pairs, verbose=0)).flatten()

        predictions = sorted(zip(candidates, scores), key=lambda x: x[1], reverse=True)
        return predictions[:n]

In [None]:
# Train the CNN on the same data the collaborative-filtering model holds;
# train() prints the Keras progress followed by the evaluation metrics.
cnn_recommender = CNNRecommender(cf_model=cf_recommender)
cnn_recommender.train()

Epoch 1/3
Epoch 2/3
Epoch 3/3
EQMR: 0.7463394946731559
EMA: 0.5449784967545062
Precisão: 0.8727521123832435
Revocação: 0.7689600355878871
