In [1]:
!pip install implicit



In [4]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from google.colab import drive
from scipy.sparse import coo_matrix, csr_matrix
from scipy.sparse.linalg import svds
from sklearn.metrics.pairwise import cosine_similarity
import os
import pickle
import implicit
from implicit.als import AlternatingLeastSquares
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import joblib
from sklearn.preprocessing import MultiLabelBinarizer
import tensorflow as tf
from sklearn.preprocessing import StandardScaler
from keras import layers
from tensorflow import keras
import shap
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
import random
import time

# Code

In [5]:
# @title Функции

def top_to_parametr(k: int, n=10) -> pd.DataFrame:
    movie_id = k
    correlations_for_movie = movie_correlation_matrix[movie_id]
    top_10_movies = correlations_for_movie.sort_values(ascending=False)[1:n+1]
    top_10_movies = pd.merge(movie_details[['movieId', 'title']], top_10_movies, on='movieId', how='inner').sort_values(ascending=False, by=k)
    top_10_movies['relevance'] = top_10_movies[k]
    top_10_movies = top_10_movies.reset_index()
    del top_10_movies[k]
    del top_10_movies['index']
    return top_10_movies

def priority(movie_id: int | list = None, label: int | list = None, delete: bool = False) -> pd.DataFrame:
    cache_model_path = '/content/drive/MyDrive/ml-latest/movies_priority.pkl'

    if os.path.exists(cache_model_path):
        movies_priority = joblib.load(cache_model_path)
    else:
        movies_priority = pd.DataFrame(columns=['movieId', 'label'])
        joblib.dump(movies_priority, cache_model_path)

    if not delete:
        if isinstance(movie_id, list) and isinstance(label, list):
            for m_id, l in zip(movie_id, label):
                if m_id in movies_priority['movieId'].values:
                    existing_label = movies_priority[movies_priority['movieId'] == m_id]['label'].values[0]
                    raise PermissionError(f"Фильм с {m_id} уже в списке и имеет приоритетность {existing_label}. Сперва удалите его.")
                else:
                    new_data = pd.DataFrame({'movieId': [m_id], 'label': [l]})
                    movies_priority = pd.concat([movies_priority, new_data], ignore_index=True)
        elif movie_id is not None and label is not None:
            if movie_id in movies_priority['movieId'].values:
                existing_label = movies_priority[movies_priority['movieId'] == movie_id]['label'].values[0]
                raise PermissionError(f"Фильм с {movie_id} уже в списке и имеет приоритетность {existing_label}. Сперва удалите его.")
            else:
                new_data = pd.DataFrame({'movieId': [movie_id], 'label': [label]})
                movies_priority = pd.concat([movies_priority, new_data], ignore_index=True)
        else:
            raise ValueError("Ошибка: Не указаны необходимые параметры для добавления.")

    elif delete:
        if isinstance(movie_id, list):
            movies_priority = movies_priority[~movies_priority['movieId'].isin(movie_id)]
        elif movie_id is not None:
            movies_priority = movies_priority[movies_priority['movieId'] != movie_id]
        else:
            raise ValueError("Ошибка: Не указаны movieId для удаления.")

    joblib.dump(movies_priority, cache_model_path)
    return movies_priority

def top_IMDb_score(n=10) -> pd.DataFrame:
    C = movie_ratings['average_rating'].mean()
    m = movie_ratings['vote_count'].quantile(0.99)
    def weighted_rating(x, m=m, C=C):
        flag = x['flag']
        v = x['vote_count']
        R = x['average_rating']
        if flag == 1:
            return 5.0
        elif flag == -1:
            return 0.0
        else:
            return ((v/(v+m) * R) + (m/(m+v) * C))
    q_movies = movie_ratings.copy().loc[(movie_ratings['vote_count'] >= m) & (movie_ratings['flag'] != -1) | (movie_ratings['flag'] == 1)]
    q_movies['score'] = q_movies.apply(weighted_rating, axis=1)
    q_movies = q_movies.sort_values('score', ascending=False)
    q_movies = q_movies[['movieId', 'vote_count', 'average_rating', 'score']][0:n]
    q_movies = pd.merge(movie_details[['movieId', 'title', 'first_genre']], q_movies, on='movieId', how='inner').sort_values(ascending=False, by='score')
    q_movies = q_movies.reset_index()
    del q_movies['index']
    return q_movies

def top_in_genre_IMDb_score(n=10) -> pd.DataFrame:
    C = movie_ratings['average_rating'].mean()
    m = movie_ratings['vote_count'].quantile(0.99)
    def weighted_rating(x, m=m, C=C):
        flag = x['flag']
        v = x['vote_count']
        R = x['average_rating']
        if flag == 1:
            return 5.0
        elif flag == -1:
            return 0.0
        else:
            return ((v/(v+m) * R) + (m/(m+v) * C))
    q_movies = pd.merge(movie_ratings[['movieId', 'vote_count', 'average_rating', 'flag']],
                        movie_details[['movieId', 'title', 'first_genre']],
                        on='movieId', how='inner')
    q_movies = q_movies.loc[(q_movies['vote_count'] >= m) & (q_movies['flag'] != -1) | (q_movies['flag'] == 1)]
    q_movies['score'] = q_movies.apply(weighted_rating, axis=1)
    q_movies = q_movies.sort_values('score', ascending=False)
    q_movies = q_movies[['movieId', 'title', 'first_genre', 'vote_count', 'average_rating', 'score']]
    q_movies = q_movies.groupby('first_genre').head(1)
    q_movies = q_movies.reset_index(drop=True)
    q_movies = q_movies[0:n]
    return q_movies

def get_top_n_recommendations(user_id: int, n=20, min_avg_rating=3.0) -> pd.DataFrame:
    if user_id not in user_id_to_index:
        print(f"User {user_id} not found in filtered dataset.")
        return pd.DataFrame(columns=['movieId', 'predicted_rating'])

    user_idx = user_id_to_index[user_id]
    seen_movies = filtered_ratings[filtered_ratings['userId'] == user_id]['movieId'].values
    unseen_movie_ids = np.setdiff1d(movie_ids, seen_movies)

    movie_avg_ratings = ratings.groupby('movieId')['rating'].mean()
    user_vector = np.dot(sigma, Vt[:, user_idx])

    user_bias = user_means.get(user_idx, 0.0)

    predictions = []
    for movie_id in unseen_movie_ids:
        if movie_id in movie_avg_ratings and movie_avg_ratings[movie_id] >= min_avg_rating:
            movie_idx = movie_id_to_index.get(movie_id)
            if movie_idx is None:
                continue
            pred_rating = np.dot(U[movie_idx], user_vector) + user_bias
            predictions.append((movie_id, pred_rating))

    if not predictions:
        return top_IMDb_score(n)

    top_n = sorted(predictions, key=lambda x: x[1], reverse=True)[:n]
    df = pd.DataFrame(top_n, columns=['movieId', 'predicted_rating'])
    df = pd.merge(movie_details[['movieId', 'title']], df, on='movieId', how='inner')
    return df

#def get_als_recommendations(user_id, n=20, min_avg_rating=3.0):
#    if user_id not in user_id_to_index:
#        print(f"User {user_id} not found.")
#        pd.DataFrame(columns=['movieId', 'predicted_rating'])
#    user_idx = user_id_to_index[user_id]
#
#    if user_idx >= als_ratings.T.shape[0]:
#        print(f"user_idx {user_idx} out of bounds for ALS matrix with shape {als_ratings.T.shape}")
#        pd.DataFrame(columns=['movieId', 'predicted_rating'])
#    user_rated_movies = filtered_ratings[filtered_ratings['userId'] == user_id]['movieId'].values
#    movie_avg_ratings = ratings.groupby('movieId')['rating'].mean()
#    user_items_csr = als_ratings.T.tocsr()
#    user_items_for_recommendation = user_items_csr[user_idx]
#    recommended_movie_indices, recommended_scores = als_model.recommend(
#        user_idx,
#        user_items_for_recommendation,
#        N=100000,
#        filter_items=[
#            movie_id_to_index[mid] for mid in user_rated_movies if mid in movie_id_to_index
#        ]
#    )
#    results = []
#
#    index_to_movie_id = {index: movie_id for movie_id, index in movie_id_to_index.items()}
#    results = []
#    for movie_idx, score in zip(recommended_movie_indices, recommended_scores):
#        if movie_idx in index_to_movie_id:
#            movie_id = index_to_movie_id[movie_idx]
#            if movie_id in movie_avg_ratings and movie_avg_ratings[movie_id] >= min_avg_rating:
#                results.append((movie_id, score))
#        if len(results) >= n:
#            break
#    df = pd.DataFrame(results, columns=['movieId', 'predicted_rating'])
#    df = pd.merge(movie_details[['movieId', 'title']], df, on='movieId', how='inner')
#    return df

def hybrid_user_recommendations(user_id, n=20, min_avg_rating=3.0, alpha=0.5):
    svd_df = get_top_n_recommendations(user_id, n=1000, min_avg_rating=min_avg_rating)
    imdb_df = top_IMDb_score(n=1000)

    hybrid_df = pd.merge(svd_df, imdb_df[['movieId', 'score']], on='movieId', how='inner')
    hybrid_df['predicted_rating'] = alpha * hybrid_df['predicted_rating'] + (1 - alpha) * hybrid_df['score']
    hybrid_df = hybrid_df.sort_values(by='predicted_rating', ascending=False).head(n)

    return hybrid_df[['movieId', 'title', 'predicted_rating']]

def get_genre_recommended_movies(userId, n=10):
    user_ratings = ratings[ratings['userId'] == userId]
    user_movie_ids = user_ratings['movieId'].unique()

    user_genres = []
    for movie_id in user_movie_ids:
        genres_string = movie_details[movie_details['movieId'] == movie_id]['genres'].values[0]
        user_genres.extend(genres_string.split())
    user_genres = list(set(user_genres))

    movie_details['genres_list'] = movie_details['genres'].apply(lambda x: x.split())
    mlb = MultiLabelBinarizer()
    genre_matrix = mlb.fit_transform(movie_details['genres_list'])

    user_vector = mlb.transform([user_genres])
    similarity_scores = cosine_similarity(user_vector, genre_matrix).flatten()
    temp_movie_details = movie_details.copy()
    temp_movie_details['genre_similarity'] = similarity_scores
    temp_movie_details = temp_movie_details[~temp_movie_details['movieId'].isin(user_movie_ids)]
    recommended_movies = temp_movie_details[['movieId', 'title', 'genre_similarity']].sort_values(by='genre_similarity', ascending=False).head(n)

    return recommended_movies

def genre_collab_hybrid(user_id, n=20, min_avg_rating=3.5, alpha=0.5):
    user_ratings = ratings[ratings['userId'] == user_id]
    user_movie_ids = user_ratings['movieId'].unique()

    genre_recs = get_genre_recommended_movies(user_id, n=50)

    top_rated_movies = user_ratings.sort_values(by='rating', ascending=False)['movieId'].head(3).tolist()
    collab_recs = pd.DataFrame()
    for movie_id in top_rated_movies:
        try:
            recs = top_to_parametr(movie_id, n=20)
            recs['source_movie'] = movie_id
            collab_recs = pd.concat([collab_recs, recs], ignore_index=True)
        except Exception:
            continue

    hybrid = pd.merge(collab_recs, genre_recs, on='movieId', how='outer')

    hybrid['relevance'] = hybrid['relevance'].fillna(0.0)
    hybrid['genre_similarity'] = hybrid['genre_similarity'].fillna(0.0)

    hybrid['final_score'] = alpha * hybrid['relevance'] + (1 - alpha) * hybrid['genre_similarity']
    hybrid = hybrid[['movieId', 'final_score']].drop_duplicates()
    hybrid = pd.merge(hybrid[['movieId', 'final_score']], movie_details[['movieId', 'title']], on='movieId', how='inner')
    hybrid = pd.merge(hybrid, movie_ratings[['movieId', 'vote_count', 'average_rating']], on='movieId', how='inner')
    hybrid = hybrid.loc[hybrid['vote_count'] > 500, ['movieId', 'title', 'final_score', 'average_rating']]
    hybrid = hybrid.loc[hybrid['average_rating'] > min_avg_rating, ['movieId', 'title', 'final_score']]


    return hybrid.sort_values(by='final_score', ascending=False).head(n)

def recommend_movies_after_first_choice(viewed_movie_ids, n=15):
    if isinstance(viewed_movie_ids, int):
        viewed_movie_ids = [viewed_movie_ids]

    top_movies = top_in_genre_IMDb_score(n=n+5)
    top_movies = top_movies[~top_movies['movieId'].isin(viewed_movie_ids)]

    similar_movies = pd.DataFrame()
    for movie_id in viewed_movie_ids:
        try:
            recs = top_to_parametr(movie_id, n=7)
            recs['source_movie'] = movie_id
            similar_movies = pd.concat([similar_movies, recs], ignore_index=True)
        except Exception:
            continue

    combined_movies = pd.concat([top_movies[['movieId', 'title']], similar_movies[['movieId', 'title']]], ignore_index=True)
    combined_movies = combined_movies.drop_duplicates(subset='movieId')
    combined_movies = combined_movies.sample(frac=1).reset_index(drop=True)

    return combined_movies.head(n)

def get_user_movie_features(user_id, candidate_movie_ids=None, top_k=500):
    if candidate_movie_ids is None:
        rated_movies = ratings[ratings['userId'] == user_id]['movieId'].tolist()
        candidate_movie_ids = np.setdiff1d(movie_ids, rated_movies)

    movie_feats = movie_details[movie_details['movieId'].isin(candidate_movie_ids)].copy()
    movie_feats = pd.merge(movie_feats, movie_ratings, on='movieId', how='left')

    movie_feats['genres_list'] = movie_feats['genres'].apply(lambda x: x.split())
    mlb = MultiLabelBinarizer()
    genre_ohe = pd.DataFrame(mlb.fit_transform(movie_feats['genres_list']), columns=mlb.classes_)
    movie_feats = pd.concat([movie_feats, genre_ohe], axis=1)

    user_data = ratings[ratings['userId'] == user_id]
    user_genres = movie_details[movie_details['movieId'].isin(user_data['movieId'])]['genres'].apply(lambda x: x.split())
    user_genres_flat = [g for sublist in user_genres for g in sublist]
    genre_counts = pd.Series(user_genres_flat).value_counts(normalize=True)

    for genre in mlb.classes_:
        movie_feats[f'genre_frequency_in_user_{genre}'] = genre_counts.get(genre, 0)

    user_idx = user_id_to_index[user_id]
    pred_scores = []
    for mid in movie_feats['movieId']:
        if mid not in movie_id_to_index:
            pred_scores.append(0.0)
            continue
        movie_idx = movie_id_to_index[mid]
        score = np.dot(np.dot(U[movie_idx], sigma), Vt[:, user_idx])
        pred_scores.append(score)
    movie_feats['svd_predicted_rating'] = pred_scores

    return movie_feats

def build_training_dataset(num_users=50, positive_threshold=4.0, negative_samples_per_user=20):
    train_data = []
    eligible_users = ratings.groupby('userId').filter(lambda x: len(x) > 70)['userId'].unique()
    selected_users = np.random.choice(eligible_users, size=min(num_users, len(eligible_users)), replace=False)

    for user_id in selected_users:
        print(f"Processing user {user_id}")
        user_ratings = ratings[ratings['userId'] == user_id]
        positive_movies = user_ratings[user_ratings['rating'] >= positive_threshold]['movieId'].tolist()
        negative_pool = list(set(movie_ids) - set(user_ratings['movieId'].tolist()))
        negative_movies = np.random.choice(negative_pool, size=negative_samples_per_user, replace=False)

        pos_feats = get_user_movie_features(user_id, candidate_movie_ids=positive_movies)
        pos_feats['label'] = 1

        neg_feats = get_user_movie_features(user_id, candidate_movie_ids=negative_movies)
        neg_feats['label'] = 0

        train_data.append(pd.concat([pos_feats, neg_feats], ignore_index=True))

    train_df = pd.concat(train_data, ignore_index=True)
    train_df = train_df.drop(columns=train_df.select_dtypes(include=['object']).columns)
    return train_df

def train_random_forest_model(train_df):
    drop_cols = ['movieId', 'title', 'genres', 'genres_list', 'label']
    feature_cols = [col for col in train_df.columns if col not in drop_cols]

    X = train_df[feature_cols].values
    y = train_df['label'].values

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0, stratify=y)

    rf_model = RandomForestClassifier(min_samples_leaf=20, max_depth=5, n_estimators=20, random_state=0, n_jobs=-1)

    rf_model.fit(X_train, y_train)

    y_pred = rf_model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy * 100:.2f}%")

    return rf_model, feature_cols

def train_tensorflow_model(train_df):
    drop_cols = ['movieId', 'title', 'genres', 'genres_list', 'label']
    feature_cols = [col for col in train_df.columns if col not in drop_cols]
    train_df = train_df.fillna(0)
    X = train_df.drop('label', axis=1)
    X = X.drop('movieId', axis=1).values
    y = train_df['label'].values
    print(len(X[0]))
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0, stratify=y)

    model = keras.Sequential([
        layers.Flatten(),
        layers.Dense(32, activation='relu'),
        layers.Dense(16, activation='relu'),
        layers.Dropout(0.4),
        layers.Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    history = model.fit(X_train,
                        y_train,
                        batch_size=128,
                        epochs=20,
                        validation_data=(X_test, y_test),
                        verbose=2)

    return model, feature_cols

def rf_score_for_user(user_id, model, feature_cols, top_n=100):
    movie_feats = get_user_movie_features(user_id)
    X = movie_feats[feature_cols]
    movie_feats['rf_score'] = model.predict(X)
    top_preds = movie_feats.sort_values('rf_score', ascending=False).head(top_n)
    return top_preds[['movieId', 'rf_score']]

def apply_priority_rules(df: pd.DataFrame, user_id: int, n: int = 20) -> pd.DataFrame:
    priority_path = '/content/drive/MyDrive/ml-latest/movies_priority.pkl'
    if os.path.exists(priority_path):
        priority_df = joblib.load(priority_path)
    else:
        return df.head(n)

    banned_ids = priority_df[priority_df['label'] == -1]['movieId'].tolist()
    df = df[~df['movieId'].isin(banned_ids)]

    user_movie_count = ratings[ratings['userId'] == user_id].shape[0]
    #if user_movie_count < 20:
    return df.head(n)

    #promoted_ids = priority_df[priority_df['label'] == 1]['movieId'].tolist()
    #promoted_ids = list(set(promoted_ids) - set(df['movieId']))
    #if len(promoted_ids) >= 2:
    #    promoted_sample = random.sample(promoted_ids, 2)
    #    promoted_df = movie_details[movie_details['movieId'].isin(promoted_sample)][['movieId', 'title']]
    #    promoted_df['final_score'] = -100
    #
    #    base_df = df.head(18).copy()
    #    combined_df = pd.concat([base_df, promoted_df], ignore_index=True).drop_duplicates('movieId')
    #
    #    insert_positions = sorted(random.sample(range(10, min(len(combined_df), 20)), 2))
    #    reordered = []
    #    inserted = 0
    #    for i, row in combined_df.iterrows():
    #        if i in insert_positions:
    #            reordered.append(promoted_df.iloc[inserted])
    #            inserted += 1
    #        reordered.append(row)
    #        if inserted == 2:
    #            break
    #    for j in range(i+1, len(combined_df)):
    #        reordered.append(combined_df.iloc[j])
    #    df = pd.DataFrame(reordered).drop_duplicates('movieId')
    #
    #return df.head(n)

def SVD_genre_rf_hybrid(user_id):
        svd_df = get_top_n_recommendations(user_id, n=300)
        genre_df = get_genre_recommended_movies(user_id, n=300)
        rf_df = rf_score_for_user(user_id, rf_model, feature_cols, top_n=300)

        hybrid_df = pd.merge(svd_df, genre_df, on='movieId', how='outer')
        hybrid_df = pd.merge(hybrid_df, rf_df, on='movieId', how='outer')

        for col in ['predicted_rating', 'genre_similarity', 'rf_score']:
            hybrid_df[col] = hybrid_df[col].fillna(0)

        hybrid_df['final_score'] = (
            0.4 * hybrid_df['predicted_rating'] +
            0.3 * hybrid_df['genre_similarity'] +
            0.3 * hybrid_df['rf_score']
        )

        hybrid_df = pd.merge(hybrid_df[['movieId', 'final_score']], movie_details[['movieId', 'title']], on='movieId', how='inner')

        priority_path = '/content/drive/MyDrive/ml-latest/movies_priority.pkl'
        if os.path.exists(priority_path):
            priority_df = joblib.load(priority_path)
            banned_ids = priority_df[priority_df['label'] == -1]['movieId'].tolist()
            hybrid_df = hybrid_df[~hybrid_df['movieId'].isin(banned_ids)]
        else:
            priority_df = pd.DataFrame(columns=['movieId', 'label'])

        hybrid_df = hybrid_df.sort_values(by='final_score', ascending=False)
        top_18 = hybrid_df.head(18).copy()

        promoted_ids = priority_df[priority_df['label'] == 1]['movieId'].tolist()
        promoted_ids = list(set(promoted_ids) - set(top_18['movieId']))

        if len(promoted_ids) >= 2:
            promoted_sample = random.sample(promoted_ids, 2)
            promoted_df = movie_details[movie_details['movieId'].isin(promoted_sample)][['movieId', 'title']]
            promoted_df['final_score'] = -1
            final_df = pd.concat([top_18, promoted_df], ignore_index=True)

            insert_positions = sorted(random.sample(range(10, 20), 2))
            reordered = []
            inserted = 0
            for i in range(len(final_df)):
                if inserted < 2 and i == insert_positions[inserted]:
                    reordered.append(promoted_df.iloc[inserted])
                    inserted += 1
                reordered.append(final_df.iloc[i])
            final_df = pd.DataFrame(reordered).drop_duplicates('movieId').head(20)
            return final_df

        return top_18.head(20)

def recommend_for_user(user_id):
    user_ratings = ratings[ratings['userId'] == user_id]
    rated_movies = user_ratings['movieId'].tolist()

    if len(rated_movies) == 0:
        return apply_priority_rules(top_in_genre_IMDb_score(), user_id)
    elif len(rated_movies) <= 2:
        return apply_priority_rules(recommend_movies_after_first_choice(rated_movies), user_id)
    elif len(rated_movies) < 20:
        return apply_priority_rules(genre_collab_hybrid(user_id), user_id)
    else:
        if user_id in user_id_to_index:
            print(f"Пользователь ID: {user_id} найден в SVD. Используется гибрид SVD + genre + RF.")
            return SVD_genre_rf_hybrid(user_id)
        else:
            print(f"Пользователь {user_id} не найден в SVD. Используется genre_collab_hybrid.")
            return genre_collab_hybrid(user_id)


def create_tables(n=999999, k=999999, force_recompute=False):
    drive.mount('/content/drive')

    tags = pd.read_csv('/content/drive/MyDrive/ml-latest/genome-tags.csv', low_memory=False)

    movie_tags = pd.read_csv('/content/drive/MyDrive/ml-latest/genome-scores.csv', low_memory=False)
    movie_tags['relevance'] = movie_tags['relevance'].astype(np.float32)
    movie_tags = movie_tags[movie_tags['movieId'] < n]
    movie_tags = pd.merge(movie_tags, tags, on='tagId', how='left')

    movie_details = pd.read_csv('/content/drive/MyDrive/ml-latest/movies.csv', low_memory=False)
    movie_details = movie_details[movie_details['movieId'] < n]
    movie_details['genres'] = movie_details['genres'].str.replace(' ', '', regex=False)
    movie_details['genres'] = movie_details['genres'].str.replace('|', ' ', regex=False)
    movie_details['first_genre'] = movie_details['genres'].str.split().str[0]
    movie_details = movie_details[['movieId', 'title', 'genres', 'first_genre']]

    ratings = pd.read_csv('/content/drive/MyDrive/ml-latest/ratings.csv', low_memory=False)
    #ratings['date'] = pd.to_datetime(ratings['timestamp'], unit='s').dt.strftime('%Y-%m-%d')
    ratings = ratings[ratings['movieId'] < n]
    ratings = ratings[ratings['userId'] < k]

    movie_ratings = ratings.groupby('movieId').agg(
        average_rating=('rating', 'mean'),
        vote_count=('rating', 'count')
    ).reset_index()
    movie_ratings = pd.merge(movie_details[['movieId', 'title']], movie_ratings, on='movieId', how='inner')
    movie_ratings['flag'] = 0

    # пока не используется

    #movies = pd.read_csv('/content/drive/MyDrive/ml-latest/links.csv', low_memory=False)
    #movies = movies[movies['movieId'] < n]

    #user_tags = pd.read_csv('/content/drive/MyDrive/ml-latest/tags.csv', low_memory=False)
    ##user_tags['date'] = pd.to_datetime(user_tags['timestamp'], unit='s').dt.strftime('%Y-%m-%d')
    #user_tags = user_tags[user_tags['movieId'] < n]
    #user_tags = user_tags[user_tags['userId'] < k]

    return tags, movie_tags, movie_details, ratings, movie_ratings #, user_tags, movies

def compute_svd_with_cache(cache_path='/content/drive/MyDrive/ml-latest/svd_cache.pkl', n_components=50, force_recompute=False):
    if os.path.exists(cache_path) and not force_recompute:
        print("Загружаем SVD из кеша...")
        with open(cache_path, 'rb') as f:
            svd_data = pickle.load(f)
        return svd_data

    print("Кеш не найден. Выполняем SVD...")
    user_rating_counts = ratings.groupby('userId').size()
    filtered_ratings = ratings[ratings['userId'].isin(user_rating_counts[user_rating_counts >= 20].index)].copy()

    movie_ids = filtered_ratings['movieId'].unique()
    user_ids = filtered_ratings['userId'].unique()

    movie_id_to_index = {movie_id: idx for idx, movie_id in enumerate(movie_ids)}
    user_id_to_index = {user_id: idx for idx, user_id in enumerate(user_ids)}
    index_to_user_id = {idx: user_id for user_id, idx in user_id_to_index.items()}

    filtered_ratings['movie_idx'] = filtered_ratings['movieId'].map(movie_id_to_index)
    filtered_ratings['user_idx'] = filtered_ratings['userId'].map(user_id_to_index)

    user_means = filtered_ratings.groupby('user_idx')['rating'].mean().round(5)
    filtered_ratings['normalized_rating'] = filtered_ratings.apply(
        lambda row: row['rating'] - user_means[row['user_idx']], axis=1
    )

    ratings_sparse_normalized = coo_matrix((
        filtered_ratings['normalized_rating'],
        (filtered_ratings['movie_idx'], filtered_ratings['user_idx'])
    ))

    U, sigma_values, Vt = svds(ratings_sparse_normalized, k=n_components)
    sigma = np.diag(sigma_values)

    svd_data = {
        'U': U,
        'sigma': sigma,
        'Vt': Vt,
        'user_means': user_means,
        'movie_id_to_index': movie_id_to_index,
        'user_id_to_index': user_id_to_index,
        'index_to_user_id': index_to_user_id,
        'movie_ids': movie_ids,
        'filtered_ratings': filtered_ratings
    }

    with open(cache_path, 'wb') as f:
        pickle.dump(svd_data, f)

    return svd_data

def add_user_ratings(new_user=True, user_id=None, movie_ids=None, ratings_list=None):
    global ratings

    if type(movie_ids) == int:
        movie_ids = [movie_ids]

    if type(ratings_list) == int:
        ratings_list = float(ratings_list)
        ratings_list = [ratings_list]

    if type(ratings_list) == float:
        ratings_list = [ratings_list]

    if new_user:
        user_id = ratings['userId'].max() + 1
        print(f"Создан новый пользователь с ID: {user_id}")

    if user_id is None:
        raise ValueError("user_id должен быть указан либо через new_user=True, либо вручную.")

    if movie_ids is None or ratings_list is None or len(movie_ids) != len(ratings_list):
        raise ValueError("Нужно указать movie_ids и ratings_list одной длины.")

    timestamp = int(time.time())
    new_rows = pd.DataFrame({
        'userId': [user_id]*len(movie_ids),
        'movieId': movie_ids,
        'rating': ratings_list,
        'timestamp': [timestamp + i for i in range(len(movie_ids))]
    })

    ratings = pd.concat([ratings, new_rows], ignore_index=True)
    print(f"Добавлено {len(movie_ids)} новых оценок для пользователя {user_id}")

    return user_id

In [6]:
tags, movie_tags, movie_details, ratings, movie_ratings = create_tables()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
svd_data = compute_svd_with_cache(n_components=50, force_recompute=False)

U = svd_data['U']
sigma = svd_data['sigma']
Vt = svd_data['Vt']
user_means = svd_data['user_means']
movie_id_to_index = svd_data['movie_id_to_index']
user_id_to_index = svd_data['user_id_to_index']
index_to_user_id = svd_data['index_to_user_id']
movie_ids = svd_data['movie_ids']
filtered_ratings = svd_data['filtered_ratings']

Загружаем SVD из кеша...


In [8]:
priority(delete=True, movie_id=364)

Unnamed: 0,movieId,label
0,8,1
1,16,1
2,32,-1
3,296,-1


In [9]:
force_recompute = False

# @title Pearson correlation matrix

cache_path = '/content/drive/MyDrive/ml-latest/movie_corr.parquet'

top_movie_ids = movie_tags.groupby('movieId')['relevance'].sum().nlargest(5000).index
movie_tags_for_pivot = movie_tags[movie_tags['movieId'].isin(top_movie_ids)]
if os.path.exists(cache_path) and not force_recompute:
    print("Загружаем кэшированную матрицу сходства фильмов...")
    movie_correlation_matrix = pd.read_parquet(cache_path)
else:
    print("Рассчитываем матрицу сходства фильмов...")
    pivot_df = movie_tags_for_pivot.pivot(index='tagId', columns='movieId', values='relevance').fillna(0)
    similarity = cosine_similarity(pivot_df.T)
    movie_correlation_matrix = pd.DataFrame(similarity, index=pivot_df.columns, columns=pivot_df.columns).round(5)
    movie_correlation_matrix.to_parquet(cache_path)
    print(f"Матрица сохранена в {cache_path}")

Загружаем кэшированную матрицу сходства фильмов...


In [10]:
train_df = build_training_dataset(num_users=700)

Processing user 211382
Processing user 284909
Processing user 317736
Processing user 313165
Processing user 44546
Processing user 198282
Processing user 274029
Processing user 205787
Processing user 2839
Processing user 57313
Processing user 65672
Processing user 230947
Processing user 246968
Processing user 156136
Processing user 197901
Processing user 23130
Processing user 274147
Processing user 182081
Processing user 158098
Processing user 71548
Processing user 192387
Processing user 203623
Processing user 142607
Processing user 272324
Processing user 202040
Processing user 223360
Processing user 311689
Processing user 175888
Processing user 103801
Processing user 240772
Processing user 465
Processing user 281046
Processing user 123231
Processing user 267593
Processing user 88355
Processing user 260838
Processing user 287345
Processing user 263418
Processing user 294469
Processing user 25818
Processing user 127404
Processing user 132709
Processing user 61555
Processing user 299244
P

In [11]:
rf_model, feature_cols = train_random_forest_model(train_df)

Accuracy: 99.83%


In [None]:
# @title ALS

cache_model_path = '/content/drive/MyDrive/ml-latest/als_model.pkl'

force_recompute = False

if os.path.exists(cache_model_path) and not force_recompute:
    print("Загружаем кэшированную ALS модель...")

    als_ratings = csr_matrix((
        filtered_ratings['rating'],
        (filtered_ratings['movie_idx'], filtered_ratings['user_idx'])
    ))

    als_model = joblib.load(cache_model_path)

else:
    print("Рассчитываем ALS...")

    als_ratings = csr_matrix((
        filtered_ratings['rating'],
        (filtered_ratings['movie_idx'], filtered_ratings['user_idx'])
    ))

    als_model = AlternatingLeastSquares(
        factors=50,
        regularization=0.01,
        iterations=20,
        use_gpu=False,
    )
    als_model.fit(als_ratings)

    joblib.dump(als_model, cache_model_path)

    print(f"Модель сохранена в {cache_model_path}")

Загружаем кэшированную ALS модель...


# Final

In [9]:
movie_sample = np.random.choice(range(1, 100), size=25, replace=False)
ratings_sample = np.random.uniform(2.5, 5.0, size=25).round(0).tolist()
print(movie_sample)
print(ratings_sample)

[74 28 81  9 44 63 15  6 51 70 16 59 45 76 72 69  4 67  3 77 39 55 13 33
 83]
[3.0, 4.0, 4.0, 3.0, 4.0, 5.0, 5.0, 4.0, 3.0, 4.0, 4.0, 5.0, 3.0, 3.0, 3.0, 3.0, 3.0, 4.0, 4.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0]


In [39]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,1225734739
1,1,110,4.0,1225865086
2,1,158,4.0,1225733503
3,1,260,4.5,1225735204
4,1,356,5.0,1225735119
...,...,...,...,...
33832157,330975,8340,2.0,1091583256
33832158,330975,8493,2.5,1091585709
33832159,330975,8622,4.0,1091581777
33832160,330975,8665,3.0,1091581765


In [17]:
ratings.drop(ratings[ratings['userId'] == 330976].index, inplace=True)

In [None]:
new_user_id = add_user_ratings(new_user=True, movie_ids=movie_sample, ratings_list=ratings_sample)

In [36]:
len(ratings[ratings['userId'] == 15])

44

In [37]:
user1 = ratings[ratings['userId'] == 15]
user1 = pd.merge(user1, movie_details[['movieId', 'title']], on='movieId', how='inner')
user1

Unnamed: 0,userId,movieId,rating,timestamp,title
0,15,16,4.5,1463468527,Casino (1995)
1,15,50,4.0,1463468274,"Usual Suspects, The (1995)"
2,15,223,3.5,1463468531,Clerks (1994)
3,15,260,2.5,1463468357,Star Wars: Episode IV - A New Hope (1977)
4,15,296,4.5,1463468298,Pulp Fiction (1994)
5,15,318,3.5,1463468271,"Shawshank Redemption, The (1994)"
6,15,527,3.0,1463468278,Schindler's List (1993)
7,15,593,3.5,1463468284,"Silence of the Lambs, The (1991)"
8,15,750,3.5,1463468417,Dr. Strangelove or: How I Learned to Stop Worr...
9,15,778,4.5,1463468519,Trainspotting (1996)


In [38]:
recommend_for_user(15)

Пользователь ID: 15 найден в SVD. Используется гибрид SVD + genre + RF.




Unnamed: 0,movieId,final_score,title
0,6934,1.728546,"Matrix Revolutions, The (2003)"
1,6953,1.700289,21 Grams (2003)
2,207313,1.697945,Knives Out (2019)
3,1732,1.57379,"Big Lebowski, The (1998)"
4,589,1.571615,Terminator 2: Judgment Day (1991)
5,6,1.56978,Heat (1995)
6,111,1.562855,Taxi Driver (1976)
7,6365,1.560791,"Matrix Reloaded, The (2003)"
8,6874,1.556393,Kill Bill: Vol. 1 (2003)
9,4011,1.549387,Snatch (2000)


# Test

In [15]:
def time_based_train_test_split(ratings_df, test_size=0.2):
    ratings_sorted = ratings_df.sort_values('timestamp')
    cutoff_index = int((1 - test_size) * len(ratings_sorted))
    ratings_train = ratings_sorted.iloc[:cutoff_index]
    ratings_test = ratings_sorted.iloc[cutoff_index:]
    return ratings_train.reset_index(drop=True), ratings_test.reset_index(drop=True)

def evaluate_recommender(recommend_func, ratings_train, ratings_test, k=10, user_sample_size=200, min_test_ratings=5):
    user_groups = ratings_test.groupby('userId')

    eligible_users = [user_id for user_id, group in user_groups if len(group) >= min_test_ratings]

    if user_sample_size is not None and len(eligible_users) > user_sample_size:
        sampled_users = random.sample(eligible_users, user_sample_size)
    else:
        sampled_users = eligible_users

    precision_scores = []
    recall_scores = []

    for user_id in sampled_users:
        true_items = set(ratings_test[(ratings_test['userId'] == user_id) & (ratings_test['rating'] >= 4)]['movieId'])
        if len(true_items) == 0:
            continue

        try:
            recommendations = recommend_func(user_id)
            recommended_items = set(recommendations.head(k)['movieId'])

            true_positives = recommended_items & true_items
            precision = len(true_positives) / k
            recall = len(true_positives) / len(true_items)

            precision_scores.append(precision)
            recall_scores.append(recall)
        except:
            continue

    return {
        'Precision@k': round(np.mean(precision_scores), 4),
        'Recall@k': round(np.mean(recall_scores), 4),
        'Users tested': len(precision_scores)
    }

def run_all_evaluations(ratings_df, k=10):
    ratings_train, ratings_test = time_based_train_test_split(ratings_df)

    global ratings
    original_ratings = ratings.copy()
    ratings = ratings_train

    results = {}

    print("Оцениваем: SVD + жанры + RF (гибрид)")
    results['SVD_genre_rf_hybrid'] = evaluate_recommender(SVD_genre_rf_hybrid, ratings_train, ratings_test, k)

    print("Оцениваем: жанрово-коллаборативный гибрид")
    results['genre_collab_hybrid'] = evaluate_recommender(genre_collab_hybrid, ratings_train, ratings_test, k)

    print("Оцениваем: просто жанровая модель")
    results['get_genre_recommended_movies'] = evaluate_recommender(get_genre_recommended_movies, ratings_train, ratings_test, k)

    ratings = original_ratings

    return pd.DataFrame(results).T

In [16]:
evaluation_df = run_all_evaluations(ratings, k=10)
print(evaluation_df)

Оцениваем: SVD + жанры + RF (гибрид)




User 42461 not found in filtered dataset.




User 186739 not found in filtered dataset.
User 257788 not found in filtered dataset.




User 201840 not found in filtered dataset.




User 257326 not found in filtered dataset.




User 199812 not found in filtered dataset.




User 40656 not found in filtered dataset.




User 268992 not found in filtered dataset.




User 132728 not found in filtered dataset.
User 325729 not found in filtered dataset.




User 5444 not found in filtered dataset.




User 266373 not found in filtered dataset.
User 118694 not found in filtered dataset.




User 27866 not found in filtered dataset.




User 103131 not found in filtered dataset.
User 217460 not found in filtered dataset.
User 238236 not found in filtered dataset.




User 220313 not found in filtered dataset.




User 208811 not found in filtered dataset.




User 59402 not found in filtered dataset.
User 314065 not found in filtered dataset.




User 102262 not found in filtered dataset.
User 198122 not found in filtered dataset.




User 89944 not found in filtered dataset.




User 261865 not found in filtered dataset.




User 149741 not found in filtered dataset.




User 247691 not found in filtered dataset.




User 139148 not found in filtered dataset.




User 69586 not found in filtered dataset.




User 145394 not found in filtered dataset.




User 49539 not found in filtered dataset.




User 123720 not found in filtered dataset.




User 298745 not found in filtered dataset.




User 3866 not found in filtered dataset.
User 179818 not found in filtered dataset.




User 165346 not found in filtered dataset.
User 116411 not found in filtered dataset.




User 294612 not found in filtered dataset.




User 124698 not found in filtered dataset.




User 98150 not found in filtered dataset.
User 208190 not found in filtered dataset.
User 73934 not found in filtered dataset.
User 287243 not found in filtered dataset.
User 1727 not found in filtered dataset.




User 205639 not found in filtered dataset.
User 203238 not found in filtered dataset.
User 117616 not found in filtered dataset.




User 205260 not found in filtered dataset.




User 222502 not found in filtered dataset.




User 149883 not found in filtered dataset.




User 166799 not found in filtered dataset.
User 224417 not found in filtered dataset.




User 314831 not found in filtered dataset.




User 192706 not found in filtered dataset.
User 289767 not found in filtered dataset.




User 9291 not found in filtered dataset.




User 61840 not found in filtered dataset.
User 225675 not found in filtered dataset.




User 291746 not found in filtered dataset.
Оцениваем: жанрово-коллаборативный гибрид
Оцениваем: просто жанровая модель
                              Precision@k  Recall@k  Users tested
SVD_genre_rf_hybrid                0.0000    0.0000         141.0
genre_collab_hybrid                0.0629    0.0156          35.0
get_genre_recommended_movies       0.0240    0.0019         200.0


In [None]:
top_IMDb_score()

Unnamed: 0,movieId,title,first_genre,vote_count,average_rating,score
0,318,"Shawshank Redemption, The (1994)",Crime,122296,4.416792,4.309984
1,858,"Godfather, The (1972)",Crime,75004,4.326603,4.171094
2,50,"Usual Suspects, The (1995)",Crime,72893,4.267865,4.115529
3,527,Schindler's List (1993),Drama,84232,4.242337,4.111043
4,2959,Fight Club (1999),Action,86207,4.236019,4.10808
5,296,Pulp Fiction (1994),Comedy,108756,4.191778,4.091866
6,2571,"Matrix, The (1999)",Action,107056,4.160631,4.061946
7,1221,"Godfather: Part II, The (1974)",Crime,47271,4.26951,4.048814
8,593,"Silence of the Lambs, The (1991)",Crime,101802,4.150287,4.047899
9,58559,"Dark Knight, The (2008)",Action,65349,4.187539,4.030689


In [None]:
top_in_genre_IMDb_score()

Unnamed: 0,movieId,title,first_genre,vote_count,average_rating,score
0,318,"Shawshank Redemption, The (1994)",Crime,122296,4.416792,4.309984
1,527,Schindler's List (1993),Drama,84232,4.242337,4.111043
2,2959,Fight Club (1999),Action,86207,4.236019,4.10808
3,296,Pulp Fiction (1994),Comedy,108756,4.191778,4.091866
4,4993,"Lord of the Rings: The Fellowship of the Ring,...",Adventure,79940,4.099331,3.977774
5,4226,Memento (2000),Mystery,55649,4.143713,3.970426
6,109487,Interstellar (2014),Sci-Fi,40603,4.146972,3.921583
7,1214,Alien (1979),Horror,46572,4.069505,3.88164
8,457,"Fugitive, The (1993)",Thriller,61732,3.976876,3.841666
9,1148,Wallace & Gromit: The Wrong Trousers (1993),Animation,18597,4.111201,3.725353


In [None]:
top_to_parametr(260)

Unnamed: 0,movieId,title,relevance
0,1196,Star Wars: Episode V - The Empire Strikes Back...,0.97113
1,1210,Star Wars: Episode VI - Return of the Jedi (1983),0.96792
2,122886,Star Wars: Episode VII - The Force Awakens (2015),0.90718
3,1198,Raiders of the Lost Ark (Indiana Jones and the...,0.89863
4,166528,Rogue One: A Star Wars Story (2016),0.89616
5,1200,Aliens (1986),0.89116
6,33493,Star Wars: Episode III - Revenge of the Sith (...,0.88665
7,480,Jurassic Park (1993),0.88403
8,2628,Star Wars: Episode I - The Phantom Menace (1999),0.87765
9,1270,Back to the Future (1985),0.87325


In [None]:
recommend_movies_after_first_choice(260)

Unnamed: 0,movieId,title
0,318,"Shawshank Redemption, The (1994)"
1,527,Schindler's List (1993)
2,2959,Fight Club (1999)
3,296,Pulp Fiction (1994)
4,109487,Interstellar (2014)
5,1148,Wallace & Gromit: The Wrong Trousers (1993)
6,1198,Raiders of the Lost Ark (Indiana Jones and the...
7,166528,Rogue One: A Star Wars Story (2016)
8,1210,Star Wars: Episode VI - Return of the Jedi (1983)
9,4993,"Lord of the Rings: The Fellowship of the Ring,..."


In [None]:
get_top_n_recommendations(1)

Top 10 recommended movieIds for user 1:


Unnamed: 0,movieId,title,predicted_rating
842,1198,Raiders of the Lost Ark (Indiana Jones and the...,0.395715
2063,3147,"Green Mile, The (1999)",0.258113
4020,6377,Finding Nemo (2003),0.227251
37,47,Seven (a.k.a. Se7en) (1995),0.208475
2620,4022,Cast Away (2000),0.159786
3189,4963,Ocean's Eleven (2001),0.153115
417,608,Fargo (1996),0.146575
3786,5989,Catch Me If You Can (2002),0.141054
1410,2115,Indiana Jones and the Temple of Doom (1984),0.12715
2466,3753,"Patriot, The (2000)",0.117596


In [None]:
get_als_recommendations(1)

Unnamed: 0,movieId,title,predicted_rating
0,5042,Forbidden Zone (1980),1.622929
1,7039,Thunderheart (1992),1.560926
2,8522,My Little Chickadee (1940),1.554901
3,41769,Mozart and the Whale (2005),1.557553
4,109243,Joe (2013),1.521112
5,171339,Bear Island (1979),1.874038
6,177331,Ping Pong (2002),1.495003
7,185107,Schubert in Love (2016),1.497572
8,201298,Karmouz War (2018),1.583997
9,271793,Badhaai Do (2022),1.527865


In [None]:
hybrid_user_recommendations(1)

Top 10 recommended movieIds for user 1:


Unnamed: 0,movieId,title_svd,predicted_rating_svd,title_als,predicted_rating_als,predicted_rating
0,1198,Raiders of the Lost Ark (Indiana Jones and the...,0.395715,Raiders of the Lost Ark (Indiana Jones and the...,1.207861,0.476929
1,3147,"Green Mile, The (1999)",0.258113,"Green Mile, The (1999)",0.580509,0.290353
2,6377,Finding Nemo (2003),0.227251,Finding Nemo (2003),0.534494,0.257975
4,608,Fargo (1996),0.146575,Fargo (1996),0.774279,0.209345
5,2115,Indiana Jones and the Temple of Doom (1984),0.12715,Indiana Jones and the Temple of Doom (1984),0.910411,0.205476
3,4022,Cast Away (2000),0.159786,Cast Away (2000),0.594721,0.203279
7185,171339,Bear Island (1979),1.7e-05,Bear Island (1979),1.874038,0.187419
12,288,Natural Born Killers (1994),0.069481,Natural Born Killers (1994),1.190148,0.181548
14,1923,There's Something About Mary (1998),0.064988,There's Something About Mary (1998),1.1833,0.176819
7,364,"Lion King, The (1994)",0.10945,"Lion King, The (1994)",0.766488,0.175154


In [None]:
get_genre_recommended_movies(1)

Unnamed: 0,movieId,title,genre_similarity
4615,4719,Osmosis Jones (2001),0.68313
2895,2987,Who Framed Roger Rabbit? (1988),0.68313
1818,1907,Mulan (1998),0.68313
454,459,"Getaway, The (1994)",0.632456
4851,4956,"Stunt Man, The (1980)",0.632456
2526,2617,"Mummy, The (1999)",0.632456
2323,2414,Young Sherlock Holmes (1985),0.632456
196,198,Strange Days (1995),0.632456
661,673,Space Jam (1996),0.632456
540,546,Super Mario Bros. (1993),0.632456


In [None]:
genre_collab_hybrid(1)

Unnamed: 0,movieId,title,final_score
56,3114,Toy Story 2 (1999),0.743995
0,1,Toy Story (1995),0.47321
39,2355,"Bug's Life, A (1998)",0.465415
38,2000,Lethal Weapon (1987),0.4647
67,4306,Shrek (2001),0.45677
3,318,"Shawshank Redemption, The (1994)",0.45351
34,1704,Good Will Hunting (1997),0.451705
57,3147,"Green Mile, The (1999)",0.44693
65,4187,Lilies of the Field (1963),0.44372
4,377,Speed (1994),0.441625


In [None]:
movie_tags

Unnamed: 0,movieId,tagId,relevance,tag
0,1,1,0.03200,007
1,1,2,0.02225,007 (series)
2,1,3,0.07000,18th century
3,1,4,0.05900,1920s
4,1,5,0.12300,1930s
...,...,...,...,...
5639995,288167,1124,0.09875,writing
5639996,288167,1125,0.02950,wuxia
5639997,288167,1126,0.02275,wwii
5639998,288167,1127,0.11225,zombie


In [None]:
tags

Unnamed: 0,tagId,tag
0,1,007
1,2,007 (series)
2,3,18th century
3,4,1920s
4,5,1930s
...,...,...
1123,1124,writing
1124,1125,wuxia
1125,1126,wwii
1126,1127,zombie


In [None]:
movies

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0
...,...,...,...
86532,288967,14418234,845861.0
86533,288971,11162178,878958.0
86534,288975,70199,150392.0
86535,288977,23050520,1102551.0


In [None]:
movie_details

Unnamed: 0,movieId,title,genres,first_genre,genres_list,genre_similarity
1,2,Jumanji (1995),Adventure Children Fantasy,Adventure,"[Adventure, Children, Fantasy]",0.447214
2,3,Grumpier Old Men (1995),Comedy Romance,Comedy,"[Comedy, Romance]",0.365148
3,4,Waiting to Exhale (1995),Comedy Drama Romance,Comedy,"[Comedy, Drama, Romance]",0.447214
4,5,Father of the Bride Part II (1995),Comedy,Comedy,[Comedy],0.258199
5,6,Heat (1995),Action Crime Thriller,Action,"[Action, Crime, Thriller]",0.447214
...,...,...,...,...,...,...
4889,4994,"Majestic, The (2001)",Comedy Drama Romance,Comedy,"[Comedy, Drama, Romance]",0.447214
4891,4996,Little Otik (Otesánek) (2000),Comedy Drama Fantasy,Comedy,"[Comedy, Drama, Fantasy]",0.447214
4892,4997,"Convent, The (2000)",Horror Sci-Fi,Horror,"[Horror, Sci-Fi]",0.365148
4893,4998,"Defiant Ones, The (1958)",Adventure Crime Drama Thriller,Adventure,"[Adventure, Crime, Drama, Thriller]",0.516398


In [None]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,1225734739
1,1,110,4.0,1225865086
2,1,158,4.0,1225733503
3,1,260,4.5,1225735204
4,1,356,5.0,1225735119
...,...,...,...,...
33832157,330975,8340,2.0,1091583256
33832158,330975,8493,2.5,1091585709
33832159,330975,8622,4.0,1091581777
33832160,330975,8665,3.0,1091581765


In [None]:
user_tags

Unnamed: 0,userId,movieId,tag,timestamp
0,10,260,good vs evil,1430666558
1,10,260,Harrison Ford,1430666505
2,10,260,sci-fi,1430666538
3,14,1221,Al Pacino,1311600756
4,14,1221,mafia,1311600746
...,...,...,...,...
2328310,330923,176599,politically correct,1507547491
2328311,330933,3317,coming of age,1351279384
2328312,330933,3317,sexuality,1351279389
2328313,330947,5782,Not Luc Besson,1154110902


In [None]:
movie_ratings

Unnamed: 0,movieId,title,average_rating,vote_count,flag
0,1,Toy Story (1995),3.843670,1177,0
1,2,Jumanji (1995),3.292952,454,0
2,3,Grumpier Old Men (1995),3.120172,233,0
3,4,Waiting to Exhale (1995),3.177419,31,0
4,5,Father of the Bride Part II (1995),3.150826,242,0
...,...,...,...,...,...
4451,4995,"Beautiful Mind, A (2001)",3.887073,673,0
4452,4996,Little Otik (Otesánek) (2000),2.700000,5,0
4453,4997,"Convent, The (2000)",2.500000,1,0
4454,4998,"Defiant Ones, The (1958)",3.666667,6,0
