In [5]:
import pandas as pd

def process_ratings(ratings_df, min_rating=2, min_count=10):
    """Build one row per user listing the movies that user rated highly.

    Parameters
    ----------
    ratings_df : pandas.DataFrame
        Must contain the columns 'userId', 'movieId' and 'rating'.
    min_rating : number, default 2
        Only ratings strictly greater than this value are kept.
    min_count : int, default 10
        Only users with strictly more than this many kept ratings are
        retained.

    Returns
    -------
    pandas.DataFrame
        Columns 'userId' and 'movies_rated_above_2'; each cell of the
        latter is a list of movieIds in their original row order. The
        column name is fixed for compatibility and does not change with
        `min_rating`.
    """
    high_ratings = ratings_df[ratings_df['rating'] > min_rating]

    # Broadcast each user's number of kept ratings back onto that user's rows
    # so the user filter is a single boolean mask.
    kept_user_ids = high_ratings['userId']
    ratings_per_user = kept_user_ids.map(kept_user_ids.value_counts())
    filtered_ratings = high_ratings[ratings_per_user > min_count]

    transactional_data = (filtered_ratings.groupby('userId')['movieId']
                          .agg(movies_rated_above_2=lambda x: x.tolist())
                          .reset_index())

    return transactional_data


def add_movie_titles(transactional_data, movies_df):
    """Replace each user's list of movieIds with the matching list of titles.

    Parameters
    ----------
    transactional_data : pandas.DataFrame
        Columns 'userId' and 'movies_rated_above_2', where each cell of the
        latter is a list of movieIds.
    movies_df : pandas.DataFrame
        Must contain the columns 'movieId' and 'title'.

    Returns
    -------
    pandas.DataFrame
        Columns 'userId' and 'movies_rated_above_2'; each cell is the list
        of titles in the same order as the input IDs. IDs with no match in
        `movies_df` yield NaN entries (left join). Users whose input list is
        empty do not appear in the result.
    """
    id_dtype = movies_df['movieId'].dtype

    # One row per (user, movie). explode avoids the wide, NaN-padded frame
    # that apply(pd.Series).stack() builds, which also turns IDs into floats.
    long_form = (transactional_data[['userId', 'movies_rated_above_2']]
                 .reset_index(drop=True)
                 .explode('movies_rated_above_2')
                 .rename(columns={'movies_rated_above_2': 'movieId'})
                 .dropna(subset=['movieId'])  # rows produced by empty lists
                 .astype({'movieId': id_dtype}))  # explode leaves object dtype

    # A left merge keeps the left-hand row order, so per-user movie order
    # is preserved.
    joined_data = long_form.merge(movies_df[['movieId', 'title']],
                                  on='movieId', how='left')

    result = (joined_data.groupby('userId')
              .agg({'title': lambda x: x.tolist()})
              .rename(columns={'title': 'movies_rated_above_2'})
              .reset_index())

    return result

# Input CSVs, read from the ml-latest-small directory relative to the
# notebook's working directory.
ratings_path = 'ml-latest-small/ratings.csv'
movies_path = 'ml-latest-small/movies.csv'

ratings_df = pd.read_csv(ratings_path)
movies_df = pd.read_csv(movies_path)

# Per-user lists of movieIds, then the same lists expressed as titles.
transactional_data = process_ratings(ratings_df)
transactional_data_with_titles = add_movie_titles(transactional_data, movies_df)

In [6]:
# Display the per-user movieId lists returned by process_ratings.
transactional_data

Unnamed: 0,userId,movies_rated_above_2
0,1,"[1, 3, 6, 47, 50, 70, 101, 110, 151, 157, 163,..."
1,2,"[318, 333, 1704, 3578, 6874, 8798, 46970, 4851..."
2,3,"[849, 1275, 1371, 1587, 2288, 2851, 3024, 3703..."
3,4,"[21, 45, 52, 58, 106, 125, 162, 171, 176, 215,..."
4,5,"[1, 21, 34, 36, 39, 50, 58, 110, 150, 153, 232..."
...,...,...
602,606,"[1, 7, 11, 15, 17, 18, 28, 29, 32, 36, 46, 47,..."
603,607,"[1, 11, 25, 34, 36, 86, 110, 150, 153, 165, 18..."
604,608,"[1, 10, 16, 21, 31, 32, 34, 39, 47, 50, 70, 88..."
605,609,"[1, 10, 110, 116, 137, 150, 161, 185, 208, 231..."


In [7]:
# Display the per-user title lists returned by add_movie_titles.
transactional_data_with_titles

Unnamed: 0,userId,movies_rated_above_2
0,1,"[Toy Story (1995), Grumpier Old Men (1995), He..."
1,2,"[Shawshank Redemption, The (1994), Tommy Boy (..."
2,3,"[Escape from L.A. (1996), Highlander (1986), S..."
3,4,"[Get Shorty (1995), To Die For (1995), Mighty ..."
4,5,"[Toy Story (1995), Get Shorty (1995), Babe (19..."
...,...,...
602,606,"[Toy Story (1995), Sabrina (1995), American Pr..."
603,607,"[Toy Story (1995), American President, The (19..."
604,608,"[Toy Story (1995), GoldenEye (1995), Casino (1..."
605,609,"[Toy Story (1995), GoldenEye (1995), Bravehear..."


In [4]:
# Report gaps in the userId sequence: every integer between the smallest and
# largest userId present that has no row in transactional_data_with_titles.
existing_user_ids = set(transactional_data_with_titles['userId'])

min_user_id, max_user_id = min(existing_user_ids), max(existing_user_ids)
all_possible_user_ids = set(range(min_user_id, max_user_id + 1))

missing_user_ids = all_possible_user_ids.difference(existing_user_ids)
missing_user_ids_sorted = sorted(missing_user_ids)

print(f"Number of missing user IDs: {len(missing_user_ids)}")
print("\nMissing user IDs:")
print(missing_user_ids_sorted)

Number of missing user IDs: 3

Missing user IDs:
[293, 442, 508]


In [10]:
from sklearn.model_selection import train_test_split

def split_dataset(transactional_data, test_size=0.2, random_state=42):
    """Split each user's movie list into a train part and a test part.

    Parameters
    ----------
    transactional_data : pandas.DataFrame
        Columns 'userId' and 'movies_rated_above_2' (a list per row).
    test_size : float, default 0.2
        Fraction of each user's list to hold out. The held-out count is
        floor(len * test_size), raised to at least 1 and capped at
        len - 1 so the train part is never empty.
    random_state : int, default 42
        Seed passed to train_test_split; the same seed is used for every
        user.

    Returns
    -------
    (train_df, test_df) : tuple of pandas.DataFrame
        Both have columns 'userId' and 'movies_rated_above_2' and one row
        per input row. A user whose list has fewer than 2 items cannot be
        split: the whole list goes to train and the test list is empty.
    """
    columns = ['userId', 'movies_rated_above_2']
    train_data = []
    test_data = []

    # Iterate the two columns directly; iterrows builds a Series per row.
    for user_id, movies in zip(transactional_data['userId'],
                               transactional_data['movies_rated_above_2']):
        n_movies = len(movies)

        if n_movies < 2:
            # train_test_split raises when either side would be empty.
            train_movies, test_movies = list(movies), []
        else:
            # At least 1 movie for the test set, at least 1 left for train.
            n_test = min(max(1, int(n_movies * test_size)), n_movies - 1)
            train_movies, test_movies = train_test_split(
                movies, test_size=n_test, random_state=random_state)

        train_data.append({'userId': user_id, 'movies_rated_above_2': train_movies})
        test_data.append({'userId': user_id, 'movies_rated_above_2': test_movies})

    # Fixing the columns keeps the schema even when there are no rows.
    train_df = pd.DataFrame(train_data, columns=columns)
    test_df = pd.DataFrame(test_data, columns=columns)

    return train_df, test_df


# Split every user's movieId list with split_dataset's default
# test_size and random_state.
train_set, test_set = split_dataset(transactional_data)

# split_dataset emits one train row and one test row per input row,
# so the two shapes are expected to match.
print("Training set shape:", train_set.shape)
print("Test set shape:", test_set.shape)

Training set shape: (607, 2)
Test set shape: (607, 2)
