In [2]:
import sys
import os

# Make the parent of the current working directory importable. That parent is
# treated as the project root, so `src` resolves as a package; the `src`
# folder itself must NOT be put on sys.path.
# NOTE(review): this relies on the kernel's working directory being exactly
# one level below the project root — confirm when moving the notebook.
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
root_already_on_path = project_root in sys.path
if not root_already_on_path:
    # Appended (not prepended), so existing sys.path entries keep priority.
    sys.path.append(project_root)

In [3]:
from src.data_loader import load_movielens_data, preprocess_data
import pandas as pd

In [4]:
# Load the MovieLens tables through the project loader. The four return
# values are unpacked as: the merged frame, then the ratings, users and
# items frames.
(
    raw_data,
    ratings_df,
    users_df,
    items_df,
) = load_movielens_data()

In [5]:
# Report the (rows, columns) shape of every loaded table, one line per table.
print("Data Shapes:")
for label, frame in (
    ("Raw merged data", raw_data),
    ("Ratings data", ratings_df),
    ("Users data", users_df),
    ("Items data", items_df),
):
    print(f"- {label}: {frame.shape}")

Data Shapes:
- Raw merged data: (100000, 31)
- Ratings data: (100000, 4)
- Users data: (943, 5)
- Items data: (1682, 24)


In [7]:
# Run the project's preprocessing step on the ratings table and keep the
# result under a new name, so `ratings_df` stays available for comparison.
# NOTE(review): the filtering rule lives in src/data_loader.py and is not
# visible here; presumably it drops low-activity users — confirm. In the
# recorded run it removed no rows.
filtered_ratings = preprocess_data(ratings_df)

In [8]:
# Summarise the effect of preprocessing: record counts before and after,
# and how many distinct users disappeared from the ratings table.
n_original_records = ratings_df.shape[0]
n_filtered_records = filtered_ratings.shape[0]
n_users_before = ratings_df['user_id_ml'].nunique()
n_users_after = filtered_ratings['user_id_ml'].nunique()

print("\nPreprocessing Results:")
print(f"Original ratings: {n_original_records} records")
print(f"Filtered ratings: {n_filtered_records} records")
print(f"Users removed: {n_users_before - n_users_after}")


Preprocessing Results:
Original ratings: 100000 records
Filtered ratings: 100000 records
Users removed: 0


In [9]:
# Reshape the long ratings table into a user x item matrix: one row per
# user id, one column per item id, each cell holding the rating.
# Pairs without a rating become NaN.
user_item_matrix = pd.pivot_table(
    filtered_ratings,
    values='rating_ml',
    index='user_id_ml',
    columns='item_id_ml',
    aggfunc='mean',  # pandas' default, made explicit: duplicate (user, item) rows are averaged
)

In [11]:
# Persist the user-item matrix under <project_root>/results and preview it.
# The output location is derived from `project_root` (computed in the first
# cell) instead of a machine-specific absolute path, so the notebook runs
# from any checkout location and on any OS.
results_dir = os.path.join(project_root, 'results')
os.makedirs(results_dir, exist_ok=True)  # first run: the folder may not exist yet

# The index is written too (to_csv default), so user ids are kept as the
# first CSV column.
user_item_matrix.to_csv(os.path.join(results_dir, 'user_item_matrix.csv'))
print("\nUser-Item Matrix Created:")
print(f"Matrix shape: {user_item_matrix.shape}")
# Only the top-left 5x5 corner is shown; the full matrix is large and mostly NaN.
print(f"Example matrix values:\n{user_item_matrix.iloc[:5, :5]}")


User-Item Matrix Created:
Matrix shape: (943, 1682)
Example matrix values:
item_id_ml    1    2    3    4    5
user_id_ml                         
1           5.0  3.0  4.0  3.0  3.0
2           4.0  NaN  NaN  NaN  NaN
3           NaN  NaN  NaN  NaN  NaN
4           NaN  NaN  NaN  NaN  NaN
5           4.0  3.0  NaN  NaN  NaN


In [16]:
from sklearn.model_selection import train_test_split

In [17]:
# Hold out 20% of the rating records for evaluation. Stratifying on the user
# id keeps each user's records split in roughly the same 80/20 proportion,
# and the fixed random_state makes the split reproducible across runs.
user_labels = filtered_ratings['user_id_ml']
split_options = {
    'test_size': 0.2,
    'random_state': 42,
    'stratify': user_labels,
}
train_ratings, test_ratings = train_test_split(filtered_ratings, **split_options)

In [18]:
# Persist both halves of the split under <project_root>/results.
# The output location is derived from `project_root` (computed in the first
# cell) instead of a machine-specific absolute path, so the notebook runs
# from any checkout location and on any OS.
results_dir = os.path.join(project_root, 'results')
os.makedirs(results_dir, exist_ok=True)  # first run: the folder may not exist yet

# index=False: the row index is not written to the CSV files.
train_ratings.to_csv(os.path.join(results_dir, 'train_ratings.csv'), index=False)
test_ratings.to_csv(os.path.join(results_dir, 'test_ratings.csv'), index=False)

print("\nTrain-Test Split Created:")
print(f"Training set: {train_ratings.shape[0]} records")
print(f"Test set: {test_ratings.shape[0]} records")


Train-Test Split Created:
Training set: 80000 records
Test set: 20000 records
