# First attempt to model content-based and collaborative recommendations for cold-start scenarios
### Running both models on user 1, treated as a cold-start user by training on only their 5 earliest ratings (user 1 has 272 ratings in total).
### This notebook does not yet include the two models, only the steps to prepare the data

In [None]:
%pip install scikit-learn

In [None]:
%pip install pandas

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split

# ============================================
# 1. LOAD ALL DATA FILES
# ============================================

# Ratings file is tab-separated; column names are supplied explicitly
ratings = pd.read_table(
    'ml-100k/u.data',
    names=['user_id', 'item_id', 'rating', 'timestamp'],
)

# Movie catalog is pipe-separated and latin-1 encoded: five descriptive
# columns followed by one column per genre
movies = pd.read_csv(
    'ml-100k/u.item',
    sep='|',
    encoding='latin-1',
    names=[
        'item_id', 'title', 'release_date', 'video_release_date', 'imdb_url',
        'unknown', 'Action', 'Adventure', 'Animation', 'Childrens', 'Comedy',
        'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
        'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
    ],
)

# User demographics, also pipe-separated (optional, for enhanced models)
users = pd.read_csv(
    'ml-100k/u.user',
    sep='|',
    names=['user_id', 'age', 'gender', 'occupation', 'zip_code'],
)

# Row counts of the three tables, followed by one blank line
print(
    f"Total ratings: {len(ratings)}",
    f"Total movies: {len(movies)}",
    f"Total users: {len(users)}",
    sep="\n",
    end="\n\n",
)

Total ratings: 100000
Total movies: 1682
Total users: 943



In [12]:
# ============================================
# 2. SELECT A SINGLE USER
# ============================================

# The user whose ratings are split into train/test below
selected_user_id = 1

# Take an independent copy of this user's rows so later changes
# cannot write back into `ratings`
user_ratings = ratings.loc[ratings['user_id'].eq(selected_user_id)].copy()

# How many movies the user rated, and how often each rating value was given
print(f"User {selected_user_id} has rated {len(user_ratings)} movies")
print("Rating Distribution (1-worst, 5-best):")
print(user_ratings['rating'].value_counts().sort_index(), end="\n\n")

User 1 has rated 272 movies
Rating Distribution (1-worst, 5-best):
rating
1    25
2    28
3    56
4    82
5    81
Name: count, dtype: int64



In [13]:
# ============================================
# 3. SPLIT INTO TRAIN AND TEST (COLD START)
# ============================================

# For COLD START: use very few ratings for training
n_train_ratings = 5  # Change this to 3, 5, 10, etc. for different cold start scenarios

# Validate the split size before slicing. A negative value would make
# head()/iloc[] count from the END of the frame and silently invert the split,
# and a value >= the number of ratings would leave the test set empty.
if not 0 <= n_train_ratings < len(user_ratings):
    raise ValueError(
        f"n_train_ratings={n_train_ratings} must be >= 0 and smaller than the "
        f"{len(user_ratings)} ratings available for user {selected_user_id}"
    )

# Sort by timestamp to get earliest ratings (simulate user just joined).
# kind='stable' keeps ratings that share a timestamp in their original order;
# the default quicksort gives no such guarantee, so tied ratings could land on
# either side of the train/test boundary.
user_ratings_sorted = user_ratings.sort_values('timestamp', kind='stable')

# Split: first N ratings = train, rest = test
train_ratings = user_ratings_sorted.head(n_train_ratings)
test_ratings = user_ratings_sorted.iloc[n_train_ratings:]

# Alternative: Random split (uncomment if you want random instead of temporal)
# train_ratings, test_ratings = train_test_split(
#     user_ratings, 
#     train_size=n_train_ratings,  # or use test_size=0.2 for percentage
#     random_state=42
# )

print(f"TRAIN set: {len(train_ratings)} ratings (what model knows)")
print(f"TEST set: {len(test_ratings)} ratings (hidden, to be predicted)")
print()

TRAIN set: 5 ratings (what model knows)
TEST set: 267 ratings (hidden, to be predicted)



In [16]:
# ============================================
# 4. JOIN WITH MOVIE INFORMATION
# ============================================

# Enrich both splits identically: movie columns are joined on item_id first,
# then the optional user demographics on user_id. Left joins keep every
# rating row even when a lookup finds no match.
train_data = (
    train_ratings
    .merge(movies, on='item_id', how='left')
    .merge(users, on='user_id', how='left')
)
test_data = (
    test_ratings
    .merge(movies, on='item_id', how='left')
    .merge(users, on='user_id', how='left')
)

# Preview the first rows of each split (user, title, rating only)
print("Training data sample:", "---------------------", sep="\n")
print(train_data.head()[['user_id', 'title', 'rating']])
print()

print("Test data sample:", "-----------------", sep="\n")
print(test_data.head()[['user_id', 'title', 'rating']])
print()

Training data sample:
---------------------
   user_id                                           title  rating
0        1          Monty Python and the Holy Grail (1974)       5
1        1                 Empire Strikes Back, The (1980)       5
2        1                         Jean de Florette (1986)       5
3        1                           Reservoir Dogs (1992)       4
4        1  Manon of the Spring (Manon des sources) (1986)       5

Test data sample:
-----------------
   user_id                           title  rating
0        1       Dead Poets Society (1989)       5
1        1  Godfather: Part II, The (1974)       4
2        1           Godfather, The (1972)       5
3        1       Fifth Element, The (1997)       4
4        1              Postino, Il (1994)       5



In [17]:
# ============================================
# 5. PREPARE FOR MODEL TRAINING
# ============================================

# For CONTENT-BASED: names of the genre columns in the movie table
genre_columns = [
    'unknown', 'Action', 'Adventure', 'Animation', 'Childrens', 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]

# Item ids of training movies the user rated 4 or higher ("liked")
liked_movies_train = train_data.loc[train_data['rating'].ge(4), 'item_id'].to_numpy()
print(f"User liked {len(liked_movies_train)} movies in training set (rating >= 4)")

# Item ids of every movie in the test set; these are the prediction targets
movies_to_predict = test_data['item_id'].to_numpy()
print(f"Need to predict ratings for {len(movies_to_predict)} movies", end="\n\n")

User liked 5 movies in training set (rating >= 4)
Need to predict ratings for 267 movies



In [18]:
# ============================================
# 6. FOR COLLABORATIVE FILTERING
# ============================================

# Every rating row that does NOT belong to the selected user
# (the pool of other users' ratings for collaborative filtering)
other_users_ratings = ratings.loc[ratings['user_id'].ne(selected_user_id)]
print(f"Other users' ratings available: {len(other_users_ratings)}")

Other users' ratings available: 99728


In [19]:
# ============================================
# 7. SUMMARY
# ============================================

# Recap of the prepared data, printed as one block: a blank line, a framed
# heading, then one line per key figure
banner = "=" * 50
print(
    "\n" + banner,
    "DATA READY FOR MODEL TRAINING!",
    banner,
    f"Selected User: {selected_user_id}",
    f"Training Size: {len(train_ratings)} ratings (COLD START)",
    f"Test Size: {len(test_ratings)} ratings",
    f"Movies in catalog: {len(movies)}",
    f"Genre features available: {len(genre_columns)}",
    sep="\n",
)


DATA READY FOR MODEL TRAINING!
Selected User: 1
Training Size: 5 ratings (COLD START)
Test Size: 267 ratings
Movies in catalog: 1682
Genre features available: 19
