1. Download and Load MovieLens 100K Dataset

In [1]:
# Step 1: Download MovieLens 100K Dataset from KaggleHub
import kagglehub
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Download dataset
path = kagglehub.dataset_download("prajitdatta/movielens-100k-dataset")
print("📂 Dataset path:", path)

# Dataset files are in this subfolder
base_path = path + "/ml-100k"
ratings_path = base_path + "/u.data"
movies_path = base_path + "/u.item"

# Load data
ratings = pd.read_csv(ratings_path, sep="\t", names=["user_id", "item_id", "rating", "timestamp"])
movies = pd.read_csv(movies_path, sep="|", encoding="latin-1", header=None, usecols=[0, 1], names=["item_id", "title"])


Downloading from https://www.kaggle.com/api/v1/datasets/download/prajitdatta/movielens-100k-dataset?dataset_version_number=1...


100%|██████████| 4.77M/4.77M [00:01<00:00, 4.90MB/s]

Extracting files...





📂 Dataset path: /root/.cache/kagglehub/datasets/prajitdatta/movielens-100k-dataset/versions/1


2. Create User-Item Matrix

In [2]:
# Step 2: Create User-Item Matrix
user_item_matrix = ratings.pivot_table(index="user_id", columns="item_id", values="rating")
user_item_matrix.fillna(0, inplace=True)


3. Split Ratings Data into Train and Test Sets (User-wise)

In [3]:
# Step 3: Split Data into Train and Test
from sklearn.model_selection import train_test_split

# We'll simulate a test split by hiding 20% of each user's ratings
def train_test_split_userwise(df, test_size=0.2):
    train_list = []
    test_list = []
    for user in df["user_id"].unique():
        user_data = df[df["user_id"] == user]
        if len(user_data) >= 5:
            train, test = train_test_split(user_data, test_size=test_size, random_state=42)
            train_list.append(train)
            test_list.append(test)
        else:
            train_list.append(user_data)
    return pd.concat(train_list), pd.concat(test_list)

ratings_train, ratings_test = train_test_split_userwise(ratings)


4. Define Movie Recommendation Function (User-Based Filtering)

In [4]:
# Step 4: Recommendation Function
def recommend_movies(user_id, k=5):
    if user_id not in user_item_matrix.index:
        return []

    user_vector = user_item_matrix.loc[[user_id]]
    similarities = cosine_similarity(user_vector, user_item_matrix)[0]

    # Filter weak similarities
    similarity_threshold = 0.1
    similarities[similarities < similarity_threshold] = 0

    weighted_ratings = np.dot(similarities, user_item_matrix.values)
    sim_sum = np.array([similarities.sum()] * user_item_matrix.shape[1])
    scores = weighted_ratings / np.where(sim_sum == 0, 1e-8, sim_sum)

    # Exclude movies the user has already rated
    user_rated_items = set(user_item_matrix.loc[user_id][user_item_matrix.loc[user_id] > 0].index)
    unseen_items = [i for i in user_item_matrix.columns if i not in user_rated_items]

    movie_scores = [(item, scores[idx]) for idx, item in enumerate(user_item_matrix.columns) if item in unseen_items]
    top_k = sorted(movie_scores, key=lambda x: x[1], reverse=True)[:k]

    return [movies[movies["item_id"] == item]["title"].values[0] for item, _ in top_k]


5. Evaluate the Model using Precision@K

In [5]:
# Step 5: Evaluate Precision@K
def precision_at_k(actual, predicted, k):
    if len(predicted) > k:
        predicted = predicted[:k]
    return len(set(predicted) & set(actual)) / k

# Choose test users with enough test ratings
min_ratings = 5
user_rating_counts = ratings_test["user_id"].value_counts()
eligible_users = user_rating_counts[user_rating_counts >= min_ratings].index.tolist()
test_users = np.random.choice(eligible_users, size=min(100, len(eligible_users)), replace=False)

k = 5
precisions = []

for user_id in test_users:
    actual_movies = ratings_test[ratings_test["user_id"] == user_id]["item_id"].tolist()
    recommended_titles = recommend_movies(user_id, k=k)

    # Convert actual item_ids to titles
    actual_titles = movies[movies["item_id"].isin(actual_movies)]["title"].tolist()

    prec = precision_at_k(actual_titles, recommended_titles, k)
    precisions.append(prec)

print(f"📈 Average Precision@{k}: {np.mean(precisions):.2f}")


📈 Average Precision@5: 0.00


6. Try Recommendations for a Sample User

In [11]:
# Optional: Try it for one user
sample_user = test_users[0]
print(f"🔍 Recommendations for User {sample_user}:")
for title in recommend_movies(sample_user):
    print("✔️", title)


🔍 Recommendations for User 783:
✔️ Titanic (1997)
✔️ L.A. Confidential (1997)
✔️ Saint, The (1997)
✔️ Good Will Hunting (1997)
✔️ Star Wars (1977)


7. Build Streamlit App Interface for Recommendations

In [12]:
%%writefile movie_recommender_app.py
import streamlit as st
import pandas as pd
import numpy as np
import kagglehub
from sklearn.metrics.pairwise import cosine_similarity

st.set_page_config(page_title="🎬 Movie Recommender", layout="centered")

st.title("🎥 Movie Recommendation System")
st.markdown("Recommend movies to a user based on user similarity (Collaborative Filtering).")

# 📦 Download MovieLens 100K dataset
@st.cache_data
def load_data():
    path = kagglehub.dataset_download("prajitdatta/movielens-100k-dataset")
    base_path = path + "/ml-100k"
    ratings_path = base_path + "/u.data"
    movies_path = base_path + "/u.item"

    ratings = pd.read_csv(ratings_path, sep="\t", names=["user_id", "item_id", "rating", "timestamp"])
    movies = pd.read_csv(movies_path, sep="|", encoding="latin-1", header=None, usecols=[0, 1], names=["item_id", "title"])
    return ratings, movies

ratings, movies = load_data()

# 📊 Create User-Item Matrix
user_item_matrix = ratings.pivot_table(index="user_id", columns="item_id", values="rating").fillna(0)

# 🔍 Recommendation Function
def recommend_movies(user_id, k=5):
    if user_id not in user_item_matrix.index:
        return []

    user_vector = user_item_matrix.loc[[user_id]]
    similarities = cosine_similarity(user_vector, user_item_matrix)[0]

    # Filter weak similarities
    similarity_threshold = 0.1
    similarities[similarities < similarity_threshold] = 0

    weighted_ratings = np.dot(similarities, user_item_matrix.values)
    sim_sum = np.array([similarities.sum()] * user_item_matrix.shape[1])
    scores = weighted_ratings / np.where(sim_sum == 0, 1e-8, sim_sum)

    user_rated_items = set(user_item_matrix.loc[user_id][user_item_matrix.loc[user_id] > 0].index)
    unseen_items = [i for i in user_item_matrix.columns if i not in user_rated_items]

    movie_scores = [(item, scores[idx]) for idx, item in enumerate(user_item_matrix.columns) if item in unseen_items]
    top_k = sorted(movie_scores, key=lambda x: x[1], reverse=True)[:k]

    return [movies[movies["item_id"] == item]["title"].values[0] for item, _ in top_k]

# 🎛️ User Input
user_ids = sorted(ratings["user_id"].unique())
selected_user = st.selectbox("Select a User ID:", user_ids)
top_k = st.slider("How many movies to recommend?", 1, 20, 5)

# 🎬 Show Recommendations
if st.button("Get Recommendations"):
    recommendations = recommend_movies(selected_user, k=top_k)
    if recommendations:
        st.success("Top Recommended Movies:")
        for idx, title in enumerate(recommendations, start=1):
            st.write(f"{idx}. {title}")
    else:
        st.warning("No recommendations available.")


Writing movie_recommender_app.py


8. Install Dependencies and Authenticate Ngrok

In [17]:
# Install required libraries
!pip install -q streamlit kagglehub pyngrok

# Authenticate with your Ngrok token (replace with your real token!)
!ngrok config add-authtoken YOUR_TOKEN_HERE


Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


9. Launch Streamlit App with Ngrok Tunnel

In [18]:
# Launch Streamlit app in background
!streamlit run movie_recommender_app.py &>/content/logs.txt &

# Create a tunnel using options dictionary (recommended for latest ngrok)
from pyngrok import ngrok

ngrok.kill()  # Kill previous tunnels
public_url = ngrok.connect(addr=8501, proto="http")  # Specify addr explicitly
print("🌍 App is live at:", public_url)


🌍 App is live at: NgrokTunnel: "https://b481305b25b1.ngrok-free.app" -> "http://localhost:8501"
