<a href="https://colab.research.google.com/github/FatemeFazlali/RecommenderSystemtryout1/blob/main/FedshiMachineLearningAssignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
# Mount Google Drive and set up paths
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import numpy as np
import tensorflow as tf
from pathlib import Path
import os

# Set up paths
BASE = Path("/content/drive/MyDrive/Fedshi_Machine_Learning")
DATA_DIR = BASE / "data"
BOOKS_FILE = DATA_DIR / "Books.csv"
USERS_FILE = DATA_DIR / "Users.csv"
RATINGS_FILE = DATA_DIR / "Ratings.csv"

# Create processed data directory
PROCESSED_DIR = BASE / "processed_data"
MODELS_DIR = BASE / "models"
os.makedirs(PROCESSED_DIR, exist_ok=True)
os.makedirs(MODELS_DIR, exist_ok=True)

# Load data
books_df = pd.read_csv(BOOKS_FILE)
users_df = pd.read_csv(USERS_FILE)
ratings_df = pd.read_csv(RATINGS_FILE)

print(f"Books shape: {books_df.shape}")
print(f"Users shape: {users_df.shape}")
print(f"Ratings shape: {ratings_df.shape}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


  books_df = pd.read_csv(BOOKS_FILE)


Books shape: (271360, 8)
Users shape: (278858, 3)
Ratings shape: (1149780, 3)


In [15]:
# Enhanced preprocessing functions
def preprocess_books(books_df):
    # Handle missing values
    books_df = books_df.dropna(subset=['ISBN', 'Book-Title', 'Book-Author'])

    # Clean year of publication
    books_df['Year-Of-Publication'] = pd.to_numeric(books_df['Year-Of-Publication'], errors='coerce')
    books_df = books_df[books_df['Year-Of-Publication'].between(1900, 2023)]

    # Extract publisher information
    publisher_counts = books_df['Publisher'].value_counts()
    books_df['Publisher'] = books_df['Publisher'].apply(
        lambda x: x if publisher_counts.get(x, 0) > 10 else 'Other'
    )

    # Create item index
    books_df['item_index'] = books_df.reset_index().index

    # Create publisher codes
    unique_publishers = books_df['Publisher'].unique()
    publisher_map = {publisher: i for i, publisher in enumerate(unique_publishers)}
    books_df['publisher_code'] = books_df['Publisher'].map(publisher_map)

    # Normalize year
    books_df['year_normalized'] = (books_df['Year-Of-Publication'] - 1900) / (2023 - 1900)

    return books_df

def preprocess_users(users_df):
    # Handle missing ages
    users_df['Age'] = users_df['Age'].fillna(users_df['Age'].median())
    users_df['Age'] = users_df['Age'].clip(5, 90)

    # Extract location components
    users_df[['City', 'State', 'Country']] = users_df['Location'].str.split(',', expand=True).iloc[:, :3]
    users_df['Country'] = users_df['Country'].str.strip().fillna('Unknown')

    # One-hot encode country
    top_countries = users_df['Country'].value_counts().head(10).index
    users_df['Country'] = users_df['Country'].apply(lambda x: x if x in top_countries else 'Other')

    # Create user index
    users_df['user_index'] = users_df.reset_index().index

    # Create country codes
    unique_countries = users_df['Country'].unique()
    country_map = {country: i for i, country in enumerate(unique_countries)}
    users_df['country_code'] = users_df['Country'].map(country_map)

    return users_df

def preprocess_ratings(ratings_df, books_df, users_df):
    # Filter ratings for existing books and users
    ratings_df = ratings_df[ratings_df['ISBN'].isin(books_df['ISBN'])]
    ratings_df = ratings_df[ratings_df['User-ID'].isin(users_df['User-ID'])]

    # Convert ratings to implicit feedback (1 if rating > 0, else 0)
    ratings_df['Interacted'] = (ratings_df['Book-Rating'] > 0).astype(int)

    # Map user and item IDs to indices
    user_id_to_index = {user_id: idx for idx, user_id in enumerate(users_df['User-ID'])}
    item_id_to_index = {isbn: idx for idx, isbn in enumerate(books_df['ISBN'])}

    ratings_df['user_index'] = ratings_df['User-ID'].map(user_id_to_index)
    ratings_df['item_index'] = ratings_df['ISBN'].map(item_id_to_index)

    return ratings_df

# Apply preprocessing
books_df = preprocess_books(books_df)
users_df = preprocess_users(users_df)
ratings_df = preprocess_ratings(ratings_df, books_df, users_df)

# Save processed data
books_df.to_csv(PROCESSED_DIR / 'books_processed.csv', index=False)
users_df.to_csv(PROCESSED_DIR / 'users_processed.csv', index=False)
ratings_df.to_csv(PROCESSED_DIR / 'ratings_processed.csv', index=False)

print("Data preprocessing completed and saved!")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  books_df['Year-Of-Publication'] = pd.to_numeric(books_df['Year-Of-Publication'], errors='coerce')


Data preprocessing completed and saved!


In [16]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Concatenate, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

def build_hybrid_recommendation_model(n_users, n_items, n_countries, n_publishers, embedding_dim=50):
    # User inputs
    user_input = Input(shape=(1,), name='user_input')
    user_embedding = Embedding(n_users, embedding_dim, name='user_embedding')(user_input)
    user_flatten = Flatten()(user_embedding)

    # User metadata
    country_input = Input(shape=(1,), name='country_input')
    country_embedding = Embedding(n_countries, 10, name='country_embedding')(country_input)
    country_flatten = Flatten()(country_embedding)

    age_input = Input(shape=(1,), name='age_input')
    age_dense = Dense(5, activation='relu')(age_input)

    # Item inputs
    item_input = Input(shape=(1,), name='item_input')
    item_embedding = Embedding(n_items, embedding_dim, name='item_embedding')(item_input)
    item_flatten = Flatten()(item_embedding)

    # Item metadata
    publisher_input = Input(shape=(1,), name='publisher_input')
    publisher_embedding = Embedding(n_publishers, 10, name='publisher_embedding')(publisher_input)
    publisher_flatten = Flatten()(publisher_embedding)

    year_input = Input(shape=(1,), name='year_input')
    year_dense = Dense(5, activation='relu')(year_input)

    # Concatenate all features
    user_features = Concatenate()([user_flatten, country_flatten, age_dense])
    item_features = Concatenate()([item_flatten, publisher_flatten, year_dense])

    # Dot product of user and item features
    dot_product = tf.keras.layers.Dot(axes=1)([user_features, item_features])

    # Final prediction
    output = Dense(1, activation='sigmoid', name='output')(dot_product)

    # Build model
    model = Model(
        inputs=[user_input, country_input, age_input, item_input, publisher_input, year_input],
        outputs=output
    )

    return model

# Prepare data for training - FIXED VERSION
# Create mapping dictionaries
user_id_to_index = {user_id: idx for idx, user_id in enumerate(users_df['User-ID'])}
isbn_to_index = {isbn: idx for idx, isbn in enumerate(books_df['ISBN'])}

# Map ratings to indices
ratings_df['user_index'] = ratings_df['User-ID'].map(user_id_to_index)
ratings_df['item_index'] = ratings_df['ISBN'].map(isbn_to_index)

# Get user features
user_features = users_df.set_index('User-ID')
ratings_df['country_code'] = ratings_df['User-ID'].map(user_features['country_code'])
ratings_df['age'] = ratings_df['User-ID'].map(user_features['Age'])

# Get book features
book_features = books_df.set_index('ISBN')
ratings_df['publisher_code'] = ratings_df['ISBN'].map(book_features['publisher_code'])
ratings_df['year_normalized'] = ratings_df['ISBN'].map(book_features['year_normalized'])

# Prepare input arrays
user_indices = ratings_df['user_index'].values
item_indices = ratings_df['item_index'].values
country_codes = ratings_df['country_code'].values
ages = ratings_df['age'].values
publisher_codes = ratings_df['publisher_code'].values
years = ratings_df['year_normalized'].values
labels = ratings_df['Interacted'].values

n_users = len(users_df)
n_items = len(books_df)
n_countries = len(users_df['country_code'].unique())
n_publishers = len(books_df['publisher_code'].unique())

# Build and compile model
model = build_hybrid_recommendation_model(n_users, n_items, n_countries, n_publishers)
model.compile(optimizer=Adam(0.001), loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(
    [user_indices, country_codes, ages, item_indices, publisher_codes, years],
    labels,
    batch_size=1024,
    epochs=10,
    validation_split=0.2,
    verbose=1
)

# Save the model
model.save(MODELS_DIR / 'recommendation_model.h5')
print("Model training completed and saved!")

Epoch 1/10
[1m795/795[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m305s[0m 381ms/step - accuracy: 0.5827 - loss: 3.3288 - val_accuracy: 0.6137 - val_loss: 0.6684
Epoch 2/10
[1m795/795[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m325s[0m 385ms/step - accuracy: 0.6349 - loss: 0.6453 - val_accuracy: 0.6309 - val_loss: 0.6519
Epoch 3/10
[1m795/795[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m324s[0m 388ms/step - accuracy: 0.8104 - loss: 0.4857 - val_accuracy: 0.6187 - val_loss: 0.6582
Epoch 4/10
[1m795/795[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m311s[0m 374ms/step - accuracy: 0.9448 - loss: 0.3137 - val_accuracy: 0.6009 - val_loss: 0.6655
Epoch 5/10
[1m795/795[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m310s[0m 389ms/step - accuracy: 0.9743 - loss: 0.1922 - val_accuracy: 0.5819 - val_loss: 0.6715
Epoch 6/10
[1m795/795[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m296s[0m 373ms/step - accuracy: 0.9854 - loss: 0.1131 - val_accuracy: 0.5842 - val_loss: 0.6704
Epoc



Model training completed and saved!


In [17]:
%%writefile model_utils.py
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

def get_recommendations(user_id, model, books_df, users_df, ratings_df, top_n=10):
    """
    Generate personalized book recommendations for a given user.
    Includes cold start handling for new users.
    """
    try:
        # Check if user exists
        user_data = users_df[users_df['User-ID'] == user_id]

        if user_data.empty:
            # Cold start: user doesn't exist, use popular items
            return handle_cold_start_user(books_df, ratings_df, top_n)

        # Prepare user features for prediction
        user_features = prepare_user_features(user_data)

        # Get all books
        book_features = prepare_book_features(books_df)

        # Generate predictions for all books
        predictions = []
        batch_size = 1000

        for i in range(0, len(book_features), batch_size):
            batch_books = book_features[i:i+batch_size]

            # Create input arrays
            user_input_batch = np.repeat(user_features['user_index'], len(batch_books))
            country_input_batch = np.repeat(user_features['country_code'], len(batch_books))
            age_input_batch = np.repeat(user_features['age_normalized'], len(batch_books))

            item_input_batch = batch_books['item_index'].values
            publisher_input_batch = batch_books['publisher_code'].values
            year_input_batch = batch_books['year_normalized'].values

            # Make predictions
            batch_pred = model.predict([
                user_input_batch,
                country_input_batch,
                age_input_batch,
                item_input_batch,
                publisher_input_batch,
                year_input_batch
            ], verbose=0)

            # Store predictions with book IDs
            for j, pred in enumerate(batch_pred):
                predictions.append({
                    'ISBN': batch_books.iloc[j]['ISBN'],
                    'prediction': pred[0]
                })

        # Sort by prediction score and get top N
        predictions_df = pd.DataFrame(predictions)
        top_books = predictions_df.sort_values('prediction', ascending=False).head(top_n)

        # Merge with book details
        recommendations = top_books.merge(
            books_df[['ISBN', 'Book-Title', 'Book-Author', 'Publisher', 'Year-Of-Publication']],
            on='ISBN',
            how='left'
        )

        return recommendations[['ISBN', 'Book-Title', 'Book-Author', 'Publisher', 'Year-Of-Publication']]

    except Exception as e:
        print(f"Error in get_recommendations: {e}")
        return handle_cold_start_user(books_df, ratings_df, top_n)

def get_similar_items(book_title, model, books_df, top_n=10):
    """
    Find books similar to a given book based on content features.
    Includes cold start handling for new items.
    """
    try:
        # Find the target book
        target_book = books_df[books_df['Book-Title'] == book_title]
        if target_book.empty:
            # Cold start: book doesn't exist, use content-based similarity
            return handle_cold_start_item(book_title, books_df, top_n)

        # Get all books
        book_features = prepare_book_features(books_df)
        target_features = prepare_book_features(target_book)

        # Extract item embeddings from the model
        item_embedding_model = tf.keras.Model(
            inputs=model.input[3],  # item_input
            outputs=model.get_layer('item_embedding').output
        )

        # Get embeddings for all books
        all_embeddings = item_embedding_model.predict(
            book_features['item_index'].values,
            verbose=0
        )

        # Get embedding for target book
        target_embedding = item_embedding_model.predict(
            target_features['item_index'].values,
            verbose=0
        )

        # Calculate cosine similarity
        similarities = cosine_similarity(target_embedding, all_embeddings)[0]

        # Create results DataFrame
        results = books_df.copy()
        results['similarity'] = similarities

        # Sort by similarity and get top N (excluding the target book)
        similar_books = results[results['Book-Title'] != book_title]
        similar_books = similar_books.sort_values('similarity', ascending=False).head(top_n)

        return similar_books[['ISBN', 'Book-Title', 'Book-Author', 'Publisher', 'Year-Of-Publication']]

    except Exception as e:
        print(f"Error in get_similar_items: {e}")
        return handle_cold_start_item(book_title, books_df, top_n)

def handle_cold_start_user(books_df, ratings_df, top_n=10):
    """
    Handle cold start for new users by recommending popular items.
    """
    # Calculate popularity based on number of ratings
    popularity = ratings_df['ISBN'].value_counts().reset_index()
    popularity.columns = ['ISBN', 'count']

    # Get top N popular books
    popular_books = popularity.head(top_n).merge(
        books_df[['ISBN', 'Book-Title', 'Book-Author', 'Publisher', 'Year-Of-Publication']],
        on='ISBN',
        how='left'
    )

    return popular_books[['ISBN', 'Book-Title', 'Book-Author', 'Publisher', 'Year-Of-Publication']]

def handle_cold_start_item(book_title, books_df, top_n=10):
    """
    Handle cold start for new items using content-based similarity.
    """
    # Create TF-IDF matrix of book titles and authors
    tfidf = TfidfVectorizer(stop_words='english')
    books_df['content'] = books_df['Book-Title'] + ' ' + books_df['Book-Author'] + ' ' + books_df['Publisher'].fillna('')
    tfidf_matrix = tfidf.fit_transform(books_df['content'])

    # Calculate cosine similarity
    cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

    # Get the index of the book
    indices = pd.Series(books_df.index, index=books_df['Book-Title']).drop_duplicates()

    if book_title not in indices:
        # Book not found, return popular books
        return books_df[['ISBN', 'Book-Title', 'Book-Author', 'Publisher', 'Year-Of-Publication']].head(top_n)

    idx = indices[book_title]

    # Get pairwise similarity scores
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort books based on similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get scores of the top_n most similar books (skip the first as it's the same book)
    sim_scores = sim_scores[1:top_n+1]

    # Get book indices
    book_indices = [i[0] for i in sim_scores]

    # Return top N most similar books
    return books_df.iloc[book_indices][['ISBN', 'Book-Title', 'Book-Author', 'Publisher', 'Year-Of-Publication']]

def prepare_user_features(user_data):
    """
    Prepare user features for model input.
    """
    # Normalize age (assuming age range 5-90 as in preprocessing)
    age_normalized = (user_data['Age'].values[0] - 5) / (90 - 5)

    # Get user index
    user_index = user_data['user_index'].values[0]

    # Get country code
    country_code = user_data['country_code'].values[0]

    return {
        'user_index': user_index,
        'country_code': country_code,
        'age_normalized': age_normalized
    }

def prepare_book_features(books_data):
    """
    Prepare book features for model input.
    """
    # Create a copy to avoid modifying the original
    books_copy = books_data.copy()

    # Ensure required columns exist
    if 'item_index' not in books_copy.columns:
        books_copy['item_index'] = range(len(books_copy))

    if 'publisher_code' not in books_copy.columns:
        unique_publishers = books_copy['Publisher'].unique()
        publisher_map = {publisher: i for i, publisher in enumerate(unique_publishers)}
        books_copy['publisher_code'] = books_copy['Publisher'].map(publisher_map)

    if 'year_normalized' not in books_copy.columns:
        books_copy['year_normalized'] = (books_copy['Year-Of-Publication'] - 1900) / (2023 - 1900)

    return books_copy

Overwriting model_utils.py


In [19]:
%%writefile app.py
import streamlit as st
import pandas as pd
import numpy as np
import tensorflow as tf
from pathlib import Path
import os
from model_utils import get_recommendations, get_similar_items

# Set up paths
BASE = Path("/content/drive/MyDrive/Fedshi_Machine_Learning")
PROCESSED_DIR = BASE / "processed_data"
MODELS_DIR = BASE / "models"

# Load data and model
@st.cache_data
def load_data():
    books = pd.read_csv(PROCESSED_DIR / 'books_processed.csv')
    users = pd.read_csv(PROCESSED_DIR / 'users_processed.csv')
    ratings = pd.read_csv(PROCESSED_DIR / 'ratings_processed.csv')
    model = tf.keras.models.load_model(MODELS_DIR / 'recommendation_model.h5')
    return books, users, ratings, model

books, users, ratings, model = load_data()

st.title("Book Recommendation System")
st.write("This system provides personalized book recommendations using a hybrid approach combining collaborative filtering and content-based features.")

option = st.sidebar.selectbox("Select Recommendation Type",
                             ["User-Based Recommendations", "Item-Based Similarity"])

if option == "User-Based Recommendations":
    st.header("Personalized Recommendations")
    user_id = st.selectbox("Select User ID", users['User-ID'].head(1000).tolist())

    if st.button("Get Recommendations"):
        with st.spinner('Generating recommendations...'):
            recommendations = get_recommendations(user_id, model, books, users, ratings)

        st.success("Top Recommendations:")
        for i, row in recommendations.iterrows():
            st.write(f"{i+1}. **{row['Book-Title']}** by {row['Book-Author']}")
            st.write(f"   Publisher: {row['Publisher']}, Year: {row['Year-Of-Publication']}")

else:
    st.header("Find Similar Books")
    book_title = st.selectbox("Select Book", books['Book-Title'].head(1000).tolist())

    if st.button("Find Similar Items"):
        with st.spinner('Finding similar books...'):
            similar_items = get_similar_items(book_title, model, books)

        st.success("Similar Books:")
        for i, row in similar_items.iterrows():
            st.write(f"{i+1}. **{row['Book-Title']}** by {row['Book-Author']}")
            st.write(f"   Publisher: {row['Publisher']}, Year: {row['Year-Of-Publication']}")

# Add metrics section
st.sidebar.header("Metrics")
st.sidebar.write("""
**Metrics Tracked:**
- Precision@K
- Recall@K
- NDCG@K
- Coverage
- Personalization
- Response Time

These metrics evaluate both recommendation quality and system performance.
""")

Writing app.py


In [None]:
# Install required packages
!pip install streamlit scikit-learn

# Run the Streamlit app
!streamlit run app.py & npx localtunnel --port 8501


Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[1G[0K⠙[1G[0K[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8502[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8502[0m
[34m  External URL: [0m[1mhttp://34.169.223.36:8502[0m
[0m
your url is: https://cruel-items-find.loca.lt
2025-09-18 11:37:00.282442: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1758195420.335547   25927 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1758195420.352311   25927 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1758195420.406787   259

In [24]:
!curl https://loca.lt/mytunnelpassword

34.169.223.36

In [23]:
from pyngrok import ngrok
import time
import threading

# Kill existing tunnels
ngrok.kill()

# Start Streamlit in a thread or background
def run_streamlit():
    !streamlit run app.py --server.port 8501

thread = threading.Thread(target=run_streamlit)
thread.start()

# Wait for Streamlit to start
time.sleep(5)

# Open a tunnel
public_url = ngrok.connect(8501, bind_tls=True)
print("Streamlit App URL:", public_url)


Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.169.223.36:8501[0m
[0m


ERROR:pyngrok.process.ngrok:t=2025-09-18T11:30:33+0000 lvl=eror msg="failed to reconnect session" obj=tunnels.session err="authentication failed: Usage of ngrok requires a verified account and authtoken.\n\nSign up for an account: https://dashboard.ngrok.com/signup\nInstall your authtoken: https://dashboard.ngrok.com/get-started/your-authtoken\r\n\r\nERR_NGROK_4018\r\n"
ERROR:pyngrok.process.ngrok:t=2025-09-18T11:30:33+0000 lvl=eror msg="session closing" obj=tunnels.session err="authentication failed: Usage of ngrok requires a verified account and authtoken.\n\nSign up for an account: https://dashboard.ngrok.com/signup\nInstall your authtoken: https://dashboard.ngrok.com/get-started/your-authtoken\r\n\r\nERR_NGROK_4018\r\n"


PyngrokNgrokError: The ngrok process errored on start: authentication failed: Usage of ngrok requires a verified account and authtoken.\n\nSign up for an account: https://dashboard.ngrok.com/signup\nInstall your authtoken: https://dashboard.ngrok.com/get-started/your-authtoken\r\n\r\nERR_NGROK_4018\r\n.