In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
import joblib
import gc

In [None]:
# 1. Load data (optionally sample if too large)
# Tunable inputs for this cell, gathered here so a reader can change them in
# one place instead of editing or uncommenting code.
DATA_PATH = 'kindle_data-v2.csv'
SAMPLE_SIZE = None  # e.g. 20000 to work with a subset; None keeps every row
RANDOM_SEED = 42    # fixes which rows are drawn so re-runs use the same subset

df = pd.read_csv(DATA_PATH)
if SAMPLE_SIZE is not None and SAMPLE_SIZE < len(df):
    # Draw a reproducible subset, then renumber the rows so that index labels
    # equal row positions again after sampling.
    df = df.sample(n=SAMPLE_SIZE, random_state=RANDOM_SEED).reset_index(drop=True)

In [None]:
# Show which columns the loaded dataset provides
print("Available columns:", list(df.columns))

Available columns: ['asin', 'title', 'author', 'soldBy', 'imgUrl', 'productURL', 'stars', 'reviews', 'price', 'isKindleUnlimited', 'category_id', 'isBestSeller', 'isEditorsPick', 'isGoodReadsChoice', 'publishedDate', 'category_name']


In [None]:
# 2. Preprocessing
# Blank out missing values only in the text fields that feed `content`.
# A frame-wide fillna('') would also write empty strings into numeric
# columns that have gaps, turning them into mixed object columns.
text_columns = ['title', 'author', 'category_name']
df[text_columns] = df[text_columns].fillna('').astype(str)

# Single space-separated text field per row, used later for vectorization.
df['content'] = df['title'] + ' ' + df['author'] + ' ' + df['category_name']

In [53]:
# 3. Configure the TF-IDF vectorizer (nothing is fitted in this cell)
#    - unigrams and bigrams are both extracted
#    - the vocabulary is capped at 10000 terms
#    - the built-in English stop-word list is applied
tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=10000, stop_words='english')

In [57]:
# Collect the candidate text columns that are actually present in the frame
candidate_names = ('title', 'author', 'category_name', 'description')
content_parts = [df[name] for name in candidate_names if name in df.columns]

if len(content_parts) == 0:
    raise ValueError("No valid columns found to create content")

# Join the collected columns, cast to str, with single spaces between them
first_part, *remaining_parts = content_parts
combined = df[first_part.name].astype(str)
for part in remaining_parts:
    combined = combined + ' ' + part.astype(str)
df['content'] = combined

In [59]:
# 3. Preprocessing - create content from available columns
# Candidate text fields, in the order they are concatenated into `content`;
# any that the dataset does not contain are skipped.
available_columns = [col for col in ['title', 'author', 'category_name', 'description'] if col in df.columns]

if not available_columns:
    raise ValueError("No valid columns found to create content")

# Fill gaps only in the text columns that are combined below. A frame-wide
# fillna('') would also push empty strings into numeric columns with missing
# values and silently change them to mixed object columns.
df[available_columns] = df[available_columns].fillna('').astype(str)

# Build the space-separated text field one column at a time
combined = df[available_columns[0]]
for col in available_columns[1:]:
    combined = combined + ' ' + df[col]
df['content'] = combined

# Verify
print("Sample combined content:")
print(df['content'].head(3))


Sample combined content:
0    Adult Children of Emotionally Immature Parents...
1    From Strength to Strength: Finding Success, Ha...
2    Good Inside: A Guide to Becoming the Parent Yo...
Name: content, dtype: object


In [61]:
# Work out which of the candidate text columns this dataset contains
candidate_columns = ('title', 'author', 'category_name', 'description')
available_text_columns = [name for name in candidate_columns if name in df.columns]

if len(available_text_columns) == 0:
    raise ValueError("No text columns available for recommendations")

# Accumulate the columns (as strings) into one space-separated field per row
combined_text = None
for column_name in available_text_columns:
    column_text = df[column_name].astype(str)
    if combined_text is None:
        combined_text = column_text
    else:
        combined_text = combined_text + ' ' + column_text
df['content'] = combined_text

In [71]:
# 4. Fit the TF-IDF vectorizer on the combined text and transform it
print("\nCreating TF-IDF features...")

# Hand the vectorizer a plain Python list built from the `content` column
text_data = df['content'].to_list()

# Unigrams + bigrams, English stop words removed, vocabulary capped at 10000
tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=10000, stop_words='english')
tfidf_matrix = tfidf.fit_transform(text_data)




Creating TF-IDF features...


In [72]:
# 5. Fit the nearest-neighbour model on the TF-IDF features
print("Training recommendation model...")
nn = NearestNeighbors(
    algorithm='brute',   # exhaustive search
    metric='cosine',     # neighbours are ranked by cosine distance
    n_neighbors=11,      # 10 similar items plus the queried item itself
    n_jobs=-1,           # use every available processor
)
nn.fit(tfidf_matrix)

Training recommendation model...


In [75]:
# 6. Persist the fitted objects and the processed table to disk
fitted_objects = (
    (nn, 'recommendation_model.pkl'),
    (tfidf, 'tfidf_vectorizer.pkl'),
)
for fitted_object, target_file in fitted_objects:
    joblib.dump(fitted_object, target_file)

# The table is written without its index column
df.to_csv('processed_books.csv', index=False)

In [77]:
# Clean up
# Drop the helper text column from the in-memory frame. The membership check
# keeps the cell re-runnable: an unconditional `del` raises KeyError the
# second time the cell executes, because the column is already gone.
if 'content' in df.columns:
    del df['content']
gc.collect()

# Summary of the files written by the save step
print("\nSuccessfully saved:")
print("- recommendation_model.pkl (trained model)")
print("- tfidf_vectorizer.pkl (text processor)")
print("- processed_books.csv (cleaned data)")


Successfully saved:
- recommendation_model.pkl (trained model)
- tfidf_vectorizer.pkl (text processor)
- processed_books.csv (cleaned data)
