In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
!pip install fastfm
from fastFM import mcmc, als
import warnings
from scipy import sparse
from scipy.sparse import vstack

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')

def load_data():
    """Load and merge MovieLens 100K data"""
    print("Loading data...")

    # Load ratings data
    ratings_url = "http://files.grouplens.org/datasets/movielens/ml-100k/u.data"
    column_names = ['user_id', 'item_id', 'rating', 'timestamp']
    ratings = pd.read_csv(ratings_url, sep='\t', names=column_names)

    # Load user information
    users_url = "http://files.grouplens.org/datasets/movielens/ml-100k/u.user"
    users = pd.read_csv(users_url, sep='|',
                       names=['user_id', 'age', 'gender', 'occupation', 'zip_code'])

    # Load movie information
    movies_url = "http://files.grouplens.org/datasets/movielens/ml-100k/u.item"
    movies = pd.read_csv(movies_url, sep='|', encoding='latin-1',
                        names=['item_id', 'title', 'release_date', 'video_release_date',
                              'IMDb_URL', 'unknown', 'Action', 'Adventure', 'Animation',
                              'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
                              'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi',
                              'Thriller', 'War', 'Western'])

    # Merge all data
    data = pd.merge(ratings, users, on='user_id')
    data = pd.merge(data, movies, on='item_id')

    return data, users, movies

def preprocess_data(data):
    """Preprocess and encode categorical features"""
    print("Preprocessing data...")

    # Initialize encoders
    user_encoder = LabelEncoder()
    item_encoder = LabelEncoder()
    occupation_encoder = LabelEncoder()
    gender_encoder = LabelEncoder()

    # Encode categorical variables
    data['user_id_encoded'] = user_encoder.fit_transform(data['user_id'])
    data['item_id_encoded'] = item_encoder.fit_transform(data['item_id'])
    data['occupation_encoded'] = occupation_encoder.fit_transform(data['occupation'])
    data['gender_encoded'] = gender_encoder.fit_transform(data['gender'])

    return data, user_encoder, item_encoder, occupation_encoder, gender_encoder

def create_feature_matrix(data, genre_columns):
    """Create feature matrix with basic and genre features"""
    print("Creating feature matrix...")

    # Basic features
    basic_features = ['user_id_encoded', 'item_id_encoded', 'age', 'gender_encoded', 'occupation_encoded']

    # Add genre features
    genre_features = data[genre_columns].values

    # Combine all features
    from scipy import sparse
    # Use sparse.hstack to create a sparse matrix instead of np.hstack
    full_features = sparse.hstack([sparse.csr_matrix(data[basic_features].values), sparse.csr_matrix(genre_features)])

    return full_features

def train_fm_model(X_train, y_train, method='mcmc'):
    """Train Factorization Machine model"""
    print(f"Training FM model using {method.upper()}...")

    if method == 'mcmc':
        # Instead of mcmc.FMRegression, try using als.FMRegression
        fm = als.FMRegression(n_iter=100, rank=8, init_stdev=0.1, random_state=42)
    else:  # als
        fm = als.FMRegression(n_iter=100, rank=8, init_stdev=0.1, random_state=42)

    # Convert y_train to a NumPy array with dtype=np.float64
    y_train = np.asarray(y_train, dtype=np.float64)

    fm.fit(X_train, y_train)
    return fm

def evaluate_model(model, X_test, y_test):
    """Evaluate model performance"""
    y_pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print(f"Test RMSE: {rmse:.4f}")
    return rmse

def recommend_movies(model, user_id, movies, users, user_encoder, item_encoder,
                    gender_encoder, occupation_encoder, genre_columns, top_n=10):
    """Generate movie recommendations for a given user"""
    print(f"\nGenerating recommendations for user {user_id}...")

    # Get user info
    user_info = users[users['user_id'] == user_id].iloc[0]

    # Encode user features
    user_encoded = user_encoder.transform([user_id])[0]
    age = user_info['age']
    gender_encoded = gender_encoder.transform([user_info['gender']])[0]
    occupation_encoded = occupation_encoder.transform([user_info['occupation']])[0]

    # Prepare base user features
    user_features = np.array([user_encoded, 0, age, gender_encoded, occupation_encoded])

       # Create feature matrix for all movies
    all_features = []
    for _, movie in movies.iterrows():
        item_encoded = item_encoder.transform([movie['item_id']])[0]
        # Convert genre_vector to numeric type before creating sparse matrix
        genre_vector = movie[genre_columns].astype(int).values

        # Create feature vector
        # Use sparse matrices for user_features and genre_vector
        feature_vector = sparse.hstack([
            sparse.csr_matrix(user_features[:1]),  # user_id_encoded
            sparse.csr_matrix([item_encoded]),     # item_id_encoded
            sparse.csr_matrix(user_features[2:]),  # age, gender_encoded, occupation_encoded
            sparse.csr_matrix(genre_vector)       # genre features
        ])
        all_features.append(feature_vector)

    # Stack sparse matrices vertically

    all_features = vstack(all_features)

    # Predict ratings
    pred_ratings = model.predict(all_features)

    # Get top N recommendations
    top_indices = np.argsort(pred_ratings)[-top_n:][::-1]
    recommendations = movies.iloc[top_indices][['item_id', 'title']].copy()
    recommendations['predicted_rating'] = pred_ratings[top_indices]

    return recommendations

def tune_hyperparameters(X_train, y_train):
    """Tune FM hyperparameters using grid search"""
    print("\nTuning hyperparameters...")

    param_grid = {
        'rank': [4, 8, 12],
        'n_iter': [50, 100, 150]
    }

    fm_als = als.FMRegression(init_stdev=0.1, random_state=42)
    grid_search = GridSearchCV(fm_als, param_grid, cv=3, scoring='neg_mean_squared_error')
    grid_search.fit(X_train, y_train)

    print("Best parameters:", grid_search.best_params_)
    return grid_search.best_estimator_

def main():
    # Load and prepare data
    data, users, movies = load_data()
    data, user_encoder, item_encoder, occupation_encoder, gender_encoder = preprocess_data(data)

    # Get genre columns (last 19 columns in movies dataframe)
    genre_columns = movies.columns[-19:]

    # Create feature matrix
    X = create_feature_matrix(data, genre_columns)
    y = data['rating'].values

    # Split into train and test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train initial model
    fm_model = train_fm_model(X_train, y_train, method='mcmc')
    evaluate_model(fm_model, X_test, y_test)

    # Generate recommendations for a sample user
    sample_user = 123
    recs = recommend_movies(fm_model, sample_user, movies, users, user_encoder,
                           item_encoder, gender_encoder, occupation_encoder, genre_columns)
    print("\nTop 10 Recommendations:")
    print(recs[['title', 'predicted_rating']].to_string(index=False))

if __name__ == "__main__":
    main()

Loading data...
Preprocessing data...
Creating feature matrix...
Training FM model using MCMC...
Test RMSE: 208.7442

Generating recommendations for user 123...

Top 10 Recommendations:
                                                 title  predicted_rating
                                    Kansas City (1996)        274.701222
                                       Bad Boys (1995)        273.612734
                            Sound of Music, The (1965)        252.242825
                                     Four Rooms (1995)        226.142867
                                         Brazil (1985)        223.951956
                            Clockwork Orange, A (1971)        221.445814
Wallace & Gromit: The Best of Aardman Animation (1996)        197.392368
                                  Dirty Dancing (1987)        181.531719
                                     Unforgiven (1992)        168.375829
                                   Blade Runner (1982)        159.646153
