In [6]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import os
import pandas as pd
from IPython.display import clear_output
!pip install cupy-cuda11x



In [2]:
!gdown 1hUqu1mbFeTEfBvl-7fc56fHFfCSzIktD
!unzip -qq ml1m.zip -d ml1m

Downloading...
From: https://drive.google.com/uc?id=1hUqu1mbFeTEfBvl-7fc56fHFfCSzIktD
To: /content/ml1m.zip
100% 105M/105M [00:01<00:00, 99.8MB/s]


In [31]:
users = pd.read_csv('ml1m/content/dataset/users.dat', sep='::',
                        engine='python',
                        names=['userid', 'gender', 'age', 'occupation', 'zip']).set_index('userid')
ratings = pd.read_csv('ml1m/content/dataset/ratings.dat', engine='python',
                          sep='::', names=['userid', 'movieid', 'rating', 'timestamp'])
movies_train = pd.read_csv('ml1m/content/dataset/movies_train.dat', engine='python',
                         sep='::', names=['movieid', 'title', 'genre'], encoding='latin-1', index_col=False).set_index('movieid')
movies_test = pd.read_csv('ml1m/content/dataset/movies_test.dat', engine='python',
                         sep='::', names=['movieid', 'title', 'genre'], encoding='latin-1', index_col=False).set_index('movieid')
movies_train['genre'] = movies_train.genre.str.split('|')
movies_test['genre'] = movies_test.genre.str.split('|')

In [32]:
# Convert the columns to the appropriate type.
users.age = users.age.astype('category')
users.gender = users.gender.astype('category')
users.occupation = users.occupation.astype('category')
ratings.movieid = ratings.movieid.astype('category')
ratings.userid = ratings.userid.astype('category')

In [8]:
users_encoded = pd.get_dummies(users, columns=['gender', 'age', 'occupation'])
users_encoded = users_encoded.drop(columns="zip")

In [9]:
ratings_users = pd.merge(ratings, users_encoded, left_on='userid', right_index=True)
ratings_users.drop(columns=["timestamp"])

Unnamed: 0,userid,movieid,rating,gender_F,gender_M,age_1,age_18,age_25,age_35,age_45,...,occupation_11,occupation_12,occupation_13,occupation_14,occupation_15,occupation_16,occupation_17,occupation_18,occupation_19,occupation_20
0,1,1193,5,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,661,3,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,914,3,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,3408,4,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,2355,5,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1000204,6040,1091,1,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1000205,6040,1094,5,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1000206,6040,562,5,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1000207,6040,1096,4,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
one_hot_columns = users_encoded.columns.difference(['UserID'])

In [11]:
unique_genres = set()
for genres in movies_train['genre']:
    unique_genres.update(genres)
unique_genres = sorted(unique_genres)

In [12]:
# Change all ratings to be 1
ratings.rating = 1

In [13]:
# Pivot and create movie-user matrix
movie_to_user = ratings.pivot(index='movieid', columns='userid', values='rating').fillna(0)
movie_to_user

userid,1,2,3,4,5,6,7,8,9,10,...,6031,6032,6033,6034,6035,6036,6037,6038,6039,6040
movieid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3948,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3949,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3950,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3951,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
# Transform matrix to scipy sparse matrix
movie_to_user_sparse = csr_matrix(movie_to_user.values)
movie_to_user_sparse

<3706x6040 sparse matrix of type '<class 'numpy.float64'>'
	with 1000209 stored elements in Compressed Sparse Row format>

In [27]:
dataset_relative_path = 'ml1m/content/dataset'
images_directory_name = 'ml1m-images'
image_paths = os.path.join(dataset_relative_path, images_directory_name)

def extract_features(img_path, model):
    img_path = os.path.join(image_paths, img_path)
    img = image.load_img(img_path, target_size=(224, 224))
    img_array = image.img_to_array(img)
    img_array = np.expand_dims(img_array, axis=0)
    img_array = preprocess_input(img_array)
    features = model.predict(img_array)
    return features

In [16]:
knn_movie_model = NearestNeighbors(metric='cosine', algorithm='brute')
knn_movie_model.fit(movie_to_user_sparse)

In [17]:
def get_movie_name(movieid):
    try:
        return movies_train.loc[movieid].title
    except KeyError:
        return movies_test.loc[movieid].title

In [18]:
## function to find top n similar users of the given input user
def get_similar_movies_knn(movieid, n = 15):
    ## input to this function is the movieid and number of top similar movies you want.
    knn_input = np.asarray(movie_to_user.loc[movieid]).reshape(1,-1)
    n = min(n, movie_to_user.shape[0])
    distances, indices = knn_movie_model.kneighbors(knn_input, n_neighbors = n + 1)
    return distances, indices

In [19]:
import pandas as pd
import numpy as np

# Assuming 'movies_train' and 'movie_to_user' are already defined in your context

def create_genre_to_genre_data_knn():
    # Collect all unique genres
    unique_genres = set()
    for genres in movies_train['genre']:
        unique_genres.update(genres)
    unique_genres = sorted(unique_genres)

    # Mapping of genre to index
    genre_to_index = {genre: idx for idx, genre in enumerate(unique_genres)}

    all_movie_data = []
    all_movie_labels = []

    # Iterate over each movie in the dataset
    for movieid in movie_to_user.index:
        # Check if the current movie is in the movies_train dataset
        if movieid not in movies_train.index:
            continue

        for num_movies in range(1,20):
            # Find similar movies
            distance, indices = get_similar_movies_knn(movieid, num_movies)

            # Initialize a zero vector for genre counts
            genre_counts = [0] * len(unique_genres)


            for i in range(len(indices.flatten())):
                other_movie_id = movie_to_user.index[indices.flatten()[i]]
                # Check if the other movie is in the movies_train dataset
                if other_movie_id in movies_train.index:
                  # Use the inverse of distance as weight (add a small constant to avoid division by zero)
                  weight = distance.flatten()[i]

                  # Increment weighted count for each genre of the other movie
                  for genre in movies_train.loc[other_movie_id]['genre']:
                    genre_counts[genre_to_index[genre]] += 1 / (1 + np.exp(-weight))
            # Create one-hot encoded label for the current movie
            current_movie_label = [0] * len(unique_genres)
            for genre in movies_train.loc[movieid]['genre']:
                current_movie_label[genre_to_index[genre]] = 1

            # Append to the respective lists
            all_movie_data.append(genre_counts)
            all_movie_labels.append(current_movie_label)

    return all_movie_data, all_movie_labels

In [20]:
encoded_genre_data, encoded_genre_labels = create_genre_to_genre_data_knn()
# Convert the lists to numpy arrays
X = np.array(encoded_genre_data)
y = np.array(encoded_genre_labels)

In [21]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from sklearn.model_selection import train_test_split



def KNNSubmodel():
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Define a more complex model
    model = Sequential([
        Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
        BatchNormalization(),
        Dropout(0.3),
        Dense(64, activation='relu'),
        BatchNormalization(),
        Dropout(0.3),
        Dense(32, activation='relu'),
        Dense(y_train.shape[1], activation='sigmoid')
    ])

    # Compile the model
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    # Train the model
    from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

    # Define callbacks
    checkpoint_filepath = '/tmp/checkpoint'
    model_checkpoint_callback = ModelCheckpoint(
        filepath=checkpoint_filepath,
        save_weights_only=True,
        monitor='val_loss',
        mode='min',
        save_best_only=True)

    early_stopping_callback = EarlyStopping(
        monitor='val_loss',
        patience=5,  # Number of epochs with no improvement after which training will be stopped
        restore_best_weights=True)

    # Train the model with the callbacks
    history = model.fit(
        X_train, y_train,
        epochs=10,
        batch_size=32,
        validation_split=0.2,
        callbacks=[model_checkpoint_callback, early_stopping_callback])

    # The model's weights will now be in the state of the best epoch


    # Evaluate the model
    loss, accuracy = model.evaluate(X_test, y_test)
    print(f"Test Accuracy: {accuracy * 100:.2f}%")
    #model.save('KNNSubmodel.h5')

    return model

In [22]:
KNN_sub_model = KNNSubmodel()

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Accuracy: 68.09%


In [23]:
def genres_to_genres_KNN(encoded_genres, unique_genres):
    """
    Convert encoded genre vectors back to genre names.
    """
    genre_names = []
    for genre_vector in encoded_genres:
        genres = [unique_genres[idx] for idx, val in enumerate(genre_vector) if val > 0.5]  # Threshold of 0.5 for binary classification
        genre_names.append(genres)
    return genre_names

In [24]:
def predict_genres_KNN(movieid, n = 15):
    distances, indices = get_similar_movies_knn(movieid, n)
    genres = []
    for i in range(1, len(distances.flatten())):
        # Exclude movies in the test set
        if movie_to_user.index[indices.flatten()[i]] in movies_train.index:
            genres.extend(movies_train.loc[movie_to_user.index[indices.flatten()[i]]].genre)
    genres = pandas.Series(genres)
    threshold = genres.value_counts().max() / 2
    x = genres.value_counts()[genres.value_counts() > threshold].head(3).index.tolist()
    print(x)
    print(type(x))
    return x

In [25]:
def predict_genres_KNN_submodel(movieid, n = 10):

    # Mapping of genre to index
    genre_to_index = {genre: idx for idx, genre in enumerate(unique_genres)}

    # Find 15 similar movies
    distances, indices = get_similar_movies_knn(movieid, n)

    # Initialize a zero vector for genre counts
    genre_counts = [0] * len(unique_genres)

    for i in range(len(indices.flatten())):
        other_movie_id = movie_to_user.index[indices.flatten()[i]]
        # Check if the other movie is in the movies_train dataset
        if other_movie_id in movies_train.index:
            weight = distances.flatten()[i]
            # Increment weighted count for each genre of the other movie
            for genre in movies_train.loc[other_movie_id]['genre']:
                genre_counts[genre_to_index[genre]] += 1 / (1 + np.exp(-weight))

    movie_ratings = ratings_users[ratings_users['movieid'] == movieid]
     # Predict genres
    genre_counts = np.array(genre_counts).reshape(1, -1)
    #print(genre_counts)
    y_pred = KNN_sub_model.predict(genre_counts)
    #print(y_pred)
    # Convert predictions to genre names
    predicted_genres = genres_to_genres_KNN(y_pred, unique_genres)
    #print(predicted_genres)
    # Flatten the list of lists to a single list
    flattened_genres = [genre for sublist in predicted_genres for genre in sublist]

    return flattened_genres

In [28]:
id_list = []
for img_path in os.listdir(image_paths):
    id_list.append(int(os.path.splitext(img_path)[0]))

In [29]:
# Remove all movies that don't have a rating
originallen = len(movies_test)
movies_test_with_rating = movies_test[movies_test.index.isin(ratings.movieid)]
print('Removed %d movies without ratings' % (originallen - len(movies_test_with_rating)))

# Predict genres for all movies in the test set
movies_test_with_rating['predicted_genres'] = movies_test_with_rating.index.map(predict_genres_KNN_submodel)
clear_output()
movies_test_with_rating.head()

Unnamed: 0_level_0,title,genre,predicted_genres
movieid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3397,"Great Muppet Caper, The (1981)","[Children's, Comedy]","[Children's, Comedy]"
2067,Doctor Zhivago (1965),"[Drama, Romance, War]","[Drama, Romance, War]"
2651,Frankenstein Meets the Wolf Man (1943),[Horror],[Horror]
2989,For Your Eyes Only (1981),[Action],[Action]
3415,"Mirror, The (Zerkalo) (1975)",[Drama],"[Drama, Sci-Fi]"


In [33]:
# Calculate multi-label f1 score
from sklearn.metrics import classification_report
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
y_true = mlb.fit_transform(movies_test_with_rating.genre)
y_pred = mlb.transform(movies_test_with_rating.predicted_genres)
print(classification_report(y_true, y_pred, target_names=mlb.classes_))
from sklearn.metrics import f1_score
f1_score(y_true, y_pred, average='weighted')

              precision    recall  f1-score   support

      Action       0.82      0.82      0.82        89
   Adventure       0.78      0.66      0.71        47
   Animation       0.90      0.90      0.90        21
  Children's       0.88      0.92      0.90        48
      Comedy       0.86      0.82      0.84       239
       Crime       0.48      0.34      0.40        29
 Documentary       0.74      0.71      0.72        24
       Drama       0.77      0.75      0.76       293
     Fantasy       1.00      0.86      0.92         7
   Film-Noir       0.67      0.67      0.67         6
      Horror       0.87      0.88      0.87        74
     Musical       0.53      0.62      0.57        13
     Mystery       0.83      0.56      0.67        18
     Romance       0.70      0.53      0.60        92
      Sci-Fi       0.85      0.85      0.85        48
    Thriller       0.75      0.67      0.71       106
         War       0.79      0.60      0.68        25
     Western       0.89    

  _warn_prf(average, modifier, msg_start, len(result))


0.7660487494906209