In [53]:
import pandas as pd
from rank_bm25 import BM25Okapi
import tqdm
import pandas as pd
import nltk
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

from gensim.parsing.preprocessing import remove_stopwords

from nltk.stem import PorterStemmer
from collections import Counter

from sentence_transformers import SentenceTransformer

nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\chris\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### In this script: 
* Merging Wikipedia Movie Plots + MovieLens data to get full plot + ratings
* Splitting ratings for every user into train/test/validation
* Preprocessing of Movie Data + tokenization
* Getting 2 similarity matrices with similarities between all movies in the database
    * BM25 (baseline)
    * BERT embeddings with cosine similarity (neural model)

### New MovieLens dataset (2018)


In [54]:
ml_movies = pd.read_csv("data/ml-latest-small/movies.csv")
ml_ratings = pd.read_csv("data/ml-latest-small/ratings.csv")
ml_links = pd.read_csv("data/ml-latest-small/links.csv")
ml_tags = pd.read_csv("data/ml-latest-small/tags.csv")

In [55]:
movie_sample = pd.read_csv("data/wiki_movie_plots_deduped.csv")

In [56]:
columns = movie_sample.columns.to_list()

for column in columns:
    try:
        movie_sample[column] = movie_sample[column].str.lower()
    except AttributeError:
        pass

In [57]:
# changing column names to later merge based on these columns with the other dataset
movie_sample = movie_sample.rename(columns={"Title": "title", "Release Year": "year"})
movie_sample["year"] = movie_sample["year"].astype(str)
movie_sample["title"] = movie_sample["title"].str.lower().replace(r"^a-zA-Z\d", r"", regex=True).replace(r"^ +| +$", r"", regex=True).str.replace(",", "").str.replace("'", "").str.replace(":", "").str.replace("!", "")


# removing unknown values
movie_sample = movie_sample.dropna(subset=['Plot', "Cast", "Director", "Genre"])
movie_sample = movie_sample.fillna("unknown")
movie_sample.loc[(movie_sample["Plot"] != "unknown") & (movie_sample["Cast"] != "unknown") & (movie_sample["Genre"] != "unknown") & (movie_sample["Director"] != "unknown")]
# movie_sample.loc[movie_sample["Plot"] != "unknown"]
# movie_sample.loc[movie_sample["Cast"] != "unknown"]
# movie_sample.loc[movie_sample["Genre"] != "unknown"]
# movie_sample.loc[movie_sample["Director"] != "unknown"]

Unnamed: 0,year,title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
13,1907,daniel boone,american,wallace mccutcheon and ediwin s. porter,"william craven, florence lawrence",biographical,https://en.wikipedia.org/wiki/daniel_boone_(19...,boone's daughter befriends an indian maiden as...
15,1907,laughing gas,american,edwin stanton porter,"bertha regustus, edward boulden",comedy,https://en.wikipedia.org/wiki/laughing_gas_(fi...,the plot is that of a black woman going to the...
16,1908,the adventures of dollie,american,d. w. griffith,"arthur v. johnson, linda arvidson",drama,https://en.wikipedia.org/wiki/the_adventures_o...,on a beautiful summer day a father and mother ...
17,1908,the black viper,american,d. w. griffith,d. w. griffith,drama,https://en.wikipedia.org/wiki/the_black_viper,a thug accosts a girl as she leaves her workpl...
18,1908,a calamitous elopement,american,d.w. griffith,"harry solter, linda arvidson",comedy,https://en.wikipedia.org/wiki/a_calamitous_elo...,a young couple decides to elope after being ca...
...,...,...,...,...,...,...,...,...
34877,2013,particle (film),turkish,erdem tepegöz,"jale arıkan, rüçhan caliskur, özay fecht, remz...",drama film,https://en.wikipedia.org/wiki/particle_(film),"zeynep lost her job at weaving factory, and he..."
34882,2017,çalgı çengi i̇kimiz,turkish,selçuk aydemir,"ahmet kural, murat cemcir",comedy,https://en.wikipedia.org/wiki/%c3%87alg%c4%b1_...,"two musicians, salih and gürkan, described the..."
34883,2017,olanlar oldu,turkish,hakan algül,"ata demirer, tuvana türkay, ülkü duru",comedy,https://en.wikipedia.org/wiki/olanlar_oldu,"zafer, a sailor living with his mother döndü i..."
34884,2017,non-transferable,turkish,brendan bradley,"youtubers shanna malcolm, shira lazar, sara fl...",romantic comedy,https://en.wikipedia.org/wiki/non-transferable...,the film centres around a young woman named am...


In [58]:
ml_ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [59]:
sample_list_raw = movie_sample[["title", "Director", "Cast", "Genre", "Plot"]].values.tolist()

In [60]:
delimiter = ", "
sample_list_processed = []
for nr, movie in enumerate(sample_list_raw):
    joined = delimiter.join(movie)
    joined = joined.lower()
    joined = joined.replace(",", "")
    joined = joined.replace("\r\n", "")
    joined = joined.replace(".", " ")
    joined = joined.replace("\'s", "s")
    joined = joined.replace("-", "")
    joined = joined.replace('"', '')

    sample_list_processed.append(joined)
   


the movie plot database has 33464 movies

In [61]:
len(sample_list_processed)


33464

### Getting Movie Lens dataset - movies and users + their ratings & looking at the distribution in the data

In [62]:
ml_movies["year"] = ml_movies["title"].str.extract("\((\d{4})\)", expand=True)
ml_movies["title"] = ml_movies["title"].str.replace("\(\d{4}\)", "", regex=True)

""" Source: removing leading and tailing whitespaces: https://stackoverflow.com/questions/49551336/pandas-trim-leading-trailing-white-space-in-a-dataframe
Removing non-alphanumeric characters: https://stackoverflow.com/questions/6053541/regex-every-non-alphanumeric-character-except-white-space-or-colon"""
ml_movies["title"] = ml_movies["title"].str.lower().replace(r"^a-zA-Z\d", r"", regex=True).replace(r"^ +| +$", r"", regex=True).str.replace(",", "").str.replace("'", "").str.replace(":", "").str.replace("!", "")
ml_movies["genres"] = ml_movies["genres"].str.lower().str.replace("|", ",")
ml_movies["year"] = ml_movies["year"].astype(str)

  ml_movies["genres"] = ml_movies["genres"].str.lower().str.replace("|", ",")


In [63]:
# merging our movie dataset with the Movielens dataset
merged_movie_df = pd.merge(ml_movies, movie_sample, on=["title", "year"], how="inner")
movie_ids = merged_movie_df["movieId"].values.tolist()

based on merging the datasets on the movie name and year we ended up with 4859 movies where we have the full plot as well as valid ratings

In [64]:
merged_movie_df.shape

(4859, 10)

### Filtering the ratings of the user and turning them into a training/test and validation set

In [65]:
ratings_filtered = ml_ratings.loc[ml_ratings["movieId"].isin(movie_ids)]
ratings_filtered["timestamp"] = pd.to_datetime(ratings_filtered["timestamp"], unit="s")
ratings_filtered = ratings_filtered.sort_values(['userId'], ascending=[True])

user_ids = list(set(ratings_filtered["userId"].values.tolist()))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ratings_filtered["timestamp"] = pd.to_datetime(ratings_filtered["timestamp"], unit="s")


In [87]:
train_dfs = []
test_dfs = []
validation_dfs = []

for user_id in user_ids:
    user_ratings = ratings_filtered.loc[ratings_filtered["userId"] == user_id]
    nr_of_ratings = len(user_ratings)
    train = user_ratings.head(int(nr_of_ratings*0.6))
    test = user_ratings.tail(int(nr_of_ratings*0.4))
    #val = test.tail(int(nr_of_ratings*0.2))
    #test = test.head(int(nr_of_ratings*0.2))

    train_dfs.append(train)
    test_dfs.append(test)
    #validation_dfs.append(val)
    
ratings_train = pd.concat(train_dfs)
ratings_test = pd.concat(test_dfs)
#ratings_validation = pd.concat(validation_dfs)

### saving ratings for training, testing and validation

* vielleicht brauchen wir eh nur training und testing 

In [90]:
ratings_train.to_csv("data/ratings_train2.csv", index=False)
ratings_test.to_csv("data/ratings_test2.csv", index=False)
#ratings_validation.to_csv("data/ratings_validation.csv", index=False)

In [68]:
relevant = ratings_test.loc[ratings_test["rating"] >= 4]

### BM25 Baseline implementation

In [69]:
def get_individual_tokens(df) -> list:
        """Take in DataFrame and return list of all individual terms
        and a list of lists of the tokens within the documents

        Args:
            df (DataFrame): Input df (should be self.documents)

        Returns:
            list: List of individual terms
            list: List of lists of terms per document
        """

        token_doc_list = df["tokens"].values.tolist()
        individual_terms = []
        for terms in token_doc_list:
            for word in terms:
                individual_terms.append(word)
                     
        individual_terms = list(set(individual_terms)) # keys
        return individual_terms, token_doc_list

In [70]:
def remove_stopwords_and_stem(tokens:list) -> list:
        """Get tokenized words, remove stop words and use PorterStemmer
        through list comprehension and return filtered words

        Args:
            tokens (list): Tokenized full text 

        Returns:
            list: Filtered tokenized full text
        """
        
        stopwords = nltk.corpus.stopwords.words("english")  # downloading stopwords from nltk didn't work
        filtered = [token for token in tokens if token not in stopwords]
        stemmer = PorterStemmer()
        filtered = [stemmer.stem(token) for token in filtered]
        return filtered

### Preprocessing the merged movie dataset for bm25

* stopword removal
* stemming
* merging title, cast, director, genre, plot
* tokenizing and using "," as delimiter to achieve that actors and director names are 1 token (e.g. [tom hanks] instead of [tom, hanks])

In [71]:
merged_movie_df["Plot"] = merged_movie_df["Plot"].str.replace(",", "")
merged_movie_df["Plot"] = merged_movie_df["Plot"].str.replace(".", "")

merged_movie_df["Plot"] = merged_movie_df["Plot"].str.replace("(", "")
merged_movie_df["Plot"] = merged_movie_df["Plot"].str.replace(")", "")
merged_movie_df["Plot"] = merged_movie_df["Plot"].str.replace(")", "")
merged_movie_df["Plot"] = merged_movie_df["Plot"].str.replace('"', "")
merged_movie_df["Plot"] = merged_movie_df["Plot"].str.replace("'", "")
merged_movie_df["Plot"] = merged_movie_df["Plot"].str.replace(" ", ",")
merged_movie_df["Plot"] = merged_movie_df["Plot"].str.replace("\r\n", ",")
merged_movie_df["Plot"] = merged_movie_df["Plot"].replace(to_replace=r'\[\d+\]', value='')

merged_movie_df["Cast"] = merged_movie_df["Cast"].str.replace(", ", ",")


  merged_movie_df["Plot"] = merged_movie_df["Plot"].str.replace(".", "")
  merged_movie_df["Plot"] = merged_movie_df["Plot"].str.replace("(", "")
  merged_movie_df["Plot"] = merged_movie_df["Plot"].str.replace(")", "")
  merged_movie_df["Plot"] = merged_movie_df["Plot"].str.replace(")", "")


In [72]:
merged_movie_df["Genre"] = merged_movie_df["Genre"].str.replace(", ", ",")

In [73]:
merged_movie_df["tokens_raw"] = merged_movie_df["title"] + "," + merged_movie_df["Genre"] + "," + merged_movie_df["Director"] + "," + merged_movie_df["Cast"] + "," + merged_movie_df["Plot"]

In [74]:
merged_movie_df["tokens_raw"] = merged_movie_df["tokens_raw"].apply(remove_stopwords)
merged_movie_df["tokens_raw"] = merged_movie_df["tokens_raw"].str.split(",")
merged_movie_df["tokens_bm25"] = merged_movie_df["tokens_raw"].apply(remove_stopwords_and_stem)



KeyboardInterrupt: 

We're ending up with 4859 movies that have a full plot and ratings

In [None]:
len(merged_movie_df)

4859

### 1. using bm25 to get a similarity matrix for every movie in the dataset

In [None]:
tokens_clean = merged_movie_df["tokens_bm25"].values.tolist()
movie_ids = merged_movie_df["movieId"].values.tolist()

bm25 = BM25Okapi(tokens_clean)


In [None]:
bm25_tokens = merged_movie_df["tokens_bm25"].values.tolist()

In [None]:
movie_and_tokens = {}

for movie, tokens in zip(movie_ids, bm25_tokens):
    movie_and_tokens[movie] = tokens



In [None]:
def get_movie_rank_and_scores(movie_and_tokens):

    movies_similarity = {}
    for movie in movie_and_tokens.keys():
        query_tokenized = movie_and_tokens[movie]
        doc_scores = bm25.get_scores(query_tokenized)
        movies_similarity[movie] = doc_scores

    return movies_similarity

In [None]:
bm25_similarity_matrix_raw = get_movie_rank_and_scores(movie_and_tokens)

In [None]:
bm25_similarity_matrix = pd.DataFrame(bm25_similarity_matrix_raw, index=movie_ids, columns=movie_ids)

In [None]:
#bm25_similarity_matrix.to_csv("data/bm25_similarity_matrix.csv")

man kann für die similarity matrices auch leicht die ID mit dem titel austauschen, um ein gefühl für die ähnlichen filme zu bekommen (achtung: manche filmnamen gibt es doppelt mit anderen jahreszahlen, deshalb sollte man für weitere berechnungen die ID's verwenden um errors zu vermeiden)

In [None]:
movie_titles = merged_movie_df["title"].values.tolist()
bm25_similarity_matrix.columns = movie_titles
bm25_similarity_matrix.index = movie_titles
bm25_similarity_matrix

Unnamed: 0,toy story,jumanji,grumpier old men,waiting to exhale,father of the bride part ii,heat,sabrina,tom and huck,sudden death,goldeneye,...,wonder,daddys home 2,the disaster artist,the post,the greatest showman,ferdinand,phantom thread,bright,gintama,black butler book of the atlantic
toy story,1786.604408,96.045632,50.879903,52.284392,152.010517,64.079492,49.505297,79.314031,83.574031,44.191233,...,94.697259,230.863146,49.869315,64.921028,83.944298,153.745666,84.905282,100.123638,8.739946,76.920939
jumanji,75.233209,1716.965921,61.524728,53.589610,131.747185,78.226928,47.284604,86.818971,88.030328,64.741193,...,111.153621,148.827257,66.140812,49.157362,105.185302,123.263617,56.361075,95.892033,13.410896,60.503902
grumpier old men,61.617220,92.220340,1017.613210,55.480918,98.191341,35.070715,25.770896,62.918114,47.781678,40.955120,...,97.364654,114.863267,37.371347,34.392881,88.386560,80.130090,69.727229,42.810735,5.720987,27.003674
waiting to exhale,54.150898,57.759952,39.564259,771.102812,95.709397,54.702384,36.595983,68.541913,65.409543,31.321633,...,60.073673,153.726363,41.625538,46.904494,92.744574,52.667353,52.163392,36.540693,8.201064,30.867273
father of the bride part ii,96.534989,124.871815,60.249621,43.711209,1722.775851,90.904314,56.305721,64.626040,76.394278,104.060437,...,112.519707,272.641205,98.781066,67.224339,99.959601,191.972946,110.753199,66.494412,13.529740,38.625998
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ferdinand,117.561736,94.094923,55.615010,40.118537,239.706247,71.706712,43.707442,93.802624,107.038530,125.978959,...,74.939945,185.388403,58.407200,51.837741,78.953400,3128.521345,55.464643,126.124443,8.819513,70.181024
phantom thread,95.569541,65.959779,56.192432,55.871437,180.672410,62.965977,40.874140,69.028943,58.522587,51.876579,...,93.435668,191.769748,76.167859,64.408681,60.762098,86.803328,1380.447700,84.816614,0.000000,53.608793
bright,82.209792,85.493083,27.375127,35.502719,81.248479,108.346697,24.971202,88.003116,106.233109,109.150441,...,82.063283,132.597667,67.595218,52.248132,85.378749,127.098698,76.187281,2864.704693,8.538990,71.606508
gintama,14.038009,25.392046,5.466198,22.643185,25.306402,51.505626,14.887460,33.432761,11.029943,8.478168,...,17.927869,17.040656,38.842154,3.358852,19.712173,20.224664,0.000000,15.455576,306.820331,80.561884


### 2. using BERT embeddings and cosine similarity to get movie similarity matrix


In [76]:
merged_movie_df["tokens_bert"] = merged_movie_df["title"] + "," + merged_movie_df["Genre"] + "," + merged_movie_df["Director"] + "," + merged_movie_df["Cast"] + "," + merged_movie_df["Plot"]

tokens_bert = merged_movie_df["tokens_bert"].values.tolist()

In [77]:
""" Source: Using BERT embeddings to get similar movies: https://medium.com/geekculture/nlp-tutorial-movie-recommendation-system-using-bert-d281dc609add"""

bert = SentenceTransformer('bert-base-nli-mean-tokens')

sentence_embeddings = bert.encode(tokens_bert)
similarity = cosine_similarity(sentence_embeddings)

KeyboardInterrupt: 

In [None]:
movies = merged_movie_df["movieId"].values.tolist()
bert_similarities = pd.DataFrame(data=similarity, index=movies, columns=movies)

In [None]:
bert_similarities

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,180095,180231,180297,180497,180985,180987,181315,182823,191005,193581
1,1.000000,0.648643,0.605092,0.496094,0.773694,0.429486,0.654809,0.565022,0.463111,0.484742,...,0.656545,0.693257,0.721117,0.513948,0.672848,0.507768,0.532128,0.613353,0.445363,0.475026
2,0.648643,1.000000,0.674596,0.444419,0.659802,0.636512,0.652176,0.726117,0.661237,0.640183,...,0.668160,0.670507,0.668043,0.546959,0.616766,0.566365,0.481389,0.660264,0.527112,0.573363
3,0.605092,0.674596,1.000000,0.538836,0.715977,0.567193,0.815671,0.584686,0.617953,0.506850,...,0.673304,0.790290,0.665478,0.569067,0.663937,0.592974,0.606039,0.699589,0.598132,0.579098
4,0.496094,0.444419,0.538836,1.000000,0.612757,0.287217,0.609637,0.411334,0.435866,0.302620,...,0.405558,0.492546,0.490648,0.506490,0.426739,0.420877,0.596081,0.486736,0.469926,0.572693
5,0.773694,0.659802,0.715977,0.612757,1.000000,0.460315,0.709289,0.688867,0.570962,0.504957,...,0.704852,0.798235,0.738354,0.627870,0.686445,0.568341,0.612401,0.638353,0.458462,0.518412
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180987,0.507768,0.566365,0.592974,0.420877,0.568341,0.560228,0.540970,0.624828,0.418278,0.493905,...,0.558965,0.642870,0.626865,0.655522,0.654809,1.000000,0.459236,0.689295,0.568151,0.539494
181315,0.532128,0.481389,0.606039,0.596081,0.612401,0.495233,0.661764,0.439365,0.530975,0.542008,...,0.588774,0.543670,0.580568,0.569988,0.634507,0.459236,1.000000,0.545444,0.622358,0.643413
182823,0.613353,0.660264,0.699589,0.486736,0.638353,0.637654,0.633777,0.620639,0.586419,0.625719,...,0.673985,0.667325,0.705063,0.696416,0.627946,0.689295,0.545444,1.000000,0.657153,0.638668
191005,0.445363,0.527112,0.598132,0.469926,0.458462,0.527840,0.560143,0.436115,0.497209,0.494438,...,0.527730,0.546882,0.567105,0.626051,0.526596,0.568151,0.622358,0.657153,1.000000,0.689665


In [None]:
bert_similarities.to_csv("data/bert_similarity_matrix.csv", index=False)

In [80]:
def getRecommendations(userId, dataset):
    ret = []

    history = dataset[dataset["userId"] == userId]
    history = history.sort_values(by='timestamp', ascending=False)

    likedMovies = []
    dislikedMovies = []

    likedMovieIndices = []
    dislikedMovieIndices = []
    
    lr = 5
    dr = 1
    while(len(likedMovies) != 5 or len(dislikedMovies) != 5):
        for index, row in history.iterrows():
            if row["rating"] == lr and len(likedMovies) != 5:
                likedMovies.append(row)
                likedMovieIndices.append(index)
            if row["rating"] == dr and len(dislikedMovies) != 5:
                dislikedMovies.append(row)
                dislikedMovieIndices.append(index)
        lr -= 1
        dr += 1
    
    userpicks = likedMovies + dislikedMovies
    userpicksIndices = likedMovieIndices + dislikedMovieIndices

    #print(userpicks)
    #print(dataset)
    
    for i in range(len(dataset)):
        final_rating = 0
        for m in range(len(userpicksIndices)):
            if(dataset.iloc[i]["userId"] != userId):
                rating = dataset.iloc[i]["rating"]
                if(userpicks[m]["rating"]==4):
                    rating *= 0.5
                elif(userpicks[m]["rating"]==3):
                    rating *= 0
                elif(userpicks[m]["rating"]==2):
                    rating *= -0.5
                elif(userpicks[m]["rating"]==1):
                    rating *= -1
                #print(m["rating"],rating)
                final_rating += rating
            else:
                final_rating = dataset.iloc[i]["rating"] * 5
        ret.append([userId, dataset.iloc[i]["movieId"], final_rating])

    print(ret)
    ret = sorted(ret, key=lambda x: x[2], reverse=True)

    print(type(ret))
    return ret

getRecommendations(1, ratings_train)


[[1, 1, 20.0], [1, 2387, 25.0], [1, 2389, 10.0], [1, 2395, 25.0], [1, 2406, 20.0], [1, 2414, 15.0], [1, 2450, 20.0], [1, 2470, 25.0], [1, 2502, 25.0], [1, 2528, 15.0], [1, 2529, 25.0], [1, 2580, 25.0], [1, 2616, 20.0], [1, 2640, 20.0], [1, 2641, 25.0], [1, 2644, 20.0], [1, 2366, 20.0], [1, 2353, 25.0], [1, 2338, 10.0], [1, 2329, 25.0], [1, 2012, 20.0], [1, 2028, 20.0], [1, 2033, 25.0], [1, 2046, 20.0], [1, 2054, 20.0], [1, 2093, 15.0], [1, 2096, 20.0], [1, 2648, 20.0], [1, 2105, 20.0], [1, 2137, 25.0], [1, 2143, 20.0], [1, 2174, 20.0], [1, 2193, 20.0], [1, 2253, 10.0], [1, 2273, 20.0], [1, 2291, 25.0], [1, 2115, 25.0], [1, 2005, 25.0], [1, 2797, 20.0], [1, 2872, 25.0], [1, 3441, 25.0], [1, 3448, 25.0], [1, 3450, 25.0], [1, 3479, 20.0], [1, 3489, 20.0], [1, 3527, 20.0], [1, 3578, 25.0], [1, 3617, 20.0], [1, 3671, 25.0], [1, 3702, 25.0], [1, 3729, 25.0], [1, 3740, 20.0], [1, 3744, 20.0], [1, 3793, 25.0], [1, 3809, 20.0], [1, 3440, 20.0], [1, 3439, 20.0], [1, 3386, 25.0], [1, 3273, 25.0],

[[1, 2387, 25.0],
 [1, 2395, 25.0],
 [1, 2470, 25.0],
 [1, 2502, 25.0],
 [1, 2529, 25.0],
 [1, 2580, 25.0],
 [1, 2641, 25.0],
 [1, 2353, 25.0],
 [1, 2329, 25.0],
 [1, 2033, 25.0],
 [1, 2137, 25.0],
 [1, 2291, 25.0],
 [1, 2115, 25.0],
 [1, 2005, 25.0],
 [1, 2872, 25.0],
 [1, 3441, 25.0],
 [1, 3448, 25.0],
 [1, 3450, 25.0],
 [1, 3578, 25.0],
 [1, 3671, 25.0],
 [1, 3702, 25.0],
 [1, 3729, 25.0],
 [1, 3793, 25.0],
 [1, 3386, 25.0],
 [1, 3273, 25.0],
 [1, 2899, 25.0],
 [1, 2947, 25.0],
 [1, 2948, 25.0],
 [1, 2949, 25.0],
 [1, 2959, 25.0],
 [1, 2858, 25.0],
 [1, 2991, 25.0],
 [1, 3033, 25.0],
 [1, 3034, 25.0],
 [1, 3253, 25.0],
 [1, 2993, 25.0],
 [1, 3052, 25.0],
 [1, 1954, 25.0],
 [1, 527, 25.0],
 [1, 553, 25.0],
 [1, 596, 25.0],
 [1, 608, 25.0],
 [1, 661, 25.0],
 [1, 1, 20.0],
 [1, 2406, 20.0],
 [1, 2450, 20.0],
 [1, 2616, 20.0],
 [1, 2640, 20.0],
 [1, 2644, 20.0],
 [1, 2366, 20.0],
 [1, 2012, 20.0],
 [1, 2028, 20.0],
 [1, 2046, 20.0],
 [1, 2054, 20.0],
 [1, 2096, 20.0],
 [1, 2648, 20.0],


In [None]:
def testModel(userId):
    ret = []

    history = ratings_test[ratings_test["userId"] == userId]
    history = history.sort_values(by='timestamp', ascending=False)

    likedMovies = []
    dislikedMovies = []

    recommendations = getRecommendations(1)
    
    lr = 5
    dr = 1
    while(len(likedMovies) != 5 or len(dislikedMovies) != 5):
        for index, row in history.iterrows():
            if row["rating"] == lr and len(likedMovies) != 5:
                likedMovies.append(row)
            if row["rating"] == dr and len(dislikedMovies) != 5:
                dislikedMovies.append(row)
        lr -= 1
        dr += 1
    
    userpicks = likedMovies + dislikedMovies

    for i in range(len(history)):
        final_rating = 0
        for m in userpicks:
            
    
    for i in range(len(bert_similarities)):
        final_rating = 0
        for m in userpicks:
            if(m["movieId"] != i):
                rating = bert_similarities.iloc[m["movieId"],i]
                if(m["rating"]==4):
                    rating *= 0.5
                elif(m["rating"]==3):
                    rating *= 0
                elif(m["rating"]==2):
                    rating *= -0.5
                elif(m["rating"]==1):
                    rating *= -1
                print(m["rating"],rating)
                final_rating += rating
        
        ret.append([final_rating, i ])
        break
            

    ret = sorted(ret, key=lambda x: x[0], reverse=True)

    return ret

print(getRecommendations(1))


IndentationError: expected an indented block after 'for' statement on line 27 (1514531141.py, line 30)

In [None]:
!pip install implicit

import pandas as pd
import implicit
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np
from scipy.sparse import find

# Assuming you have a DataFrame 'ratings_filtered' with columns 'userId', 'movieId', and 'rating'
# and a DataFrame 'bert_similarities' with columns and rows representing movie IDs

# Assuming validation_data is a csr_matrix
def sample_sparse_matrix(sparse_matrix, fraction, random_state=42):
    row, col, data = find(sparse_matrix)
    mask = np.random.rand(len(data)) < fraction
    row_sampled, col_sampled, data_sampled = row[mask], col[mask], data[mask]
    sampled_matrix = csr_matrix((data_sampled, (row_sampled, col_sampled)), shape=sparse_matrix.shape)
    return sampled_matrix


# Create a sparse user-item matrix
sparse_matrix = pd.pivot_table(ratings_filtered, values='rating', index='userId', columns='movieId').fillna(0)
sparse_matrix_csr = csr_matrix(sparse_matrix)

# Ensure that 'bert_similarities' has the same movie IDs as 'sparse_matrix'
common_movie_ids = sparse_matrix.columns.intersection(bert_similarities.columns)
bert_similarities_common = bert_similarities.loc[common_movie_ids, common_movie_ids]

# Create a sparse item-item similarity matrix
item_similarities_csr = csr_matrix(bert_similarities_common.values)

# Ensure that both matrices have the same shape
if sparse_matrix_csr.shape != item_similarities_csr.shape:
    # Resize the matrices to have the same shape
    common_shape = (max(sparse_matrix_csr.shape[0], item_similarities_csr.shape[0]),
                    max(sparse_matrix_csr.shape[1], item_similarities_csr.shape[1]))

    sparse_matrix_csr.resize(common_shape)
    item_similarities_csr.resize(common_shape)

# Combine user-item matrix with item-item similarity matrix
hybrid_matrix = sparse_matrix_csr.dot(item_similarities_csr)

combined_matrix = hybrid_matrix

# Check for NaN values in the combined matrix
if np.isnan(combined_matrix.data).any():
    raise ValueError("NaN values detected in the combined matrix. Please check your data.")

# Split the data into training, validation, and testing sets
train_data, test_data = train_test_split(combined_matrix, test_size=0.2, random_state=42)
validation_data, test_data = train_test_split(test_data, test_size=0.5, random_state=42)

# Build the ALS (Alternating Least Squares) model
model = implicit.als.AlternatingLeastSquares(factors=50, regularization=0.01, iterations=50)

# Train the model for 5 epochs
for epoch in range(5):
    # Train the model
    model.fit(train_data)

    # Validate the model on the validation set after each epoch
    validation_data_sample = sample_sparse_matrix(validation_data, fraction=0.001, random_state=42)
    validation_error = validate_model(model, validation_data_sample)
    print(f"Epoch {epoch + 1} - Mean Squared Error on Validation Set: {validation_error}")


# Make recommendations for a user in the test set
user_id = 1
user_items = combined_matrix.T.tocsr()[user_id - 1]  # Select the row corresponding to the user
recommendations = model.recommend(user_id - 1, user_items=user_items)

# Print the recommended movie IDs
print(recommendations)






[notice] A new release of pip is available: 23.3.1 -> 23.3.2
[notice] To update, run: python.exe -m pip install --upgrade pip
  from .autonotebook import tqdm as notebook_tqdm


NameError: name 'ratings_filtered' is not defined

In [None]:
import implicit
from scipy.sparse import csr_matrix
from sklearn.metrics import mean_squared_error
import pandas as pd

# Assuming you have a DataFrame 'ratings_train' with columns 'userId', 'movieId', and 'rating'
# You might need to adapt this code to match your actual DataFrame structure

# Create a sparse user-item matrix for training
sparse_matrix_train = pd.pivot_table(ratings_train, values='rating', index='userId', columns='movieId').fillna(0)
sparse_matrix_train_csr = csr_matrix(sparse_matrix_train)

# Create a sparse user-item matrix for testing
sparse_matrix_test = pd.pivot_table(ratings_test, values='rating', index='userId', columns='movieId').fillna(0)
sparse_matrix_test_csr = csr_matrix(sparse_matrix_test)

# Build the ALS (Alternating Least Squares) model
model = implicit.als.AlternatingLeastSquares(factors=50, regularization=0.01)

# Specify the number of epochs
num_epochs = 10

for epoch in range(num_epochs):
    # Train the model for each epoch
    model.fit(sparse_matrix_train_csr)

    # Test the model on the test set after each epoch
    test_error = test_model(model, ratings_test)
    print(f"Mean Squared Error on Test Set after Epoch {epoch + 1}: {test_error}")


100%|██████████| 15/15 [00:01<00:00,  9.97it/s]


Mean Squared Error on Test Set after Epoch 1: 5811843.9402506


100%|██████████| 15/15 [00:01<00:00,  9.79it/s]


Mean Squared Error on Test Set after Epoch 2: 5494336.713782969


100%|██████████| 15/15 [00:01<00:00,  9.57it/s]


Mean Squared Error on Test Set after Epoch 3: 5178254.913798437


100%|██████████| 15/15 [00:01<00:00,  9.05it/s]


Mean Squared Error on Test Set after Epoch 4: 5283539.39527419


100%|██████████| 15/15 [00:01<00:00,  9.42it/s]


Mean Squared Error on Test Set after Epoch 5: 5269464.821602599


100%|██████████| 15/15 [00:01<00:00,  9.28it/s]


Mean Squared Error on Test Set after Epoch 6: 5298615.291244489


100%|██████████| 15/15 [00:01<00:00,  9.57it/s]


Mean Squared Error on Test Set after Epoch 7: 5110666.738456184


100%|██████████| 15/15 [00:01<00:00,  9.26it/s]


Mean Squared Error on Test Set after Epoch 8: 5129098.682612731


100%|██████████| 15/15 [00:01<00:00,  9.31it/s]


Mean Squared Error on Test Set after Epoch 9: 5141559.693518447


100%|██████████| 15/15 [00:01<00:00,  9.06it/s]


Mean Squared Error on Test Set after Epoch 10: 5206167.236097146


In [None]:
import implicit
from scipy.sparse import csr_matrix
from sklearn.metrics import mean_squared_error
import pandas as pd

# Assuming you have a DataFrame 'ratings_train' with columns 'userId', 'movieId', and 'rating'
# You might need to adapt this code to match your actual DataFrame structure

# Create a sparse user-item matrix for training
sparse_matrix_train = pd.pivot_table(ratings_train, values='rating', index='userId', columns='movieId').fillna(0)
sparse_matrix_train_csr = csr_matrix(sparse_matrix_train)

# Create a sparse user-item matrix for testing
sparse_matrix_test = pd.pivot_table(ratings_test, values='rating', index='userId', columns='movieId').fillna(0)
sparse_matrix_test_csr = csr_matrix(sparse_matrix_test)

# Create a sparse user-item matrix for validation
sparse_matrix_validation = pd.pivot_table(ratings_validation, values='rating', index='userId', columns='movieId').fillna(0)
sparse_matrix_validation_csr = csr_matrix(sparse_matrix_validation)

# Build the ALS (Alternating Least Squares) model
model = implicit.als.AlternatingLeastSquares(factors=50, regularization=0.01, iterations=50)

# Number of epochs
num_epochs = 10

for epoch in range(num_epochs):
    # Train the model
    model.fit(sparse_matrix_train_csr)

    # Validate the model on the validation set
    validation_error = validate_model(model, ratings_validation)
    print(f"Epoch {epoch + 1}/{num_epochs} - Mean Squared Error on Validation Set: {validation_error}")


100%|██████████| 50/50 [00:05<00:00,  9.64it/s]


Epoch 1/10 - Mean Squared Error on Validation Set: 5344071.289794261


100%|██████████| 50/50 [00:05<00:00,  9.54it/s]


Epoch 2/10 - Mean Squared Error on Validation Set: 4946737.0765527105


100%|██████████| 50/50 [00:05<00:00,  9.49it/s]


Epoch 3/10 - Mean Squared Error on Validation Set: 4821031.435358496


100%|██████████| 50/50 [00:05<00:00,  9.40it/s]


Epoch 4/10 - Mean Squared Error on Validation Set: 4919157.194504602


100%|██████████| 50/50 [00:05<00:00,  9.32it/s]


Epoch 5/10 - Mean Squared Error on Validation Set: 4936587.577055457


100%|██████████| 50/50 [00:05<00:00,  9.15it/s]


Epoch 6/10 - Mean Squared Error on Validation Set: 5001446.457247274


100%|██████████| 50/50 [00:05<00:00,  9.02it/s]


Epoch 7/10 - Mean Squared Error on Validation Set: 4982824.355383247


100%|██████████| 50/50 [00:05<00:00,  9.30it/s]


Epoch 8/10 - Mean Squared Error on Validation Set: 4979062.903530822


100%|██████████| 50/50 [00:05<00:00,  9.24it/s]


Epoch 9/10 - Mean Squared Error on Validation Set: 4983010.888835177


100%|██████████| 50/50 [00:05<00:00,  9.09it/s]


Epoch 10/10 - Mean Squared Error on Validation Set: 4980796.117855209


In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

# Assuming bert_similarities is a DataFrame with similarity scores
# and user_ratings is a dictionary where keys are users and values are dictionaries of movie ratings

# Convert DataFrame to NumPy array
bert_similarities_array = bert_similarities.values

# Create a list of unique movies
movies = sorted(list(set(bert_similarities_array.flatten())))

# Create a user-item matrix
user_item_matrix = np.zeros((len(user_ratings), len(movies)))

for i, user in enumerate(user_ratings):
    for j, movie in enumerate(movies):
        if movie in user_ratings[user]:
            # Combine rating and similarity score (you can adjust the combination method)
            user_item_matrix[i, j] = user_ratings[user][movie] * bert_similarities_array[movie_index(movie), movie_index(movie)]

# Normalize the matrix if needed
user_item_matrix_normalized = user_item_matrix / user_item_matrix.max()

# Assuming target is a binary target variable indicating whether the user likes the movie (1) or not (0)
# You need to replace this with your actual target variable.

# Generate random target variable for the example
target = np.random.randint(2, size=user_item_matrix_normalized.shape[0])

# Split the data into training, validation, and test sets
split1 = int(0.7 * user_item_matrix_normalized.shape[0])
split2 = int(0.85 * user_item_matrix_normalized.shape[0])

X_train, y_train = user_item_matrix_normalized[:split1], target[:split1]
X_val, y_val = user_item_matrix_normalized[split1:split2], target[split1:split2]
X_test, y_test = user_item_matrix_normalized[split2:], target[split2:]

# Convert to sparse matrix
X_train_sparse = csr_matrix(X_train)
X_val_sparse = csr_matrix(X_val)
X_test_sparse = csr_matrix(X_test)
# Sigmoid activation function
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

# Derivative of the sigmoid function
def sigmoid_derivative(x):
    return x * (1 - x)

# Initialize weights and biases
input_size = user_item_matrix.shape[1]
hidden_size = 64
output_size = 1

np.random.seed(42)

weights_input_hidden = np.random.rand(input_size, hidden_size)
weights_hidden_output = np.random.rand(hidden_size, output_size)

biases_hidden = np.zeros((1, hidden_size))
biases_output = np.zeros((1, output_size))

# Hyperparameters
learning_rate = 0.01
epochs = 10

# Training
for epoch in range(epochs):
    # Forward pass
    hidden_layer_input = np.dot(X_train, weights_input_hidden) + biases_hidden
    hidden_layer_output = sigmoid(hidden_layer_input)

    output_layer_input = np.dot(hidden_layer_output, weights_hidden_output) + biases_output
    predicted_output = sigmoid(output_layer_input)

    # Calculate loss
    loss = -np.mean(y_train * np.log(predicted_output) + (1 - y_train) * np.log(1 - predicted_output))

    # Backward pass
    output_error = y_train.reshape(-1, 1) - predicted_output
    output_delta = output_error * sigmoid_derivative(predicted_output)

    hidden_layer_error = output_delta.dot(weights_hidden_output.T)
    hidden_layer_delta = hidden_layer_error * sigmoid_derivative(hidden_layer_output)

    # Update weights and biases
    weights_hidden_output += hidden_layer_output.T.dot(output_delta) * learning_rate
    weights_input_hidden += X_train.T.dot(hidden_layer_delta) * learning_rate

    biases_output += np.sum(output_delta, axis=0, keepdims=True) * learning_rate
    biases_hidden += np.sum(hidden_layer_delta, axis=0, keepdims=True) * learning_rate

    # Validation accuracy
    hidden_layer_val = sigmoid(np.dot(X_val, weights_input_hidden) + biases_hidden)
    predicted_val = sigmoid(np.dot(hidden_layer_val, weights_hidden_output) + biases_output)

    val_accuracy = np.mean((predicted_val > 0.5) == y_val.reshape(-1, 1))

    print(f'Epoch {epoch + 1}/{epochs}, Loss: {loss}, Validation Accuracy: {val_accuracy}')

# Test accuracy
hidden_layer_test = sigmoid(np.dot(X_test, weights_input_hidden) + biases_hidden)
predicted_test = sigmoid(np.dot(hidden_layer_test, weights_hidden_output) + biases_output)

test_accuracy = np.mean((predicted_test > 0.5) == y_test.reshape(-1, 1))
print(f'Test Accuracy: {test_accuracy}')

MemoryError: Unable to allocate 30.5 GiB for an array with shape (751, 5446130) and data type float64

In [137]:
def getRecommendations(userId):
    ret = []

    history = ratings_train[ratings_train["userId"] == userId]
    history = history.sort_values(by='timestamp', ascending=False)

    likedMovies = []
    dislikedMovies = []
    
    lr = 5
    dr = 1
    # while(len(likedMovies) != 10 or len(dislikedMovies) != 10):
    #     for index, row in history.iterrows():
    #         if row["rating"] == lr and len(likedMovies) != 10:
    #             likedMovies.append(row)
    #         if row["rating"] == dr and len(dislikedMovies) != 10:
    #             dislikedMovies.append(row)
    #     lr -= 1
    #     dr += 1
    
    #userpicks = likedMovies + dislikedMovies
    userpicks = []
    for index, row in history.iterrows():
        userpicks.append(row)
    
    for i in range(len(bert_similarities)):
        final_rating = 0
        if i in history["movieId"].values:
            continue
        for m in userpicks:
            rating = bert_similarities.iloc[m["movieId"],i]
            if(m["rating"]==5):
                rating *= 15
            if(m["rating"]==4):
                rating *= 3
            elif(m["rating"]==3):
                rating *= 1
            elif(m["rating"]==2):
                rating *= -1
            elif(m["rating"]==1):
                rating *= -5
            #print(m["rating"],rating)
            final_rating += rating
        
        ret.append([final_rating, i ])
            

    ret = sorted(ret, key=lambda x: x[0], reverse=True)

    return ret

recommendations = getRecommendations(1)

In [82]:
file_path = 'data/bert_similarity_matrix.csv'

# Read the CSV file into a DataFrame
bert_similarities = pd.read_csv(file_path)

bert_similarities.index = bert_similarities.index + 1


# Display the DataFrame
print(bert_similarities)

             1         2         3         4         5         6         7  \
1     1.000000  0.648643  0.605092  0.496094  0.773694  0.429486  0.654809   
2     0.648643  1.000000  0.674596  0.444419  0.659802  0.636512  0.652176   
3     0.605092  0.674596  1.000000  0.538836  0.715977  0.567193  0.815671   
4     0.496094  0.444419  0.538836  1.000000  0.612757  0.287217  0.609637   
5     0.773694  0.659802  0.715977  0.612757  1.000000  0.460315  0.709289   
...        ...       ...       ...       ...       ...       ...       ...   
4855  0.507768  0.566365  0.592974  0.420877  0.568341  0.560228  0.540970   
4856  0.532128  0.481389  0.606039  0.596081  0.612401  0.495233  0.661764   
4857  0.613353  0.660264  0.699589  0.486736  0.638353  0.637654  0.633777   
4858  0.445363  0.527112  0.598132  0.469926  0.458462  0.527840  0.560143   
4859  0.475027  0.573363  0.579098  0.572693  0.518412  0.611810  0.638153   

             8         9        10  ...    180095    180231    

In [138]:
def find_position(movie_data, target_movie_id):
    for position, data in enumerate(movie_data):
        if data[1] == target_movie_id:
            return position  # Return position, latitude, and longitude

    return None
#print(len(recommendations))

history = ratings_test[(ratings_test["userId"] == 1) & (ratings_test["rating"] == 5)]

#print(history)
positions = []

unique_movie_ids = history['movieId'].unique()

for movie_id in unique_movie_ids:
    position = find_position(recommendations, movie_id)
    positions.append((movie_id, position))


print(positions)

only_postions = [position for movie_id, position in positions]

print(only_postions)

g_counter = 0
b_counter = 0

for p in only_postions:
    if p > len(recommendations) / 2:
        b_counter += 1
    else:
        g_counter += 1

print(g_counter)
print(b_counter)



[(1029, 1226), (101, 87), (151, 2607), (157, 3730), (1031, 813), (163, 464), (333, 4645), (216, 2084), (1032, 2640), (954, 2734), (1517, 3135), (1291, 260), (1573, 3461), (1617, 4124), (1927, 1098), (1587, 3564), (1278, 1687), (1270, 2842), (1073, 3787), (1080, 3331), (1089, 4693), (1275, 2008), (1097, 1443), (1136, 2020), (1092, 3977), (1213, 2049), (1222, 3145), (1224, 2007), (1256, 2743)]
[1226, 87, 2607, 3730, 813, 464, 4645, 2084, 2640, 2734, 3135, 260, 3461, 4124, 1098, 3564, 1687, 2842, 3787, 3331, 4693, 2008, 1443, 2020, 3977, 2049, 3145, 2007, 2743]
13
16


In [140]:
success = 0
for i in range(10):
    rec = getRecommendations(i)
    history = ratings_test[(ratings_test["userId"] == 1) & (ratings_test["rating"] == 5)]
    
    positions = []

    unique_movie_ids = history['movieId'].unique()

    for movie_id in unique_movie_ids:
        position = find_position(rec, movie_id)
        positions.append((movie_id, position))

    only_postions = [position for movie_id, position in positions]

    g_counter = 0
    b_counter = 0

    for p in only_postions:
        if p > len(recommendations) / 2:
            b_counter += 1
        else:
            g_counter += 1

    if(g_counter >= b_counter):
        success += 1

    print(g_counter)
    print(b_counter)
    
print(success)



KeyboardInterrupt: 

In [161]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Concatenate

# Sample dataFrames (replace these with your actual data)
ratings_df = pd.DataFrame({
    'userId': [1, 1, 2, 2, 3],
    'movieId': [101, 102, 101, 103, 102],
    'rating': [4, 5, 3, 4, 5]
})

# Assuming similarity_matrix is your movie similarity matrix
similarity_matrix = np.array([
    [1.0, 0.8, 0.5],
    [0.8, 1.0, 0.7],
    [0.5, 0.7, 1.0]
])

# Merge ratings and similarity dataFrames
#ratings_filtered = ratings_filtered.drop(columns=['timestamp'])

merged_df = pd.merge(ratings_filtered, pd.DataFrame(bert_similarities), left_on='movieId', right_index=True)

# Split the data into train and test sets
train_df, test_df = train_test_split(merged_df, test_size=0.2, random_state=42)
unique_movie_ids = train_df['movieId'].unique()

# Create a mapping from original movieId to contiguous integers
movie_id_mapping = {movie_id: idx for idx, movie_id in enumerate(unique_movie_ids)}

# Map the 'movieId' column to contiguous integers
train_df['movieId'] = train_df['movieId'].map(movie_id_mapping)

# Similarly, apply the mapping to the test set
test_df['movieId'] = test_df['movieId'].map(movie_id_mapping)

# Preprocess the data
scaler = StandardScaler()
train_df['rating'] = scaler.fit_transform(train_df[['rating']])
test_df['rating'] = scaler.transform(test_df[['rating']])

# Neural network model
user_input = Input(shape=(1,), name='user_input')
movie_input = Input(shape=(1,), name='movie_input')
similarity_input = Input(shape=(len(bert_similarities),), name='similarity_input')

# Use Embedding layers with input_dim set to the number of unique values
user_embedding = Embedding(input_dim=train_df['userId'].nunique(), output_dim=10)(user_input)
movie_embedding = Embedding(input_dim=train_df['movieId'].nunique(), output_dim=10)(movie_input)

user_flatten = Flatten()(user_embedding)
movie_flatten = Flatten()(movie_embedding)

concatenated = Concatenate()([user_flatten, movie_flatten, similarity_input])
dense1 = Dense(128, activation='relu')(concatenated)
dense2 = Dense(64, activation='relu')(dense1)
output = Dense(1, activation='linear', name='output')(dense2)

model = Model(inputs=[user_input, movie_input, similarity_input], outputs=output)
model.compile(optimizer='adam', loss='mean_squared_error')

# Train the model
model.fit([train_df['userId'], train_df['movieId'], np.array(train_df.iloc[:, 3:])], train_df['rating'], epochs=10, batch_size=32, validation_split=0.2)

# Evaluate the model on the test set
mse = model.evaluate([test_df['userId'], test_df['movieId'], np.array(test_df.iloc[:, 3:])], test_df['rating'])
print(f'Mean Squared Error on Test Set: {mse}')

Epoch 1/10


InvalidArgumentError: Graph execution error:

Detected at node model_16/embedding_32/embedding_lookup defined at (most recent call last):
  File "c:\Users\chris\AppData\Local\Programs\Python\Python310\lib\runpy.py", line 196, in _run_module_as_main

  File "c:\Users\chris\AppData\Local\Programs\Python\Python310\lib\runpy.py", line 86, in _run_code

  File "C:\Users\chris\AppData\Roaming\Python\Python310\site-packages\ipykernel_launcher.py", line 17, in <module>

  File "C:\Users\chris\AppData\Roaming\Python\Python310\site-packages\traitlets\config\application.py", line 1077, in launch_instance

  File "C:\Users\chris\AppData\Roaming\Python\Python310\site-packages\ipykernel\kernelapp.py", line 739, in start

  File "C:\Users\chris\AppData\Roaming\Python\Python310\site-packages\tornado\platform\asyncio.py", line 205, in start

  File "c:\Users\chris\AppData\Local\Programs\Python\Python310\lib\asyncio\base_events.py", line 595, in run_forever

  File "c:\Users\chris\AppData\Local\Programs\Python\Python310\lib\asyncio\base_events.py", line 1881, in _run_once

  File "c:\Users\chris\AppData\Local\Programs\Python\Python310\lib\asyncio\events.py", line 80, in _run

  File "C:\Users\chris\AppData\Roaming\Python\Python310\site-packages\ipykernel\kernelbase.py", line 529, in dispatch_queue

  File "C:\Users\chris\AppData\Roaming\Python\Python310\site-packages\ipykernel\kernelbase.py", line 518, in process_one

  File "C:\Users\chris\AppData\Roaming\Python\Python310\site-packages\ipykernel\kernelbase.py", line 424, in dispatch_shell

  File "C:\Users\chris\AppData\Roaming\Python\Python310\site-packages\ipykernel\kernelbase.py", line 766, in execute_request

  File "C:\Users\chris\AppData\Roaming\Python\Python310\site-packages\ipykernel\ipkernel.py", line 429, in do_execute

  File "C:\Users\chris\AppData\Roaming\Python\Python310\site-packages\ipykernel\zmqshell.py", line 549, in run_cell

  File "C:\Users\chris\AppData\Roaming\Python\Python310\site-packages\IPython\core\interactiveshell.py", line 3048, in run_cell

  File "C:\Users\chris\AppData\Roaming\Python\Python310\site-packages\IPython\core\interactiveshell.py", line 3103, in _run_cell

  File "C:\Users\chris\AppData\Roaming\Python\Python310\site-packages\IPython\core\async_helpers.py", line 129, in _pseudo_sync_runner

  File "C:\Users\chris\AppData\Roaming\Python\Python310\site-packages\IPython\core\interactiveshell.py", line 3308, in run_cell_async

  File "C:\Users\chris\AppData\Roaming\Python\Python310\site-packages\IPython\core\interactiveshell.py", line 3490, in run_ast_nodes

  File "C:\Users\chris\AppData\Roaming\Python\Python310\site-packages\IPython\core\interactiveshell.py", line 3550, in run_code

  File "C:\Users\chris\AppData\Local\Temp\ipykernel_13616\143158023.py", line 66, in <module>

  File "c:\Users\chris\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\src\utils\traceback_utils.py", line 65, in error_handler

  File "c:\Users\chris\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\src\engine\training.py", line 1807, in fit

  File "c:\Users\chris\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\src\engine\training.py", line 1401, in train_function

  File "c:\Users\chris\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\src\engine\training.py", line 1384, in step_function

  File "c:\Users\chris\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\src\engine\training.py", line 1373, in run_step

  File "c:\Users\chris\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\src\engine\training.py", line 1150, in train_step

  File "c:\Users\chris\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\src\utils\traceback_utils.py", line 65, in error_handler

  File "c:\Users\chris\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\src\engine\training.py", line 590, in __call__

  File "c:\Users\chris\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\src\utils\traceback_utils.py", line 65, in error_handler

  File "c:\Users\chris\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\src\engine\base_layer.py", line 1149, in __call__

  File "c:\Users\chris\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\src\utils\traceback_utils.py", line 96, in error_handler

  File "c:\Users\chris\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\src\engine\functional.py", line 515, in call

  File "c:\Users\chris\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\src\engine\functional.py", line 672, in _run_internal_graph

  File "c:\Users\chris\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\src\utils\traceback_utils.py", line 65, in error_handler

  File "c:\Users\chris\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\src\engine\base_layer.py", line 1149, in __call__

  File "c:\Users\chris\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\src\utils\traceback_utils.py", line 96, in error_handler

  File "c:\Users\chris\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\src\layers\core\embedding.py", line 272, in call

indices[18,0] = 605 is not in [0, 605)
	 [[{{node model_16/embedding_32/embedding_lookup}}]] [Op:__inference_train_function_17567]