# Advanced Information Retrieval - Project

### Organizational infos:
* 30 Points in total
* Deadline: 09.01.2023 23:59

In [None]:
#imports

import pandas as pd
import numpy as np
import random
import ast
import gensim
from torch import nn
import torch
from nltk.tokenize import WhitespaceTokenizer
from scipy.stats import pearsonr
from thefuzz import fuzz
from thefuzz import process

## Movie struct
Create a struct that contains all the movie-relevant information to work with.
It can be changed at any time.


**Define all relevant class attributes**

In [None]:
class Movie:
    """One movie with its genres, genome-tag data and running rating statistics.

    Other code in this file treats an all-zero ``tagsRelevance`` vector of
    length 1128 as "this movie has no genome tags".
    """

    def __init__(self, id, name, genres):
        """Store the identifying fields and start with empty tag/rating state."""
        self.id, self.name, self.genres = id, name, genres
        # One slot per genome tag, all zero until real scores are attached.
        self.tagsRelevance = [0 for _ in range(1128)]
        self.tagsList = list()
        self.ratingCount = self.ratingSum = self.rating = 0

    def SetTagRelevanceVector(self, tagsRelevance):
        """Replace the relevance vector with the given one."""
        self.tagsRelevance = tagsRelevance

    def SetTagsList(self, tagsList):
        """Replace the list of tag names with the given one."""
        self.tagsList = tagsList

    def SetSumRatings(self, userRatings):
        """Overwrite the current rating value (the counters stay untouched)."""
        self.rating = userRatings

    def AddRatingCount(self, userRating):
        """Register one more user rating and refresh the mean rating."""
        self.ratingSum = self.ratingSum + userRating
        self.ratingCount = self.ratingCount + 1
        self.rating = self.ratingSum / self.ratingCount
    

**Create A List of Movies and a look up Table for the Genres for later usage.**

The genres could also be used for a similarity measurement, so I saved them just in case.
Also, I am not too sure whether genre 14 (IMAX) should really be included.

In [None]:

def CreateMovieList(lookUpGenre):
    """Load ``movies.csv`` and return a dict of Movie objects keyed by movie id.

    ``lookUpGenre`` is filled in place: every genre name that is not yet a key
    gets the next integer from a counter that starts at 1 on each call. Each
    movie stores its genres as the list of those integers, in the order they
    appear in the ``|``-separated ``genres`` column.
    """
    frame = pd.read_csv('movies.csv')
    nextGenreId = 1
    catalogue = {}

    rows = zip(
        frame['movieId'].to_numpy(),
        frame['title'].to_numpy(),
        frame['genres'].to_numpy(),
    )
    for movieId, title, genreField in rows:
        genreIds = []
        for genreName in genreField.split('|'):
            if genreName not in lookUpGenre:
                lookUpGenre[genreName] = nextGenreId
                nextGenreId += 1
            genreIds.append(lookUpGenre[genreName])

        catalogue[movieId] = Movie(movieId, title, genreIds)
    return catalogue
    

**Adds the genome score relevances to the movies**

The genome score is the relevance each tag has for a movie. We also have a CSV file mapping the tags to an index,
so I created a tags lookup table.

In [None]:
def CreateTagsLookUpTable():
    """Read ``genome-tags.csv`` and return a dict mapping each tagId to its tag name."""
    tagFrame = pd.read_csv('genome-tags.csv')
    return dict(zip(tagFrame['tagId'].to_numpy(), tagFrame['tag'].to_numpy()))

# Module-level tagId -> tag name table; AddGenoScoresToMovies reads this global.
tags_lookup_table = CreateTagsLookUpTable()


def AddGenoScoresToMovies(movieList):
    """Attach genome relevance scores and tag names to the movies in ``movieList``.

    Reads ``genome-scores.csv`` and treats every consecutive run of rows with
    the same ``movieId`` as one movie. For each run the movie receives the list
    of relevance values and the list of tag names (resolved through the
    module-level ``tags_lookup_table``), both in file order.

    Args:
        movieList: dict mapping movie id to Movie; modified in place.

    Raises:
        KeyError: if a movieId from the file is missing in ``movieList`` or a
            tagId is missing in ``tags_lookup_table``.
    """
    genomeScores = pd.read_csv('genome-scores.csv')
    if genomeScores.empty:
        # Nothing to attach; avoids indexing row 0 of an empty frame.
        return

    def _store(movieId, relevance, tags):
        # Both vectors are always written together, including for the last run.
        movieList[movieId].SetTagRelevanceVector(relevance)
        movieList[movieId].SetTagsList(tags)

    movieIds = genomeScores['movieId'].to_numpy()
    tagIds = genomeScores['tagId'].to_numpy()
    scores = genomeScores['relevance'].to_numpy()

    currentId = movieIds[0]
    relevance, tags = [], []
    for movieId, tagId, score in zip(movieIds, tagIds, scores):
        if movieId != currentId:
            _store(currentId, relevance, tags)
            currentId = movieId
            # Fresh lists, so the vectors already stored are never mutated.
            relevance, tags = [], []

        tags.append(tags_lookup_table[tagId])
        relevance.append(score)

    # Flush the final run; previously its tag list was never stored.
    _store(currentId, relevance, tags)
    return


# Sanity check: load the genome scores again and show the movie id and the
# relevance value of the very first row.
genomeScores = pd.read_csv('genome-scores.csv')
print(genomeScores['movieId'][0])
print(genomeScores['relevance'][0])

**Using all the Functions above**


Well, it takes a lot of time to add all the relevance scores...

In [None]:
# Build the movie catalogue (this also fills lookUpTableGenres with
# genre name -> numeric id), attach the genome scores to the movies, and keep
# a tagId -> tag name table under the name lookUpTags.
lookUpTableGenres = {}
movies = CreateMovieList(lookUpTableGenres)
AddGenoScoresToMovies(movies)
lookUpTags = CreateTagsLookUpTable()



**Use the tags as a similarity measurement vector.**

Get a sorted list whose size is given by the count. Uses the cosine similarity or the Jaccard similarity to calculate the tag similarity between movies.

In [None]:
# Show the tag names stored for the movie with id 1.
print(movies[1].tagsList)
def calc_cosine(movieOne, movieTwo):
    """Return the cosine similarity of two equally long numeric vectors.

    Args:
        movieOne: sequence of numbers.
        movieTwo: sequence of numbers with the same length as ``movieOne``.

    Returns:
        The cosine of the angle between the vectors as a float. ``0.0`` is
        returned when either vector has zero norm (including empty input),
        where the quotient would otherwise be 0/0.

    Raises:
        ValueError: if the two vectors differ in length.
    """
    first = np.asarray(movieOne, dtype=float)
    second = np.asarray(movieTwo, dtype=float)
    if first.shape != second.shape:
        raise ValueError("calc_cosine needs two vectors of the same length")

    denominator = np.sqrt(np.dot(first, first)) * np.sqrt(np.dot(second, second))
    if denominator == 0:
        # A zero vector has no direction; report "not similar" instead of nan.
        return 0.0
    return float(np.dot(first, second) / denominator)


def jaccard_sim(movie1, movie2):
    """Return the Jaccard similarity |A ∩ B| / |A ∪ B| of two collections.

    Both arguments are reduced to sets first, so duplicate entries do not
    influence the result. Elements must be hashable.

    Returns:
        A float in [0, 1]; ``0.0`` when both collections are empty.
    """
    first, second = set(movie1), set(movie2)
    union = first | second
    if not union:
        # Two empty collections: avoid dividing by zero.
        return 0.0
    return len(first & second) / len(union)


def GetTopTagsCosineSims(count, movies, query):
    """Return the movies whose tag relevance vectors are closest to the query's.

    Args:
        count: maximum number of entries to return.
        movies: dict mapping movie id to Movie.
        query: the Movie to compare against.

    Returns:
        ``-1`` if the query has no non-zero tag relevance; otherwise a dict
        ``{similarity: movie name}`` ordered from most to least similar. It
        holds at most ``count`` entries and fewer when fewer candidates exist.
        Because the similarity is the key, movies with exactly the same
        similarity share one entry (the last one seen wins).
    """
    # An all-zero vector means that no genome scores were attached.
    if not any(query.tagsRelevance):
        return -1

    scored = {}
    for movie in movies.values():
        if movie.name == query.name or not any(movie.tagsRelevance):
            continue
        scored[calc_cosine(movie.tagsRelevance, query.tagsRelevance)] = movie.name

    # One sort instead of repeated max()/remove(); slicing cannot overrun.
    best = sorted(scored, reverse=True)[:max(count, 0)]
    return {sim: scored[sim] for sim in best}

def get_top_tags_jaccard_sims(count, movies, query):
    """Return the movies with the highest Jaccard similarity to the query.

    The Jaccard similarity is computed by ``jaccard_sim`` on the raw
    ``tagsRelevance`` vectors of the two movies.

    Args:
        count: maximum number of entries to return.
        movies: dict mapping movie id to Movie.
        query: the Movie to compare against.

    Returns:
        ``-1`` if the query has no non-zero tag relevance; otherwise a dict
        ``{similarity: movie name}`` ordered from most to least similar. It
        holds at most ``count`` entries and fewer when fewer candidates exist.
        Movies with exactly the same similarity share one entry (the last one
        seen wins).
    """
    if not any(query.tagsRelevance):
        return -1

    scored = {}
    for movie in movies.values():
        if movie.name == query.name or not any(movie.tagsRelevance):
            continue
        scored[jaccard_sim(movie.tagsRelevance, query.tagsRelevance)] = movie.name

    # Slicing instead of indexing by range(count): no IndexError on short lists.
    best = sorted(scored, reverse=True)[:max(count, 0)]
    return {sim: scored[sim] for sim in best}


def PrintSimilarity(similarMovies):
    """Print a numbered ``name: similarity`` line per entry of the result dict.

    ``similarMovies`` is either a dict ``{similarity: movie name}`` or the
    sentinel ``-1``, for which only a notice is printed.
    """
    if similarMovies == -1:
        print("Query deosn't have tags!")
        return

    print("similar Movies: ")
    for rank, sim in enumerate(similarMovies, start=1):
        print(rank, similarMovies[sim] + ":", sim)


Get a sorted List with a given size depending on the count. Uses the Pearson Correlation Coefficient to calculate the tags similarity between movies.

In [None]:
def calcPearsonCoefficient(movie, query):
    """Return the Pearson correlation coefficient of the two vectors.

    The second value returned by ``scipy.stats.pearsonr`` is discarded.
    """
    result = pearsonr(movie, query)
    return result[0]

def GetTopTagsPearsonCorrelation(count, movies, query):
    """Rank the movies by the Pearson correlation of their tag relevance with the query's.

    Returns ``-1`` when the query's relevance vector contains 1128 zeros.
    Otherwise returns a dict ``{movie name: correlation}`` with the ``count``
    highest correlations, best first. Movies named like the query and movies
    whose relevance vector contains 1128 zeros are left out.
    """
    if query.tagsRelevance.count(0) == 1128:
        return -1

    scores = {}
    for candidate in movies.values():
        if candidate.name != query.name and candidate.tagsRelevance.count(0) != 1128:
            scores[candidate.name] = calcPearsonCoefficient(
                candidate.tagsRelevance, query.tagsRelevance
            )

    ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
    return dict(ranked[:count])

def PrintPearsonCorrelation(similarMovies):
    """Print the Pearson ranking produced by GetTopTagsPearsonCorrelation.

    ``similarMovies`` is either a dict ``{movie name: correlation}`` or the
    sentinel ``-1``. The headline uses the module-level ``query`` variable.
    """
    if similarMovies == -1:
        print("Query deosn't have tags!")
        return

    print("\nQuery: ", query.name)
    print("similar Movies according to Pearson Correlation Coefficient: ")
    for rank, title in enumerate(similarMovies, start=1):
        print(rank, str(title) + " : " + str(similarMovies[title]))

**Create a query and get/prints the movies with similar tags**

Uses the functions above to print the query result.

In [None]:
# Demo: take the movie with id 4896 as the query and print the 10 most similar
# movies according to each of the three tag-based measures.
query = movies[4896]
count = 10

# Cosine similarity on the tag relevance vectors.
similarMovies = GetTopTagsCosineSims(count, movies, query)
PrintSimilarity(similarMovies)

# Jaccard similarity on the tag relevance vectors.
similar_movies_with_jaccard = get_top_tags_jaccard_sims(count, movies, query)
PrintSimilarity(similar_movies_with_jaccard)

# Pearson correlation on the tag relevance vectors.
similarPearsonMovies = GetTopTagsPearsonCorrelation(count, movies, query)
PrintPearsonCorrelation(similarPearsonMovies)


# Fuzzy Title Matching - Levenshtein

Fuzzy matches the given title against all movie titles using the levenshtein distance

In [None]:
def fuzzyMatch(movies, user_title):
    """Return the best fuzzy match for ``user_title`` among all movie names.

    The names of all movies are handed to ``process.extractOne`` with
    ``fuzz.partial_ratio`` as the scorer; its result is returned unchanged.
    Callers in this file read index 0 as the matched title.
    """
    candidates = []
    for movie in movies.values():
        candidates.append(movie.name)
    best = process.extractOne(user_title, candidates, scorer=fuzz.partial_ratio)
    return best


def GetMovieByName(searchQuery):
    """Return the Movie whose name is the best fuzzy match for ``searchQuery``.

    Searches the module-level ``movies`` dict. If no movie carries the matched
    name, the movie with id 1 is returned instead.
    """
    match = fuzzyMatch(movies, searchQuery)
    fallback = movies[1]
    hits = (movie for movie in movies.values() if movie.name == match[0])
    return next(hits, fallback)

# Demo: fuzzy-match two misspelled queries against the movie titles.
matched_title = fuzzyMatch(movies, "Serch fo Spick")
print(matched_title)
print(GetMovieByName("pider human").name)

# Fuzzy Title Matching - TF-IDF w. Cosine Similarity
Fuzzy matches the given title against all movies title using tf-idf and the cosine similarity measure

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def getTitleTokens(title):
    """Return the overlapping character bigrams of the lower-cased title, in order."""
    lowered = title.lower()
    return [left + right for left, right in zip(lowered, lowered[1:])]

def getAllDistinctTitleTokens(titles):
    """Return the sorted list of all distinct bigram tokens found in ``titles``."""
    vocabulary = set()
    for title in titles:
        vocabulary |= set(getTitleTokens(title))
    return sorted(vocabulary)

def getTF(titles, title_tokens):
    """Count, for every token, the number of titles that contain it.

    Each title contributes at most 1 per token because its tokens are
    de-duplicated before counting. The result has one key per entry of
    ``title_tokens``, in that order.
    """
    counts = {token: 0 for token in title_tokens}
    for title in titles:
        for token in set(getTitleTokens(title)):
            counts[token] = counts[token] + 1
    return counts

def getIDF(termFrequencyDict, num_titles, title_tokens):
    """Return ``{token: log10(num_titles / count)}`` for every token in ``title_tokens``.

    ``count`` is the token's entry in ``termFrequencyDict``.
    """
    return {
        token: np.log10(num_titles / termFrequencyDict[token])
        for token in title_tokens
    }

def getTFIDF(title, title_tokens, idf_dict):
    """Build the weight vector of one title over the whole token vocabulary.

    Every distinct token of the title receives its IDF value once; all other
    tokens of ``title_tokens`` stay at 0.
    """
    weights = {token: 0 for token in title_tokens}
    for token in set(getTitleTokens(title)):
        weights[token] = weights[token] + idf_dict[token]
    return weights

def getMostSimilarTitles(similarty_dict, top_num):
    """Return the ``top_num`` entries with the highest values, best first."""
    ranked = sorted(similarty_dict.items(), key=lambda entry: entry[1], reverse=True)
    return dict(ranked[:top_num])

def cosineSimilarity(vector1, vector2):
    """Return the cosine similarity of two numeric iterables.

    The iterables are walked pairwise with ``zip``, so surplus elements of the
    longer one are ignored.

    Returns:
        dot(v1, v2) / (|v1| * |v2|), or ``0.0`` when either norm is zero
        (including empty input), where the quotient would otherwise be 0/0.
    """
    dot = norm1 = norm2 = 0
    for v1, v2 in zip(vector1, vector2):
        dot += v1 * v2
        norm1 += v1 ** 2
        norm2 += v2 ** 2

    # The product of two square roots is never negative, so no sign fix-up is needed.
    denominator = np.sqrt(norm1) * np.sqrt(norm2)
    if denominator == 0:
        return 0.0
    return dot / denominator
    

def fuzzyMatchTFIDF(user_title, titles):
    """Score every movie title against ``user_title`` with bigram TF-IDF and cosine similarity.

    Args:
        user_title: the (possibly misspelled) title typed by the user.
        titles: NOTE(review): not used. The candidate titles are always taken
            from the module-level ``movies`` dict; kept for compatibility.

    Returns:
        dict ``{movie title: cosine similarity to user_title}`` with one entry
        per distinct movie title. A movie whose title equals ``user_title``
        stays in the result.
    """
    candidate_titles = {movies[mID].name for mID in movies}
    # The query is part of the corpus, so each of its bigrams has an IDF entry.
    # The candidates are kept as a separate set so that a query identical to an
    # existing title does not remove that title from the results.
    corpus = candidate_titles | {user_title}
    title_tokens = getAllDistinctTitleTokens(corpus)

    tf_dict = getTF(corpus, title_tokens)
    idf_dict = getIDF(tf_dict, len(corpus), title_tokens)

    # The query vector adds the IDF once per bigram occurrence, not per distinct bigram.
    user_title_dict = dict.fromkeys(title_tokens, 0)
    for token in getTitleTokens(user_title):
        user_title_dict[token] += idf_dict[token]
    # Loop-invariant: build the query row only once.
    user_vector = [list(user_title_dict.values())]

    similarity_dict = dict()
    for title in candidate_titles:
        title_vector = [list(getTFIDF(title, title_tokens, idf_dict).values())]
        similarity_dict[title] = cosine_similarity(title_vector, user_vector)[0][0]

    return similarity_dict

# Demo: score all movie titles against a misspelled query and show the 10 best.
similarity_dict = fuzzyMatchTFIDF("Ded and living", movies)
top_similar_titles = getMostSimilarTitles(similarity_dict, 10)
print(top_similar_titles)


# User Struct

A struct that contains the user's ratings and also the tags the user chose for a movie.

In [None]:
class User:
    """A user with the ratings and tags they assigned to movies.

    Attributes:
        id: the user id.
        movieRatings: dict mapping a movie to the rating the user gave it.
        movieTags: dict mapping a movie to the list of tags the user gave it.
        genomMovieTags: dict mapping a movie to the ids of those user tags
            that also exist in the genome tag table (see UpdateGenomTags).
    """

    def __init__(self, id):
        self.id = id
        self.movieRatings = {}
        self.movieTags = {}
        self.genomMovieTags = {}

    def AddRating(self, movie, rating):
        """Store (or overwrite) the user's rating for ``movie``."""
        self.movieRatings[movie] = rating

    def AddTags(self, movie, tag):
        """Append ``tag`` to the tags the user gave ``movie``."""
        if movie not in self.movieTags:
            self.movieTags[movie] = []
        self.movieTags[movie].append(tag)

    def UpdateGenomTags(self, lookUpTableGenom):
        """Translate the user's tags into genome tag ids.

        Args:
            lookUpTableGenom: dict mapping tag id to tag name.

        For every tagged movie, ``genomMovieTags[movie]`` becomes the list of
        ids whose name equals one of the user's tags, in the order of the
        user's tags. Tags that are not in the table are dropped.
        """
        # Invert the table once instead of scanning it for every single tag.
        idsByName = {}
        for tagId, tagName in lookUpTableGenom.items():
            idsByName.setdefault(tagName, []).append(tagId)

        for movie, tags in self.movieTags.items():
            inside = []
            for tag in tags:
                inside.extend(idsByName.get(tag, ()))
            self.genomMovieTags[movie] = inside


**Creates, for each user, a list of every movie rated by that user, then calculates the overall movie ratings**

takes 5 minutes...

In [None]:
# userId,movieId,rating,timestamp
def CreateUsers(allMovies):
    """Load ``ratings.csv`` and build one User per user id.

    Every rating is stored on the user (``AddRating``) and also folded into the
    rated movie's running statistics (``AddRatingCount``).

    Args:
        allMovies: dict mapping movie id to Movie; the movies are updated in place.

    Returns:
        dict mapping user id to User.

    Raises:
        KeyError: if a rated movieId is missing in ``allMovies``.
    """
    ratings = pd.read_csv('ratings.csv')
    UserDic = {}

    # Walking the raw columns avoids a per-cell label lookup on every row.
    rows = zip(
        ratings['userId'].to_numpy(),
        ratings['movieId'].to_numpy(),
        ratings['rating'].to_numpy(),
    )
    for userId, movieId, rating in rows:
        movie = allMovies[movieId]

        # Reuse an existing User, so rows of one user need not be contiguous.
        user = UserDic.get(userId)
        if user is None:
            user = UserDic[userId] = User(userId)

        user.AddRating(movie, rating)
        movie.AddRatingCount(rating)
    return UserDic

# Build all users; this also fills in the rating statistics of `movies`.
users = CreateUsers(movies)


In [None]:
# Spot check for user 7045 and movie 4896.
# Presumably the ratings.csv row is "7045,4896,4.0" (userId, movieId, rating) - verify.
print(movies[4896].name)
print("Movie Rating:", movies[4896].rating)
print("Random User Rating: ", users[7045].movieRatings[movies[4896]])


**Adding the tags that users assigned to movies (not every rated movie has a tag, and vice versa)**

In [None]:
#userId,movieId,tag,timestamp
def UserDefinedTag(allUsers, movies):
    """Load ``tags.csv`` and record every tag on the user who assigned it.

    Args:
        allUsers: dict mapping user id to User; the users are updated in place.
        movies: dict mapping movie id to Movie.

    Raises:
        KeyError: if a userId or movieId from the file is missing in
            ``allUsers`` or ``movies``.
    """
    userTags = pd.read_csv('tags.csv')

    # Plain dict lookups per row: no stale cached user/movie and no special
    # handling of the first row, so an empty file is simply a no-op.
    rows = zip(
        userTags['userId'].to_numpy(),
        userTags['movieId'].to_numpy(),
        userTags['tag'].to_numpy(),
    )
    for userId, movieId, tag in rows:
        allUsers[userId].AddTags(movies[movieId], tag)
       
# Attach the tags from tags.csv to the users built above.
UserDefinedTag(users, movies) 


**Adding the Genom tags in numerical form**

In [None]:
def SetGenomTags(allUsers):
    """Call ``UpdateGenomTags`` with the module-level ``lookUpTags`` table on every user."""
    for user in allUsers.values():
        user.UpdateGenomTags(lookUpTags)

# Resolve the tags of all users to genome tag ids.
SetGenomTags(users)

# !!! Important !!!

**Some user-defined tags are not among the 1128 genome tags.** Example: [Hayao Miyazaki, Miyazaki]

Should we create a new table for all tags, or should we just convert them into numbers and work only with the 1128-tag table?

In [None]:
# print("Movie:", movies[7099].name)

def CheckIfInLoopUpTable(genresToCheck):
    """Report which of the given tags exist in the module-level ``lookUpTags`` table.

    Prints the input, the tags found, their numeric ids and the tags not
    found, then returns the list of tags found.
    """
    inside, keys = [], []
    for wanted in genresToCheck:
        for tagId, tagName in lookUpTags.items():
            if tagName == wanted:
                inside.append(wanted)
                keys.append(tagId)

    outside = list(set(genresToCheck) - set(inside))
    print("Genres:", genresToCheck)
    print("Inside:", inside)
    print("Inside (numeric):", keys)
    print("Not inside:", outside)
    return inside

# Demo: compare the tags user 19 gave movie 7099 with the genome tag table,
# then show the genome tag ids stored for that user/movie pair.
CheckIfInLoopUpTable(list(users[19].movieTags[movies[7099]]))
print("Genom Tags(inside):", users[19].genomMovieTags[movies[7099]])

# TODOS
* search depending on tags
* search depending on title similarity, which then searches for a similar movie
* search query for getting a ranking of the best movies in a specific genre (there are 19 genres (readme)!!)
* beim ML teil können wir die filme die ein User gerated hat und ein genom tag haben aufteilen in test und training und predicten wie der score sein soll
* jaccard similarity machen mit tags (brauchen die tags zu den einzelnen filmen) und dann 0 oder 1.
* Powerpoint presentation machen

Tags class and lookup table for tag name and id

In [None]:

class Tags:
    """A genome tag: its id, its name and a relevance vector of 1128 zeros to start with."""

    def __init__(self, tag_id, tag_name):
        """Create the tag with an all-zero relevance vector."""
        self.tags_relevance = [0 for _ in range(1128)]
        self.tag_id, self.tag_name = tag_id, tag_name

    def set_tag_relevance_vector(self, tags_relevance):
        """Replace the relevance vector with the given one."""
        self.tags_relevance = tags_relevance



def create_tag_list():
    """Read ``genome-tags.csv`` and return a dict mapping each tag name to a Tags object."""
    genome_tags = pd.read_csv('genome-tags.csv')
    pairs = zip(genome_tags['tagId'].to_numpy(), genome_tags['tag'].to_numpy())
    return {tag_name: Tags(tag_id, tag_name) for tag_id, tag_name in pairs}


def create_tags_id_lookup_table():
    """Read ``genome-tags.csv`` and return a dict mapping each tag name to its tagId."""
    tag_frame = pd.read_csv('genome-tags.csv')
    return dict(zip(tag_frame['tag'].to_numpy(), tag_frame['tagId'].to_numpy()))


# Module-level tables: tag name -> tagId (read by user_tag_search) and
# tag name -> Tags object.
lookup_table_tags = create_tags_id_lookup_table()
tags_list = create_tag_list()




In [None]:
def user_tag_search(tags_test, movies, count):
    """Recommend movies whose genome tag relevance best matches the given tags.

    A 0/1 query vector with a 1 at every requested tag is compared to each
    movie's relevance vector with ``calc_cosine``.

    Args:
        tags_test: iterable of genome tag names.
        movies: dict mapping movie id to Movie.
        count: maximum number of entries to return.

    Returns:
        ``-1`` if ``tags_test`` is empty; otherwise a dict
        ``{similarity: movie name}`` ordered from most to least similar. It
        holds at most ``count`` entries and fewer when fewer candidates exist.
        Movies with exactly the same similarity share one entry (the last one
        seen wins).

    Raises:
        KeyError: if a tag is not a key of the module-level ``lookup_table_tags``.
    """
    tags_counter_vector = [0] * 1128

    for tag in tags_test:
        # Assumes tagIds run from 1 to 1128 (as in the MovieLens tag genome),
        # while the relevance vectors are 0-based lists in tagId order.
        tags_counter_vector[lookup_table_tags[tag] - 1] = 1

    if not any(tags_counter_vector):
        # No tag requested: every similarity would be undefined.
        return -1

    scored = {}
    for movie in movies.values():
        if not any(movie.tagsRelevance):
            continue
        scored[calc_cosine(movie.tagsRelevance, tags_counter_vector)] = movie.name

    # Slicing instead of indexing by range(count): no IndexError on short lists.
    best = sorted(scored, reverse=True)[:max(count, 0)]
    return {sim: scored[sim] for sim in best}



User can use the function user_tag_search to insert genome tags and get movie recommendations as a result

In [None]:

# Demo: recommend 10 movies for the genome tags "action" and "action packed".
user_tag_result = user_tag_search(["action", "action packed"], movies, 10)
PrintSimilarity(user_tag_result)

**Getting the top-rated movies in the specified genres; a minimum number of user ratings can be set**

In [None]:
def GetMoviesWithGenre(movies, querys):
    """Return the movies whose genres are exactly the queried genres.

    The genre names in ``querys`` are translated to numeric ids through the
    module-level ``lookUpTableGenres``. A movie qualifies when it contains
    every queried id and has as many genres as ``querys`` has entries. With an
    empty ``querys`` every movie qualifies.
    """
    wantedCount = len(querys)
    wantedIds = [lookUpTableGenres[genre] for genre in querys]

    def matches(movie):
        return all(
            genreId in movie.genres and len(movie.genres) == wantedCount
            for genreId in wantedIds
        )

    return [movie for movie in movies.values() if matches(movie)]


def GetHighestRanking(count, minRatingCount, genres):
    """Return the best-rated movies that have exactly the requested genres.

    Each entry of ``genres`` is fuzzy-matched against the known genre names
    (the keys of the module-level ``lookUpTableGenres``). Matches scoring
    below 80 are dropped. If nothing matches, or '(no genres listed)' is among
    the matches, the search falls back to '(no genres listed)'.

    Args:
        count: maximum number of movies to return.
        minRatingCount: minimum number of user ratings a movie must have.
        genres: iterable of (possibly misspelled) genre names.

    Returns:
        List of Movie objects sorted by rating, best first. Movies with equal
        rating keep the order in which ``GetMoviesWithGenre`` returned them.
    """
    genreNames = list(lookUpTableGenres)

    queryGenre = []
    for genre in genres:
        titelSearch = process.extractOne(genre, genreNames, scorer=fuzz.partial_ratio)
        if titelSearch[1] < 80:
            continue
        # Two inputs may resolve to the same genre. A duplicate would inflate
        # the genre count that GetMoviesWithGenre requires to match exactly.
        if titelSearch[0] not in queryGenre:
            queryGenre.append(titelSearch[0])

    if len(queryGenre) == 0 or '(no genres listed)' in queryGenre:
        queryGenre = ['(no genres listed)']

    prestr = "[" + ", ".join(
        f"{name}({lookUpTableGenres[name]})" for name in queryGenre
    ) + "]"

    print(f"Searching for the {count} best {prestr} movies with at least {minRatingCount} User-ratings:")
    eligible = [
        movie
        for movie in GetMoviesWithGenre(movies, queryGenre)
        if movie.ratingCount >= minRatingCount
    ]

    # The stable sort keeps the original order among movies with equal rating.
    eligible.sort(key=lambda movie: movie.rating, reverse=True)
    return eligible[:max(count, 0)]

def getGenreID(name):
    """Return the numeric id of the genre called ``name``.

    ``lookUpTableGenres`` maps genre name to numeric id. For backward
    compatibility, a value that is not a genre name but equals one of the
    numeric ids is resolved the other way round, returning the genre name.

    Returns:
        The id (or, for the legacy reverse lookup, the name), or ``-1`` when
        nothing matches.
    """
    if name in lookUpTableGenres:
        return lookUpTableGenres[name]

    # Legacy behaviour: reverse lookup from numeric id to genre name.
    for genreName, genreId in lookUpTableGenres.items():
        if name == genreId:
            return genreName
    return -1


Pass the maximum number of results, then the minimum number of user ratings a movie must have, and then the genres.

In [None]:
# Demo: the 10 best movies with at least 10 ratings for three misspelled genre
# names, printed as a numbered list.
movieRanked = GetHighestRanking(10, 10, ['fantay' , 'harror', 'come'])

index = 1
for item in movieRanked:
    print(f"{index}. Titel: {item.name} | Users: {item.ratingCount} | Genres: {item.genres} | Rating {item.rating:.2f}")
    index += 1


**Adds the User tags to the Movies**

In [1]:

def create_movie_id_tags_lookup_table():
    """Read ``tags.csv`` and collect the user tags of every movie.

    Returns:
        dict mapping each movieId to the list of tags users assigned to it, in
        file order (duplicates are kept). Returning the dict itself, rather
        than only a set of its keys, is what lets callers look up
        ``table[movieId]``.
    """
    tags = pd.read_csv('tags.csv')
    tags_lookup_table = {}
    for movie_id, tag in zip(tags['movieId'].to_numpy(), tags['tag'].to_numpy()):
        tags_lookup_table.setdefault(movie_id, []).append(tag)

    return tags_lookup_table


# NOTE(review): this rebinds lookup_table_tags, which earlier in the file held
# the tag name -> tagId table that user_tag_search reads.
lookup_table_tags = create_movie_id_tags_lookup_table()

# Show the distinct user tags of the movie with id 260.
print(set(lookup_table_tags[260]))


NameError: name 'pd' is not defined

**Similarity between movies using Jaccard similarity with the User Tags**

In [None]:
def get_top_tags_jaccard_sims_with_user_tags(count, movies, query):
    """Rank the movies by the Jaccard similarity of their user tags with the query's.

    The user tags are read from the module-level ``lookup_table_tags``, which
    is indexed by movie id.

    Args:
        count: maximum number of entries to return.
        movies: dict mapping movie id to Movie.
        query: the Movie to compare against.

    Returns:
        ``-1`` if the query movie has no entry in ``lookup_table_tags``;
        otherwise a dict ``{similarity: movie name}`` ordered from most to
        least similar. It holds at most ``count`` entries and fewer when fewer
        candidates exist. Movies with exactly the same similarity share one
        entry (the last one seen wins).
    """
    # Loop-invariant check, done once. Same sentinel as the other searches.
    if query.id not in lookup_table_tags:
        return -1
    query_tags = lookup_table_tags[query.id]

    scored = {}
    for mID, movie in movies.items():
        if movie.name == query.name or mID not in lookup_table_tags:
            continue
        scored[jaccard_sim(lookup_table_tags[mID], query_tags)] = movie.name

    # Slicing instead of indexing by range(count): no IndexError on short lists.
    best = sorted(scored, reverse=True)[:max(count, 0)]
    return {sim: scored[sim] for sim in best}

# Demo: the 10 movies whose user tags overlap most with those of movie 4896.
count = 10
query = movies[4896] #harry potter
similar_movies_with_jaccard = get_top_tags_jaccard_sims_with_user_tags(count, movies, query)
PrintSimilarity(similar_movies_with_jaccard)