# Advanced Information Retrieval - Project

### Organizational infos:
* 30 Points in total
* Deadline: 09.01.2023 23:59

In [5]:
#imports

import pandas as pd
import numpy as np
import random
import ast
import gensim
from torch import nn
import torch
from nltk.tokenize import WhitespaceTokenizer
from scipy.stats import pearsonr
from thefuzz import fuzz
from thefuzz import process



## Movie struct
Create a struct that contains all  the movie relevant Information to work with.
Could be changed at any given time.


**Define all relevante class attributes**

In [6]:
class Movie:
    def __init__(self, id, name, genres):
        self.id = id
        self.name = name
        self.genres = genres
        self.tagsRelevance = [0] * 1128
        self.tagsList = []
        self.ratingCount = 0
        self.ratingSum = 0
        self.rating = 0

    def SetTagRelevanceVector(self, tagsRelevance):
        self.tagsRelevance = tagsRelevance

    def SetTagsList(self, tagsList):
        self.tagsList = tagsList
    
    def SetSumRatings(self, userRatings):
        self.rating = userRatings
    
    def AddRatingCount(self, userRating):
        self.ratingCount += 1
        self.ratingSum += userRating
        self.rating = self.ratingSum / self.ratingCount
    

**Create A List of Movies and a look up Table for the Genres for later usage.**

The Genres could also be used for a similarity measurement so i just saved the just in case.
Also i am not to sure if genre-14 (Imax) should really be inluded. 

In [7]:

def CreateMovieList(lookUpGenre):
    allMovies = {}
    moviedocs = pd.read_csv('movies.csv')
    lookUpIndex = 1

    for i in range(len(moviedocs)):
        movieGenres = moviedocs['genres'][i]
        movieGenresList = movieGenres.split('|')
        movieGenresNumericalList = []
        for word in movieGenresList:
            if word not in lookUpGenre:
                lookUpGenre[word] = lookUpIndex
                lookUpIndex += 1

            movieGenresNumericalList.append(lookUpGenre[word])
        movie = Movie(moviedocs['movieId'][i], moviedocs['title'][i], movieGenresNumericalList)

        allMovies[movie.id] = movie
    return allMovies
    

**Adds the Genom Score relevances to the Movies**

The genom score is the relevances the tags have to the movies. We also have a csv file mapping the tags to an index.
So i created a tags look up table.

In [8]:
def CreateTagsLookUpTable():
    genres = pd.read_csv('genome-tags.csv')
    tagsLookUpTable = {}
    for index in range(len(genres)):
        tagsLookUpTable[genres['tagId'][index]] = genres['tag'][index]
        index += 1
    return tagsLookUpTable

tags_lookup_table = CreateTagsLookUpTable()


def AddGenoScoresToMovies(movieList):
    genomeScores = pd.read_csv('genome-scores.csv')
    movieTagsRelevance = []
    tags = []
    movieId = genomeScores['movieId'][0]
    for i in range(len(genomeScores)):
        if genomeScores['movieId'][i] != movieId:
            movieList[movieId].SetTagRelevanceVector(movieTagsRelevance.copy())
            movieList[movieId].SetTagsList(tags.copy())
            movieId = genomeScores['movieId'][i]
            movieTagsRelevance.clear()
            tags.clear()

        tags.append(tags_lookup_table[genomeScores["tagId"][i]])
        movieTagsRelevance.append(genomeScores['relevance'][i])
    movieList[movieId].SetTagRelevanceVector(movieTagsRelevance)
    return


genomeScores = pd.read_csv('genome-scores.csv')
print(genomeScores['movieId'][0])
print(genomeScores['relevance'][0])

1
0.0287499999999999


**Using all the Functions above**


well it's not takes a lot of time to add all the relevance scores...

In [9]:
lookUpTableGenres = {}
movies = CreateMovieList(lookUpTableGenres)
AddGenoScoresToMovies(movies)
lookUpTags = CreateTagsLookUpTable()



**Use the tags as a similarity measurement vector.**

Get a sorted List with a given size depending on the count. Uses the Cosine similiarity or Jaccard similarity to calculate the tags similarity between movies.

In [10]:
print(movies[1].tagsList)
def calc_cosine(movieOne, movieTwo):

    d_1 = 0
    d_2 = 0
    numerator = 0
    for index in range(len(movieOne)):
        numerator += movieOne[index] * movieTwo[index]
        d_1 += movieOne[index] * movieOne[index]
        d_2 += movieTwo[index] * movieTwo[index]

    denominator = np.sqrt(d_1) * np.sqrt(d_2)
    res = numerator / denominator
    return res


def jaccard_sim(movie1,movie2):
    number_both=set(movie1).intersection(set(movie2))
    number_oberservations_in_either = len(movie1)+len(movie2)-len(number_both)
    similarity= float(len(number_both)) / number_oberservations_in_either
    return similarity


def GetTopTagsCosineSims(count, movies, query):

    #empty
    if query.tagsRelevance.count(0) == 1128:
        return -1
        
    smiliarMovies = {}
    simValues = []
    for mID in movies:
        if movies[mID].name == query.name or movies[mID].tagsRelevance.count(0) == 1128:
            continue
        sim = calc_cosine(movies[mID].tagsRelevance, query.tagsRelevance)
        simValues.append(sim)
        smiliarMovies[sim] = movies[mID].name

    TopSimilar = {}


    for i in range(count):
        v = max(simValues)
        TopSimilar[v] = smiliarMovies[v]
        simValues.remove(v)
    return TopSimilar

def get_top_tags_jaccard_sims(count, movies, query):

    if query.tagsRelevance.count(0) == 1128:
        return -1

    smiliar_movies = {}
    sim_values = []
    for mID in movies:
        if movies[mID].name == query.name or movies[mID].tagsRelevance.count(0) == 1128:
            continue
        sim = jaccard_sim(movies[mID].tagsRelevance, query.tagsRelevance)
        sim_values.append(sim)
        smiliar_movies[sim] = movies[mID].name

    top_similar = {}

    sim_values.sort(reverse=True)

    for i in range(count):
        top_similar[sim_values[i]] = smiliar_movies[sim_values[i]]
    return top_similar


def PrintSimilarity(similarMovies):
    if similarMovies != -1:
        #print("Query: ", query.name)
        print("similar Movies: ")
        count = 1
        for sim in similarMovies:
            print(count, similarMovies[sim] + ":", sim)
            count += 1
    else:
        print("Query deosn't have tags!")


['007', '007 (series)', '18th century', '1920s', '1930s', '1950s', '1960s', '1970s', '1980s', '19th century', '3d', '70mm', '80s', '9/11', 'aardman', 'aardman studios', 'abortion', 'absurd', 'action', 'action packed', 'adaptation', 'adapted from:book', 'adapted from:comic', 'adapted from:game', 'addiction', 'adolescence', 'adoption', 'adultery', 'adventure', 'affectionate', 'afi 100', 'afi 100 (laughs)', 'afi 100 (movie quotes)', 'africa', 'afterlife', 'aging', 'aids', 'airplane', 'airport', 'alaska', 'alcatraz', 'alcoholism', 'alien', 'alien invasion', 'aliens', 'allegory', 'almodovar', 'alone in the world', 'alter ego', 'alternate endings', 'alternate history', 'alternate reality', 'alternate universe', 'amazing cinematography', 'amazing photography', 'american civil war', 'amnesia', 'amy smart', 'android(s)/cyborg(s)', 'androids', 'animal movie', 'animals', 'animated', 'animation', 'anime', 'antarctica', 'anti-hero', 'anti-semitism', 'anti-war', 'apocalypse', 'archaeology', 'argenti

Get a sorted List with a given size depending on the count. Uses the Pearson Correlation Coefficient to calculate the tags similarity between movies.

In [11]:
def calcPearsonCoefficient(movie, query):
    correlation, _ = pearsonr(movie, query)
    return correlation

def GetTopTagsPearsonCorrelation(count, movies, query):

    if query.tagsRelevance.count(0) == 1128:
        return -1

    similarity = dict()
    for mID in movies:
        if movies[mID].name == query.name or movies[mID].tagsRelevance.count(0) == 1128:
            continue
        sim = calcPearsonCoefficient(movies[mID].tagsRelevance, query.tagsRelevance)
        similarity[movies[mID].name] = sim

    top_movies = dict()
    for key in sorted(similarity, key=similarity.get, reverse=True)[:count]:
        top_movies[key] = similarity[key]

    return top_movies

def PrintPearsonCorrelation(similarMovies):
    if similarMovies != -1:
        print("\nQuery: ", query.name)
        print("similar Movies according to Pearson Correlation Coefficient: ")
        for idx, movie_key in enumerate(similarMovies):
            print(idx + 1, str(movie_key) + " : " + str(similarMovies[movie_key]))
    else:
        print("Query deosn't have tags!")

**Create a query and get/prints the movies with similar tags**

Uses the functions above to print the query result.

In [12]:
query = movies[4896]
count = 10

similarMovies = GetTopTagsCosineSims(count, movies, query)
PrintSimilarity(similarMovies)

similar_movies_with_jaccard = get_top_tags_jaccard_sims(count, movies, query)
PrintSimilarity(similar_movies_with_jaccard)

similarPearsonMovies = GetTopTagsPearsonCorrelation(count, movies, query)
PrintPearsonCorrelation(similarPearsonMovies)


Query:  Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)
similar Movies: 
1 Harry Potter and the Chamber of Secrets (2002): 0.977978451828736
2 Harry Potter and the Goblet of Fire (2005): 0.9697791461817978
3 Harry Potter and the Prisoner of Azkaban (2004): 0.9688240598617508
4 Harry Potter and the Order of the Phoenix (2007): 0.9637341256626571
5 Harry Potter and the Deathly Hallows: Part 1 (2010): 0.9437881725876737
6 Harry Potter and the Half-Blood Prince (2009): 0.9437512850141854
7 Harry Potter and the Deathly Hallows: Part 2 (2011): 0.9359784057024187
8 Chronicles of Narnia: The Lion, the Witch and the Wardrobe, The (2005): 0.8976148003424176
9 Spiderwick Chronicles, The (2008): 0.8844944457999436
10 Chronicles of Narnia: Prince Caspian, The (2008): 0.8802772014082358
Query:  Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)
similar Movies: 
1 Bird Box (2018): 0.2
2 The Hustle (2019): 0.199

# Fuzzy Title Matching - Levenshtein

Fuzzy matches the given title against all movie titles using the levenshtein distance

In [13]:
def fuzzyMatch(movies, user_title):
    movie_titles = [movies[mID].name for mID in movies]
    return process.extractOne(user_title, movie_titles, scorer=fuzz.partial_ratio)

matched_title = fuzzyMatch(movies, "Serch fo Spick")
print(matched_title)

('Star Trek III: The Search for Spock (1984)', 79)


# User Struct

A struct that contains the user ratings and also the tags the user choosed for a movie.

In [14]:
class User:
    def __init__(self, id):
        self.id = id
        self.movieRatings = {}
        self.movieTags = {}
        self.genomMovieTags = {}
    
    def AddRating(self, movie, rating):
        self.movieRatings[movie] = rating
    
    def AddTags(self, movie, tag):
        if movie not in self.movieTags:
            self.movieTags[movie] = []
        self.movieTags[movie].append(tag)

    def UpdateGenomTags(self, lookUpTableGenom):
        for movie in self.movieTags:
            inside = list()
            for tag in self.movieTags[movie]:
                for values in lookUpTags:
                    if lookUpTags[values] == tag:
                        inside.append(values)
                        
            self.genomMovieTags[movie] = inside


**Creates a list in User with every movie ranked by the User the calculates the generel movie ratings**

takes 5 minutes...

In [15]:
# userId,movieId,rating,timestamp
def CreateUsers(allMovies):
    ratings = pd.read_csv('ratings.csv')
    UserDic = {}
    userId = -1

    for i in range(len(ratings)):
        movie = allMovies[ratings['movieId'][i]]
        userID = ratings['userId'][i]
        ratingsRow = ratings['rating'][i]

        if userID != userId:
            userId = userID
            newUser = User(userId)      
            UserDic[newUser.id] = newUser

        newUser.AddRating(movie, ratingsRow)
        movie.AddRatingCount(ratingsRow)
    return UserDic

users = CreateUsers(movies)


In [16]:
# 7045,4896,4.0
print(movies[4896].name)
print("Movie Rating:", movies[4896].rating)
print("Random User Rating: ", users[7045].movieRatings[movies[4896]])


Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)
Movie Rating: 3.678158491145853
Random User Rating:  4.0


**Addind tags that user rated on movies (not every rated movie has a tag vice versa)**

In [17]:
#userId,movieId,tag,timestamp
def UserDefinedTag(allUsers, movies):
    userTags = pd.read_csv('tags.csv')

    activeUser = allUsers[userTags['userId'][0]]
    
    movie = movies[userTags['movieId'][0]]
    for i in range(len(userTags)):
        if activeUser.id != userTags['userId'][i]:
            activeUser = allUsers[userTags['userId'][i]]

        if movie.id != userTags['movieId'][i]:
            movie = movies[userTags['movieId'][i]]

        activeUser.AddTags(movie, userTags['tag'][i])
       
UserDefinedTag(users, movies) 


**Adding the Genom tags in numerical form**

In [18]:
def SetGenomTags(allUsers):
    for userId in allUsers:
        allUsers[userId].UpdateGenomTags(lookUpTags)

SetGenomTags(users)

# !!! Important !!!

**There are some Userdefined tags that arent in the 1128 tags** Example [Hayao Miyazaki,Miyazaki]

Should we create a new Table for all Tags? Or should we just change them into numbers and only work with the 1128 table?

In [19]:
# print("Movie:", movies[7099].name)

def CheckIfInLoopUpTable(genresToCheck):
    inside = list()
    keys = []
    for item in genresToCheck:
        for values in lookUpTags:
            if lookUpTags[values] == item:
                inside.append(item)
                keys.append(values)

    outside = list(set(genresToCheck) - set(inside))
    print("Genres:", genresToCheck)
    print("Inside:", inside)
    print("Inside (numeric):", keys)
    print("Not inside:", outside)
    return inside

CheckIfInLoopUpTable(list(users[19].movieTags[movies[7099]]))
print("Genom Tags(inside):", users[19].genomMovieTags[movies[7099]])

Genres: ['adventure', 'anime', 'ecology', 'fantasy', 'Hayao Miyazaki', 'Miyazaki', 'post-apocalyptic']
Inside: ['adventure', 'anime', 'ecology', 'fantasy', 'post-apocalyptic']
Inside (numeric): [29, 65, 342, 377, 803]
Not inside: ['Miyazaki', 'Hayao Miyazaki']
Genom Tags(inside): [29, 65, 342, 377, 803]


# TODOS
* search depending on tags
* search depending on title similarity with than searches for a similar movie
* seach query for getting ranking based best movie in specific genre (There are 19 genres (readme)!!)
* beim ML teil können wir die filme die ein User gerated hat und ein genom tag haben aufteilen in test und training und predicten wie der score sein soll
* jaccard similarity machen mit tags (brauchen die tags zu den einzelnen filmen) und dann 0 oder 1.
* Powerpoint presentation machen

Tags class and lookup table for tag name and id

In [20]:

class Tags:
    def __init__(self, tag_id, tag_name):
        self.tags_relevance = [0] * 1128
        self.tag_id = tag_id
        self.tag_name = tag_name

    def set_tag_relevance_vector(self, tags_relevance):
        self.tags_relevance = tags_relevance



def create_tag_list():
    tags_dict = {}
    genome_tags = pd.read_csv('genome-tags.csv')

    for i in range(len(genome_tags)):

        tag = Tags(genome_tags['tagId'][i], genome_tags['tag'][i])
        tags_dict[tag.tag_name] = tag

    return tags_dict


def create_tags_id_lookup_table():
    genres = pd.read_csv('genome-tags.csv')
    tags_lookup_table = {}
    for index in range(len(genres)):
        tags_lookup_table[genres['tag'][index]] = genres['tagId'][index]
        index += 1
    return tags_lookup_table


lookup_table_tags = create_tags_id_lookup_table()
tags_list = create_tag_list()




In [26]:
def user_tag_search(tags_test, movies, count):

    tags_counter_vector = [0] * 1128

    for tag in tags_test:
        tags_counter_vector[lookup_table_tags[tag]] = 1

    similar_movies = {}
    sim_values = []
    for mID in movies:
        if movies[mID].tagsRelevance.count(0) == 1128:
            continue
        sim = calc_cosine(movies[mID].tagsRelevance, tags_counter_vector)
        sim_values.append(sim)
        similar_movies[sim] = movies[mID].name

    top_similar = {}
    sim_values.sort(reverse=True)

    for i in range(count):
        top_similar[sim_values[i]] = similar_movies[sim_values[i]]
    return top_similar



{0.19079710782699305: 'Chinese Zodiac (Armour of God III) (CZ12) (2012)', 0.19050150367796725: 'Drop Zone (1994)', 0.18538631781912157: 'Sudden Death (1995)', 0.18329967839226147: 'Braddock: Missing in Action III (1988)', 0.1801669051334233: 'Yes, Madam (a.k.a. Police Assassins) (a.k.a. In the Line of Duty 2) (Huang gu shi jie) (1985)', 0.17750807765539783: 'Under Siege 2: Dark Territory (1995)', 0.17698470226297763: 'Passenger 57 (1992)', 0.17448684498089345: '12 Rounds (2009)', 0.1735743151350222: 'Terminal Velocity (1994)', 0.17329838519968369: 'Momentum (2015)'}


User can use the function user_tag_search to insert genome tags and get movie recommendations as a result

In [33]:

user_tag_result = user_tag_search(["action", "action packed"], movies, 10)
PrintSimilarity(user_tag_result)

Query:  Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)
similar Movies: 
1 Chinese Zodiac (Armour of God III) (CZ12) (2012): 0.18733320883806595
2 Tom and Huck (1995): 0.17766026440448682
3 Patriot, The (1998): 0.1752875931207842
4 Three Musketeers, The (2011): 0.17522993506272752
5 Punisher, The (1989): 0.17487741593470277
6 Drop Zone (1994): 0.17060262204576068
7 Braddock: Missing in Action III (1988): 0.16947919499357886
8 Pinocchio (2002): 0.16857685215785298
9 Yes, Madam (a.k.a. Police Assassins) (a.k.a. In the Line of Duty 2) (Huang gu shi jie) (1985): 0.16717386334646753
10 Three Musketeers, The (1993): 0.16694866466481667
