In [1]:
import csv
import io
import itertools

# Movie attributes: each constant is the index of a column in a row of the
# metadata csv (used as line[CONSTANT] by read_csv).
# NOTE: LENGUAGE and the *_IMBD_* names are misspelled, but they are kept as-is
# because read_csv refers to them by these exact names.
MOVIE_TITLE = 11
DIRECTOR_NAME = 1
TITLE_YEAR = 23
DURATION = 3
GROSS = 8 
GENRES = 9
PLOT_KEYWORDS = 16
LENGUAGE = 19
CONTENT_RATING = 21
BUDGET = 22
COUNTRY = 20
ASPECT_RATIO = 26
COLOR = 0
MOVIE_IMBD_LINK = 17
IMBD_SCORE = 25
NUM_CRITIC_FOR_REVS = 2
NUM_USERS_FOR_REVIEWS = 18
NUM_VOTED_USERS = 12
FACENUMBER_IN_POSTER = 15
DIR_FB_LIKES = 4
CAST_FB_LIKES = 13
MOVIE_FB_LIKES =27

# Actor 1 attributes (column indexes of the name and the facebook likes)
ACT1_NAME = 10
ACT1_FB_LIKES = 7

# Actor 2 attributes
ACT2_NAME = 6
ACT2_FB_LIKES = 24

# Actor 3 attributes
ACT3_NAME = 14
ACT3_FB_LIKES = 5

# File names of the four csv's this notebook writes
MOVIE_MOVIE = "movie-movie-edges.csv"
ACTOR_ACTOR = "actor-actor-edges.csv"
MOVIES_ATR = "movie-attributes.csv"
ACTR_ATR = "actor-attributes.csv"

In [2]:
class Movie:
    """
    Plain container for everything known about one movie.
    :name: Title of the movie. Str
    :movieAtributes: Attribute name -> attribute value. Dict of [str] = str
    :actors: Names of the actors that appear in the movie. List of str
    :director: Name of the movie's director. Str
    """
    def __init__(self, name, director):
        self.name = name
        self.movieAtributes = dict()
        self.actors = list()
        self.director = director

    def add_actors(self, actors):
        """
        Appends every given actor name to the movie's cast, keeping their order.
        :actors: Iterable of str with the actor names to add
        """
        self.actors.extend(actors)

    def add_movieAtribute(self, key, atribute):
        """
        Stores (or overwrites) one attribute of the movie.
        :key: Name of the attribute. Str
        :atribute: Value stored under that name. Str
        """
        self.movieAtributes.update({key: atribute})

In [3]:
class Actor:
    """
    Class to store all the information each Actor has.
    :name: Name of the actor. Str
    :fbLikes: Number of fb likes, stored exactly as received.
    """
    def __init__(self,name,fbLikes):
        self.name = name
        self.fbLikes = fbLikes
        self.movies = [] # Names of the movies he/she worked in. List of str
        self.genres = {} # How many times each genre appeared in those movies. Dict of [str] = int
        self.actorGenre = None # Most frequent genre among those movies (None while no genre is known)

    def add_movie(self,movie,genres):
        """
        Adds a movie to the actor and refreshes his/her dominant genre.
        :movie: The name of the movie we are adding. Str
        :genres: The genres of the NEW film. Str with the format genre1|genre2|...|genren
        """
        self.movies.append(movie)
        self.actorGenre = self.dominatingGenre(genres)

    def dominatingGenre (self,genres):
        """
        Registers the genres of a newly added film and returns the genre that is
        most frequent among ALL the films registered so far.
        Blank entries (an empty genres string, or an empty piece between two '|')
        are ignored, so '' can never become an actor's genre.
        :genres: The genres of the NEW film. Str with the format genre1|genre2|...|genren
        :return: The most frequent genre (the one registered first wins a tie),
                 or None if no genre has been registered yet.
        """
        for genre in genres.split("|"):
            genre = genre.strip()
            if genre == "":
                # "".split("|") yields [""]; counting it would create a bogus genre
                continue
            self.genres[genre] = self.genres.get(genre, 0) + 1

        if not self.genres:
            return None

        # max() returns the first maximal key it meets, and dicts iterate in
        # insertion order, so ties go to the genre that was seen first.
        return max(self.genres, key=self.genres.get)

In [4]:
def thingsinCommon(llista1, llista2):
    """
    Counts how many distinct elements appear in both lists.
    :llista1,llista2: lists of strings
    :return: size of the intersection of both lists (duplicates count once).
    """
    shared = set(llista1).intersection(llista2)
    return len(shared)

def rawText(text):
    """
    Function that trims the input into clean raw text: surrounding whitespace is
    removed and so is one trailing non-breaking space ("\\xa0"), a character that
    shows up repeatedly at the end of fields in the DB.
    :text: the string to clean
    :return: the cleaned string (an empty or whitespace-only input gives "")
    """
    cleaned = text.strip()
    # Slicing instead of indexing: cleaned[-1:] is simply "" for an empty string,
    # so no try/except is needed to protect against an IndexError.
    # NOTE(review): on Python 3, str.strip() already treats "\xa0" as whitespace,
    # so this branch looks redundant there; it is kept to preserve the original behaviour.
    if cleaned[-1:] == "\xa0":
        cleaned = cleaned[:-1]
    return cleaned

def nextActor(actorName,fbLikes,movieName,movieGenres,dictofActors):
    """
    Registers a movie in an actor's career, creating the actor first when needed.
    A brand-new actor is built, given the movie and stored in dictofActors under
    his/her name; a known actor just gets the movie added.
    :actorName: The name of the actor. Str
    :fbLikes: The number of fb likes the actor has (only used when the actor is created). Str
    :movieName: The name of the movie. Str
    :movieGenres: The genres of the film. Str with the format genre 1|genre 2|...|genre n
    :dictofActors: A dict with the actors. [nameActor(str)] = Actor(Actor instance). Modified in place
    """
    if actorName not in dictofActors:
        # First time we meet this actor: create, attach the movie, then register
        newcomer = Actor(actorName, fbLikes)
        newcomer.add_movie(movieName, movieGenres)
        dictofActors[newcomer.name] = newcomer
        return

    # Already registered: the movie simply joins his/her list
    dictofActors[actorName].add_movie(movieName, movieGenres)

In [5]:
def read_csv(archivocsv, encoding=None):
    """
    Reads the movie metadata csv and builds the in-memory objects from which the
    4 output csv's are created.
    Only the first row found for each (cleaned) movie title is kept; rows whose
    title is blank and completely empty rows are skipped.
    Actor names are cleaned with rawText before being stored, so the same actor
    written with different surrounding whitespace is not registered twice.
    :archivocsv: The path to the metadata csv
    :encoding: Optional text encoding passed to open(); None keeps the platform default
    :return: tuple (dictofMovies, dictofActors)
             dictofMovies: [movie name(str)] = Movie instance
             dictofActors: [actor name(str)] = Actor instance
    """
    # Columns copied into Movie.movieAtributes. Their order is the insertion
    # order of that dict for every movie.
    movieAttributes = [TITLE_YEAR,
                       DURATION,
                       GROSS,
                       GENRES,
                       PLOT_KEYWORDS,
                       LENGUAGE,
                       CONTENT_RATING,
                       BUDGET,
                       COUNTRY,
                       ASPECT_RATIO,
                       COLOR,
                       MOVIE_IMBD_LINK,
                       IMBD_SCORE,
                       NUM_CRITIC_FOR_REVS,
                       NUM_USERS_FOR_REVIEWS,
                       NUM_VOTED_USERS,
                       FACENUMBER_IN_POSTER,
                       DIR_FB_LIKES,
                       CAST_FB_LIKES,
                       MOVIE_FB_LIKES
                       ]

    # (name column, fb likes column) of each of the three credited actors
    actorColumns = [(ACT1_NAME, ACT1_FB_LIKES),
                    (ACT2_NAME, ACT2_FB_LIKES),
                    (ACT3_NAME, ACT3_FB_LIKES)]

    dictofMovies = {} #Stores Movie objects. Keys are names, items are Movie objects
    dictofActors = {} #Stores Actor objects. Keys are names, items are Actor objects

    # newline="" is what the csv module asks for, so that line breaks inside
    # quoted fields are handled by the reader itself.
    with open(archivocsv, "r", newline="", encoding=encoding) as csvfile:
        read = csv.reader(csvfile, delimiter = ',')
        csvHeadings = next(read, None) #First line is the header; its texts name the attributes
        if csvHeadings is None:
            #Completely empty file: nothing to load
            return dictofMovies,dictofActors

        for line in read:
            if not line:
                continue #The reader yields [] for blank lines

            title = rawText(line[MOVIE_TITLE])
            if title == "" or title in dictofMovies:
                continue #No blank titles and no reps allowed!

            #We start creating the object Movie...
            currentMovie = Movie(title,rawText(line[DIRECTOR_NAME]))

            #...And adding its attributes
            for atribute in movieAttributes:
                currentMovie.add_movieAtribute(csvHeadings[atribute],rawText(line[atribute]))

            #For the attribute genres (only in the MOVIE CLASS), we only keep the first one
            genresKey = csvHeadings[GENRES]
            currentMovie.movieAtributes[genresKey] = currentMovie.movieAtributes[genresKey].split("|")[0]

            #Then add its actors. The movie only stores their names, not the objects
            actorsinMovie = []
            for nameColumn, likesColumn in actorColumns:
                actorName = rawText(line[nameColumn])
                if actorName == "": #Careful with blank spaces on the DB!
                    continue
                #Actors receive the full genre string of the film
                nextActor(actorName,rawText(line[likesColumn]),currentMovie.name,line[GENRES],dictofActors)
                actorsinMovie.append(actorName)

            currentMovie.add_actors(actorsinMovie) #Actors to the movie
            dictofMovies[currentMovie.name] = currentMovie #Movie to the dict of Movies

    return dictofMovies,dictofActors

# Parse the metadata csv once; pelis (movies) and actors are the inputs of the
# four export functions called further down.
pelis,actors = read_csv("movie_metadata.csv")
print("DataBase readed succesfully")


DataBase readed succesfully


In [6]:
def movieMovie(movies):
    """
    It generates the tab-separated csv file with the movie-movie edges: one row
    for every pair of movies that share the director or two or more actors.
    A blank director never counts as a shared director, but movies without a
    director are still linked to others through their actors, whatever their
    position in the dictionary.
    :movies: dictionary with the form of [name(str)] = movie (instance of movie object)
    :return: nothing
    """
    movieList = list(movies.values())
    #One set of actors per movie, built once instead of once per compared pair
    actorSets = [set(movie.actors) for movie in movieList]

    #newline="" as required by the csv module, so the writer controls the line endings
    with io.open(MOVIE_MOVIE, "w", newline="") as fileout:
        writer = csv.writer(fileout, delimiter='\t', quotechar='"')
        writer.writerow(["movie1", "movie2", "class"])

        #combinations yields (i, j) with i < j in the same order as two nested index loops
        for i, j in itertools.combinations(range(len(movieList)), 2):
            first = movieList[i]
            second = movieList[j]
            sharedActors = len(actorSets[i] & actorSets[j])
            sameDirector = first.director != "" and first.director == second.director

            if not sameDirector and sharedActors < 2:
                continue #Nothing relevant in common

            if not sameDirector:
                #They don't share the director, so they are here because of the actors
                value3 = " %d actors in common" % sharedActors
            elif sharedActors < 2:
                #Less than 2 actors in common: only the director links them
                value3 = "director in common"
            else:
                #Both things
                value3 = "director and %d actors in common" % sharedActors

            writer.writerow([first.name, second.name, value3])

# Write the movie-movie edges file from the movies loaded above.
movieMovie(pelis)
print("Movie-Movie graph created succesfully")

Movie-Movie graph created succesfully


In [7]:
def actorActor(actors):
    """
    It generates the tab-separated csv file for the actor-actor graph: one row
    for every pair of actors that appear together in at least one film. The
    "class" column is the number of films they share.
    :actors: dictionary with the form of [name(str)] = actor (instance of actor object)
    :return: nothing.
    """
    #One set of movie names per actor, built once instead of for every compared pair
    movieSets = {name: set(actor.movies) for name, actor in actors.items()}

    #newline="" as required by the csv module, so the writer controls the line endings
    with io.open(ACTOR_ACTOR, "w", newline="") as fileout:
        writer = csv.writer(fileout, delimiter='\t', quotechar='"')
        writer.writerow(["actor1", "actor2", "class"])

        #Every unordered pair of actor names, in dictionary order
        for k1, k2 in itertools.combinations(actors, 2):
            shared = len(movieSets[k1] & movieSets[k2])
            if shared > 0:
                #The actors share at least one movie
                writer.writerow([k1, k2, shared])
# Write the actor-actor edges file from the actors loaded above.
actorActor(actors)
print("Actor-Actor graph created succesfully")

Actor-Actor graph created succesfully


In [8]:
def moviesAtr(movies):
    """
    It generates a tab-separated csv file with each movie and its attributes.
    The heading row is "Movie" followed by the attribute names of the first
    movie; with no movies at all, only the heading "Movie" is written.
    :movies: dictionary with the form of [name(str)] = movie (instance of movie object)
    :return: nothing
    """
    #newline="" as required by the csv module, so the writer controls the line endings
    with io.open(MOVIES_ATR, "w", newline="") as fileout:
        writer = csv.writer(fileout, delimiter='\t', quotechar='"')

        #One movie is enough to know the headings
        headings = ["Movie"]
        firstMovie = next(iter(movies.values()), None)
        if firstMovie is not None:
            headings.extend(firstMovie.movieAtributes.keys())
        writer.writerow(headings)

        #One row per movie: its name and then its attribute values.
        #NOTE(review): values are written in each movie's own dict order, which is
        #assumed to match the headings taken from the first movie -- confirm.
        for name, movie in movies.items():
            writer.writerow([name] + list(movie.movieAtributes.values()))
                
# Write the movie attributes file from the movies loaded above.
moviesAtr(pelis)
print("Movies attributes graph created succesfully")

Movies attributes graph created succesfully


In [9]:
def actorsAtr(actors):
    """
    It generates a tab-separated csv file with each actor and its attributes:
    name, facebook likes and dominant genre.
    :actors: dictionary with the form of [name(str)] = actor (instance of actor object)
    :return: nothing
    """
    #newline="" as required by the csv module, so the writer controls the line endings
    with io.open(ACTR_ATR, "w", newline="") as fileout:
        writer = csv.writer(fileout, delimiter='\t', quotechar='"')
        writer.writerow(["Actor","Facebook likes","Actor Genre"]) # The headings of the csv
        for name, actor in actors.items():
            #One row per actor: the name, its fb likes and its genre
            writer.writerow([name, actor.fbLikes, actor.actorGenre])
                
# Write the actor attributes file from the actors loaded above.
actorsAtr(actors)
print("Actors attributed graph created succesfully")

Actors attributed graph created succesfully
