RoadMap:
Heterogeneous Graph:
  Nodes:
    Actor:
      TotalMovies
      GenreDistribution
    Movie:
      TitleID
      Name
      Runtime
      Genre
      IsAdult
      ReleaseYear
      RuntimeMinutes
      Genres
      TotalRatings
      AvgRating
      Box Office Revenue
  Edges:
    Actor -> Movie (Acted in Movie)
    Actor -> Actor in Shared movies

In [59]:
! pip install torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric
! pip install pyg-lib -f https://data.pyg.org/whl/torch-2.0.0+cpu.html
! pip install beautifulsoup4
! pip install mediawikiapi

Looking in links: https://data.pyg.org/whl/torch-2.0.0+cpu.html
[31mERROR: Could not find a version that satisfies the requirement pyg-lib (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for pyg-lib[0m[31m


In [60]:
# Perform imports
from csv import reader as CSVReader
from csv import writer as CSVWriter
from operator import itemgetter as itemgetter
from mediawikiapi import MediaWikiAPI

In [61]:
# Trim based on minimum start year, "movie" classification and count genre frequencies

inpath = "data/title.basics.tsv"
outpath = "data/title.basics_trimmed.tsv"

minStartYear = 2000
maxStartYear = 2020

genreFreq = {}
with open(inpath, "r") as infile, open(outpath, "w") as outfile:
  reader = CSVReader(infile, delimiter="\t", quotechar=None)
  writer = CSVWriter(outfile, delimiter="\t")

  i = 0
  j = 0
  for line in reader:
    l = list(line)
    startStr = str(l[5])
    type = str(l[1])
    genres = str(l[8]).split(sep=',')
    if ( (i != 0) and (startStr != "\\N") and (int(startStr) >= minStartYear) and int(startStr) <= maxStartYear and type == "movie"):
      writer.writerow(line)
      for genre in genres:
        if genre != "\\N":
          if genre in genreFreq:
            genreFreq[genre] += 1
          else:
            genreFreq[genre] = 1
      j += 1
    i += 1

print(f"Copied {j} lines out of {i} ({round((j / i) * 100, 2)}%)")

genreFreqItems = genreFreq.items()
print(f"Genres ({len(genreFreqItems)}):")
for genre, freq in sorted(genreFreqItems, key=itemgetter(1), reverse=True):
  print(f"\t{genre}: {freq}")

del genreFreq

Copied 259763 lines out of 10716429 (2.42%)
Genres (26):
	Drama: 90686
	Documentary: 82723
	Comedy: 44836
	Action: 18864
	Thriller: 18573
	Romance: 18374
	Horror: 17214
	Crime: 12699
	Biography: 10592
	Adventure: 9856
	Family: 8743
	Mystery: 8020
	History: 8018
	Music: 7663
	Fantasy: 6249
	Sci-Fi: 5089
	Animation: 4670
	Sport: 4333
	Musical: 2648
	War: 2607
	Adult: 1463
	News: 1317
	Western: 728
	Reality-TV: 373
	Talk-Show: 113
	Game-Show: 10


In [62]:
# Encode genre list for each movie
genreList = list(i[0] for i in genreFreqItems)

inpath = "data/title.basics_trimmed.tsv"
outpath = "data/title.basics_genres_encoded.tsv"

with open(inpath, "r") as infile, open(outpath, "w") as outfile:
  reader = CSVReader(infile, delimiter="\t", quotechar=None)
  writer = CSVWriter(outfile, delimiter="\t")

  for line in reader:
    l = list(line)
    
    encodedGenres = ""
    genres = str(l[8]).split(sep=',')
    for genre in genreList:
      if genre in genres:
        encodedGenres += '1'
      else:
        encodedGenres += '0' 
    newRow = l[:8]
    newRow.append(encodedGenres)
    writer.writerow(newRow)

In [63]:
# Trim out duplicate references to same movie

inpath = "data/title.basics_genres_encoded.tsv"
outpath = "data/title_unique.tsv"

observedFeatures = set()

numDuplicates = 0

with open(inpath, "r") as infile, open(outpath, "w") as outfile:
  reader = CSVReader(infile, delimiter="\t", quotechar=None)
  writer = CSVWriter(outfile, delimiter="\t")

  for line in reader:
    l = list(line)
    features = str(l[1:])
    if not features in observedFeatures:
      observedFeatures.add(features)
      writer.writerow(l)
    else:
      numDuplicates += 1    

del observedFeatures
print(f"Removed {numDuplicates} duplicates")

Removed 168 duplicates


In [64]:
# Trim out unnecessary columns (titleType (1), originalTitle (2), isAdult (4), endYear (6))

inpath = "data/title_unique.tsv"
outpath = "data/processed/movies.tsv"

observedFeatures = set()

numDuplicates = 0

with open(inpath, "r") as infile, open(outpath, "w") as outfile:
  reader = CSVReader(infile, delimiter="\t", quotechar=None)
  writer = CSVWriter(outfile, delimiter="\t")

  for line in reader:
    l = list(line)
    writer.writerow([l[0], l[3], l[5], l[7], l[8]])

In [65]:
# Drop movies that do not contain a runtime length value

inpath = "data/processed/movies.tsv"
outpath = "data/processed/moviesRuntimes.tsv"

numRemoved = 0

with open(inpath, "r") as infile, open(outpath, "w") as outfile:
  reader = CSVReader(infile, delimiter="\t", quotechar=None)
  writer = CSVWriter(outfile, delimiter="\t")

  for line in reader:
    l = list(line)
    if (l[3] != "\\N"):
      writer.writerow(l)
    else:
      numRemoved += 1

print(f"Removed {numRemoved} entries")

Removed 52939 entries


In [66]:
# Join movies with ratings data, discard those without ratings data or with less than minRatingsCount total ratings

inpathMovies = "data/processed/moviesRuntimes.tsv"
inpathRatings = "data/title.ratings.tsv"
outpath = "data/processed/moviesRatings.tsv"

minRatingsCount = 10000

linesWritten = 0

with open(inpathMovies, "r") as infileMovies, open(inpathRatings, "r") as infileRatings, open(outpath, "w") as outfile:
  readerMovies = CSVReader(infileMovies, delimiter="\t", quotechar=None)
  readerRatings = CSVReader(infileRatings, delimiter="\t", quotechar=None)
  writer = CSVWriter(outfile, delimiter="\t")

  ratingsLine = next(readerRatings)
  ratingsHeader = ratingsLine
  # print(ratingsLine)
  

  for line in readerMovies:
    l = list(line)

    while (ratingsLine[0] < l[0] and len(ratingsHeader) == len(ratingsLine)):
      # print(f"\"{ratingsLine[0]}\"", f"\"{l[0]}\"")
      ratingsLine = next(readerRatings)
    if (ratingsLine[0] == l[0] and int(ratingsLine[2]) >= minRatingsCount):
      l.append(ratingsLine[1])
      l.append(ratingsLine[2])
      writer.writerow(l)
      linesWritten += 1

print(f"Wrote {linesWritten} lines")

Wrote 6560 lines


In [71]:
# Create final movies files
inpath="data/processed/moviesRatings.tsv"

outpathData="data/final/movies.data.tsv"
outpathLabels="data/final/movies.labels.tsv"

with open(inpath, "r") as infile, open(outpathData, "w") as outfileData, open(outpathLabels, "w") as outfileLabels:
  reader = CSVReader(infile, delimiter="\t", quotechar=None)
  writerData = CSVWriter(outfileData, delimiter="\t")
  writerLabels = CSVWriter(outfileLabels, delimiter="\t")
  
  writerLabels.writerow(["TitleID", "TitleName"])
  writerData.writerow(["TitleID", "ReleaseYear", "RuntimeMinutes", "AvgRatings", "NumRatings", "Genres"])

  for line in reader:
    l = list(line)
    print(l)
    writerLabels.writerow([l[0], l[1]])
    writerData.writerow([l[0], l[2], l[3], l[5], l[6], l[4]])

['tt0035423', 'Kate & Leopold', '2001', '118', '00111000000000000000000000', '6.4', '89039']
['tt0118589', 'Glitter', '2001', '104', '00001100000000100000000000', '2.4', '24085']
['tt0118694', 'Fa yeung nin wah', '2000', '98', '00001100000000000000000000', '8.1', '166670']
['tt0120202', 'State and Main', '2000', '105', '00100100000000000000000000', '6.7', '22167']
['tt0120263', 'Sånger från andra våningen', '2000', '98', '00100100000000000000000000', '7.5', '20334']
['tt0120630', 'Chicken Run', '2000', '84', '00100000000100010000000000', '7.1', '212160']
['tt0120667', 'Fantastic Four', '2005', '106', '10010000000100000000000000', '5.7', '343663']
['tt0120679', 'Frida', '2002', '123', '00001100000001000000000000', '7.3', '95262']
['tt0120681', 'From Hell', '2001', '122', '00000010011000000000000000', '6.7', '164140']
['tt0120737', 'The Lord of the Rings: The Fellowship of the Ring', '2001', '178', '10000100000100000000000000', '8.9', '2005107']
['tt0120753', 'The Million Dollar Hotel', 

In [None]:
# Gather relevant entreis from "names" data
inpath="data/name.basics.tsv"