RoadMap:
Heterogeneous Graph:
  Nodes:
    Actor:
      TotalMovies
      GenreDistribution
    Movie:
      TitleID
      Name
      Runtime
      Genre
      IsAdult
      ReleaseYear
      RuntimeMinutes
      Genres
      TotalRatings
      AvgRating
      Box Office Revenue
  Edges:
    Actor -> Movie (Acted in Movie)
    Actor -> Actor (Appeared together in a shared movie)

In [2]:
! pip install torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric
! pip install pyg-lib -f https://data.pyg.org/whl/torch-2.0.0+cpu.html
! pip install beautifulsoup4
! pip install mediawikiapi

Collecting torch-scatter
  Using cached torch_scatter-2.1.2.tar.gz (108 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting torch-sparse
  Using cached torch_sparse-0.6.18.tar.gz (209 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting torch-cluster
  Using cached torch_cluster-1.6.3.tar.gz (54 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting torch-spline-conv
  Using cached torch_spline_conv-1.2.2.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting torch-geometric
  Downloading torch_geometric-2.5.2-py3-none-any.whl.metadata (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.2/64.2 kB[0m [31m514.2 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting scipy (from torch-sparse)
  Downloading scipy-1.13.0-cp312-cp312-macosx_12_0_arm64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m552.3 kB/s[0m eta [36m0:00:00[0ma 

In [17]:
# Perform imports
from csv import reader as CSVReader
from csv import writer as CSVWriter
from operator import itemgetter as itemgetter
from mediawikiapi import MediaWikiAPI

In [3]:
# Trim based on minimum start year, "movie" classification and count genre frequencies
#
# Reads the raw title table, copies every row that is a "movie" with a known start
# year >= minStartYear, and counts how often each genre appears among the copied rows.
# Columns used: 1 = title type, 5 = start year, 8 = comma-separated genres.
# The literal text "\N" marks a missing value. The header row is not copied.

inpath = "data/title.basics.tsv"
outpath = "data/title.basics_trimmed.tsv"

minStartYear = 1980

genreFreq = {}
# newline="" is what the csv module expects of its file objects; the explicit
# encoding keeps the result independent of the platform's default encoding.
with open(inpath, "r", newline="", encoding="utf-8") as infile, \
     open(outpath, "w", newline="", encoding="utf-8") as outfile:
  reader = CSVReader(infile, delimiter="\t", quotechar=None)
  # quotechar=None disables quoting, mirroring the readers. With the default
  # quoting, any field containing '"' would be wrapped and doubled on write and
  # then read back verbatim by the next stage, corrupting the title a bit more
  # at every step of the pipeline.
  writer = CSVWriter(outfile, delimiter="\t", quotechar=None)

  i = 0  # rows read so far (the header row counts)
  j = 0  # rows copied to the output
  for line in reader:
    l = list(line)
    startStr = str(l[5])
    titleType = str(l[1])  # not named "type": that would shadow the builtin kernel-wide
    genres = str(l[8]).split(sep=',')
    # i != 0 skips the header row before int() can see its text.
    if i != 0 and startStr != "\\N" and int(startStr) >= minStartYear and titleType == "movie":
      writer.writerow(line)
      for genre in genres:
        if genre != "\\N":
          genreFreq[genre] = genreFreq.get(genre, 0) + 1
      j += 1
    i += 1

# Guard the percentage so an empty input file reports 0 instead of dividing by zero.
copiedPct = round((j / i) * 100, 2) if i else 0
print(f"Copied {j} lines out of {i} ({copiedPct}%)")

# genreFreqItems is a live view of the dict; it stays usable after the `del`
# below and is read again by the genre-encoding cell.
genreFreqItems = genreFreq.items()
print(f"Genres ({len(genreFreqItems)}):")
for genre, freq in sorted(genreFreqItems, key=itemgetter(1), reverse=True):
  print(f"\t{genre}: {freq}")

del genreFreq

Copied 438832 lines out of 10822149 (4.05%)
Genres (26):
	Drama: 157544
	Documentary: 117247
	Comedy: 76134
	Action: 37525
	Thriller: 32705
	Romance: 31812
	Horror: 28477
	Crime: 24305
	Adventure: 17393
	Biography: 15654
	Family: 14569
	Mystery: 13438
	History: 11945
	Music: 11287
	Fantasy: 11047
	Sci-Fi: 9033
	Animation: 7906
	Adult: 6560
	Sport: 6310
	War: 4839
	Musical: 4576
	News: 1634
	Western: 1302
	Reality-TV: 493
	Talk-Show: 163
	Game-Show: 21


In [7]:
# Encode genre list for each movie
#
# Column 8 (comma-separated genres) is replaced by a string of '0'/'1' flags,
# one flag per entry of genreList, in genreList order.
# NOTE: genreList comes from `genreFreqItems`, which only exists after the
# trimming cell has run in this kernel; the flag order is that dict's iteration order.
genreList = [genre for genre, _ in genreFreqItems]

inpath = "data/title.basics_trimmed.tsv"
outpath = "data/title.basics_genres_encoded.tsv"

# newline="" is what the csv module expects of its file objects; the explicit
# encoding keeps the result independent of the platform's default encoding.
with open(inpath, "r", newline="", encoding="utf-8") as infile, \
     open(outpath, "w", newline="", encoding="utf-8") as outfile:
  reader = CSVReader(infile, delimiter="\t", quotechar=None)
  # quotechar=None disables quoting so the output round-trips through the
  # quote-less readers; default quoting would wrap and double any '"' in a title.
  writer = CSVWriter(outfile, delimiter="\t", quotechar=None)

  for line in reader:
    l = list(line)

    # Set membership keeps the per-row work proportional to len(genreList).
    genres = set(str(l[8]).split(sep=','))
    encodedGenres = "".join('1' if genre in genres else '0' for genre in genreList)

    # Keep columns 0-7 untouched and append the flag string as the new column 8.
    newRow = l[:8]
    newRow.append(encodedGenres)
    writer.writerow(newRow)

In [8]:
# Trim out duplicate references to same movie
#
# Two rows count as duplicates when every column except the first (the title ID)
# is identical; only the first occurrence is written.

inpath = "data/title.basics_genres_encoded.tsv"
outpath = "data/title_unique.tsv"

observedFeatures = set()

numDuplicates = 0

# newline="" is what the csv module expects of its file objects; the explicit
# encoding keeps the result independent of the platform's default encoding.
with open(inpath, "r", newline="", encoding="utf-8") as infile, \
     open(outpath, "w", newline="", encoding="utf-8") as outfile:
  reader = CSVReader(infile, delimiter="\t", quotechar=None)
  # quotechar=None disables quoting so the output round-trips through the
  # quote-less readers; default quoting would wrap and double any '"' in a title.
  writer = CSVWriter(outfile, delimiter="\t", quotechar=None)

  for line in reader:
    l = list(line)
    # A tuple of the non-ID columns is hashable and compares field by field.
    features = tuple(l[1:])
    if features not in observedFeatures:
      observedFeatures.add(features)
      writer.writerow(l)
    else:
      numDuplicates += 1

# The set can be large; release it once the pass is finished.
del observedFeatures
print(f"Removed {numDuplicates} duplicates")

Removed 25975 duplicates


In [14]:
# Trim out unnecessary columns: keeps columns 0, 3, 4, 5, 7 and 8 of each row,
# dropping columns 1 (titleType), 2 and 6 (endYear).
# NOTE(review): the earlier comment called column 2 "originalTitle". In IMDb's
# title.basics layout column 2 is primaryTitle and column 3 is originalTitle, so
# this keeps originalTitle and drops primaryTitle - confirm that is intended.

inpath = "data/title_unique.tsv"
outpath = "data/processed/movies.tsv"

# newline="" is what the csv module expects of its file objects; the explicit
# encoding keeps the result independent of the platform's default encoding.
with open(inpath, "r", newline="", encoding="utf-8") as infile, \
     open(outpath, "w", newline="", encoding="utf-8") as outfile:
  reader = CSVReader(infile, delimiter="\t", quotechar=None)
  # quotechar=None disables quoting so the output round-trips through the
  # quote-less readers; default quoting would wrap and double any '"' in a title.
  writer = CSVWriter(outfile, delimiter="\t", quotechar=None)

  for line in reader:
    l = list(line)
    writer.writerow([l[0], l[3], l[4], l[5], l[7], l[8]])

In [15]:
# Drop movies that do not contain a runtime length value
#
# Column 4 of movies.tsv is checked; rows where it holds the missing-value
# marker "\N" are counted and left out of the output.

inpath = "data/processed/movies.tsv"
outpath = "data/processed/moviesRuntimes.tsv"

numRemoved = 0

# newline="" is what the csv module expects of its file objects; the explicit
# encoding keeps the result independent of the platform's default encoding.
with open(inpath, "r", newline="", encoding="utf-8") as infile, \
     open(outpath, "w", newline="", encoding="utf-8") as outfile:
  reader = CSVReader(infile, delimiter="\t", quotechar=None)
  # quotechar=None disables quoting so the output round-trips through the
  # quote-less readers; default quoting would wrap and double any '"' in a title.
  writer = CSVWriter(outfile, delimiter="\t", quotechar=None)

  for line in reader:
    l = list(line)
    if l[4] == "\\N":
      numRemoved += 1
    else:
      writer.writerow(l)

print(f"Removed {numRemoved} entries")

Removed 97840 entries


In [16]:
# Join movies with ratings data, discard those without ratings data
#
# Each movie row gets columns 1 and 2 of the ratings row that has the same first
# column (the title ID). Movies with no ratings row are dropped.
#
# The ratings are loaded into a dict keyed by ID rather than walked in lock-step
# with the movies. The lock-step version needed both files sorted in the same
# string order, raised StopIteration if the ratings ran out first, and silently
# stopped matching after one ratings row of unexpected width.
# The cost is holding the whole ratings file in memory.

inpathMovies = "data/processed/moviesRuntimes.tsv"
inpathRatings = "data/title.ratings.tsv"
outpath = "data/processed/moviesRatings.tsv"

linesWritten = 0

# newline="" is what the csv module expects of its file objects; the explicit
# encoding keeps the result independent of the platform's default encoding.
with open(inpathMovies, "r", newline="", encoding="utf-8") as infileMovies, \
     open(inpathRatings, "r", newline="", encoding="utf-8") as infileRatings, \
     open(outpath, "w", newline="", encoding="utf-8") as outfile:
  readerMovies = CSVReader(infileMovies, delimiter="\t", quotechar=None)
  readerRatings = CSVReader(infileRatings, delimiter="\t", quotechar=None)
  # quotechar=None disables quoting so the output round-trips through the
  # quote-less readers; default quoting would wrap and double any '"' in a title.
  writer = CSVWriter(outfile, delimiter="\t", quotechar=None)

  # ID -> (column 1, column 2) of the ratings file. A header row, if present,
  # just becomes one more entry; it only matters if a movie row carries the
  # same text in its first column.
  ratingsById = {}
  for ratingsLine in readerRatings:
    if len(ratingsLine) < 3:
      continue  # blank or truncated row: nothing usable to join on
    # setdefault keeps the first row seen for an ID, should one ever repeat.
    ratingsById.setdefault(ratingsLine[0], (ratingsLine[1], ratingsLine[2]))

  for line in readerMovies:
    l = list(line)
    if not l:
      continue  # blank line
    rating = ratingsById.get(l[0])
    if rating is not None:
      l.extend(rating)
      writer.writerow(l)
      linesWritten += 1

del ratingsById
print(f"Wrote {linesWritten} lines")

Wrote 205605 lines


In [21]:
# Exploratory probe of the MediaWiki client: run a title search for one movie name
# and print whatever the client returns (the recorded output is a list of page titles).
mdwk = MediaWikiAPI()

# Hard-coded sample query.
res = mdwk.search("Carmencita")
print(res)

# Unfinished follow-up, left disabled: take the first hit as the page title,
# then fetch its summary / page object.
# pageTitle = str(res[0])
# print(pageTitle)

#print(mdwk.summary("Space_Jam"))

# titlestub = "https://en.wikipedia.org/api/rest_v1/page/title/"

# page = mdwk.page(title=pageTitle)

['Carmencita', 'Carmencita Padilla', 'Angaria carmencita', 'Carmencita Reyes', 'Carmencita Lara', 'Carmencita Hederman', 'Carmencita (film)', 'Carmencita Calderón', 'Carmen Martínez-Bordiú', 'Carmencita (Corinth)']
