In [1]:
! pip install torch-scatter torch-sparse torch-cluster torch-spline-conv pyg-lib torch-geometric
! pip install beautifulsoup4
! pip install mediawikiapi

Collecting torch-scatter
  Using cached torch_scatter-2.1.2.tar.gz (108 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting torch-sparse
  Downloading torch_sparse-0.6.18.tar.gz (209 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m210.0/210.0 kB[0m [31m775.6 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting torch-cluster
  Downloading torch_cluster-1.6.3.tar.gz (54 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.5/54.5 kB[0m [31m479.9 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting torch-spline-conv
  Downloading torch_spline_conv-1.2.2.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25h[31mERROR: Could not find a version that satisfies the requirement pyg-lib (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for pyg-lib[0m[31m


In [3]:
from csv import reader as CSVReader
from csv import writer as CSVWriter
from operator import itemgetter as itemgetter

# Trim based on minimum start year, "movie" classification and count genre frequencies
#
# Reads the raw tab-separated title dump, skips the header row, and copies to
# `outpath` every row whose start year (column 5) is known and >= minStartYear
# and whose type (column 1) is "movie". While copying, tallies how often each
# genre (comma-separated in column 8) occurs among the kept rows.

inpath = "data/title.basics.tsv"
outpath = "data/title.basics_trimmed.tsv"

minStartYear = 1980

genreFreq = {}
# newline="" is what the csv module expects for both reading and writing; it
# stops the text layer from translating the line endings csv handles itself.
with open(inpath, "r", newline="") as infile, \
     open(outpath, "w", newline="") as outfile:
  reader = CSVReader(infile, delimiter="\t", quotechar=None)
  # The writer must use the same "no quoting" dialect as the reader. With the
  # default dialect every field containing a '"' would be wrapped in quotes
  # and have its quotes doubled, and the next stage (which reads with
  # quotechar=None) would then see those extra characters as part of the title.
  writer = CSVWriter(outfile, delimiter="\t", quotechar=None)

  numRead = 0    # every row seen, including the header row
  numCopied = 0  # rows written to outpath
  for line in reader:
    l = list(line)
    startStr = str(l[5])
    titleType = str(l[1])  # named titleType so the builtin `type` is not shadowed
    genres = str(l[8]).split(sep=',')
    # numRead != 0 skips the header row; it is tested first so int() is never
    # applied to the header's column name.
    if ( (numRead != 0) and (startStr != "\\N") and (int(startStr) >= minStartYear) and titleType == "movie"):
      writer.writerow(line)
      for genre in genres:
        if genre != "\\N":  # "\N" marks a missing value, not a genre
          genreFreq[genre] = genreFreq.get(genre, 0) + 1
      numCopied += 1
    numRead += 1

# Guard the percentage against an empty input file.
percentCopied = round((numCopied / numRead) * 100, 2) if numRead else 0.0
print(f"Copied {numCopied} lines out of {numRead} ({percentCopied}%)")

genreFreqItems = genreFreq.items()
print(f"Genres ({len(genreFreqItems)}):")
for genre, freq in sorted(genreFreqItems, key=itemgetter(1), reverse=True):
  print(f"\t{genre}: {freq}")

# Only the name is removed here: genreFreqItems is a live view of the same
# dict and keeps it available for the genre-encoding step.
del genreFreq

Copied 438832 lines out of 10822149 (4.05%)
Genres (26):
	Drama: 157544
	Documentary: 117247
	Comedy: 76134
	Action: 37525
	Thriller: 32705
	Romance: 31812
	Horror: 28477
	Crime: 24305
	Adventure: 17393
	Biography: 15654
	Family: 14569
	Mystery: 13438
	History: 11945
	Music: 11287
	Fantasy: 11047
	Sci-Fi: 9033
	Animation: 7906
	Adult: 6560
	Sport: 6310
	War: 4839
	Musical: 4576
	News: 1634
	Western: 1302
	Reality-TV: 493
	Talk-Show: 163
	Game-Show: 21


In [7]:
# Encode genre list for each movie
#
# Replaces the comma-separated genre column (index 8) with a string of '0'/'1'
# characters, one per known genre. Character k refers to genreList[k]; the
# order is that of genreFreqItems (the order in which genres were first
# counted), not the frequency-sorted order printed earlier.
# Requires genreFreqItems from the trimming step to still be in the kernel.
genreList = [name for name, _count in genreFreqItems]

inpath = "data/title.basics_trimmed.tsv"
outpath = "data/title.basics_genres_encoded.tsv"

# newline="" is what the csv module expects for both reading and writing.
with open(inpath, "r", newline="") as infile, \
     open(outpath, "w", newline="") as outfile:
  reader = CSVReader(infile, delimiter="\t", quotechar=None)
  # Write with the same "no quoting" dialect the reader uses; the default
  # writer dialect would wrap and double any '"' inside a title, corrupting
  # it for the next stage, which reads with quotechar=None.
  writer = CSVWriter(outfile, delimiter="\t", quotechar=None)

  for line in reader:
    l = list(line)

    # Set membership keeps the per-genre test cheap; a "\N" (missing) genre
    # column matches nothing and yields an all-zero string.
    genres = set(str(l[8]).split(sep=','))
    encodedGenres = "".join('1' if genre in genres else '0' for genre in genreList)

    # Keep the first eight columns unchanged and append the encoding.
    writer.writerow(l[:8] + [encodedGenres])

In [8]:
# Trim out duplicate references to same movie
#
# Two rows are duplicates when every column except the first (the id in
# column 0) is identical. The first occurrence is kept; later ones are
# dropped and counted.

inpath = "data/title.basics_genres_encoded.tsv"
outpath = "data/title_unique.tsv"

observedFeatures = set()

numDuplicates = 0

# newline="" is what the csv module expects for both reading and writing.
with open(inpath, "r", newline="") as infile, \
     open(outpath, "w", newline="") as outfile:
  reader = CSVReader(infile, delimiter="\t", quotechar=None)
  # Write with the same "no quoting" dialect the reader uses; the default
  # writer dialect would wrap and double any '"' inside a title, corrupting
  # it for the next stage, which reads with quotechar=None.
  writer = CSVWriter(outfile, delimiter="\t", quotechar=None)

  for line in reader:
    l = list(line)
    # A tuple of the non-id columns is a hashable key for the row's content.
    features = tuple(l[1:])
    if features not in observedFeatures:
      observedFeatures.add(features)
      writer.writerow(l)
    else:
      numDuplicates += 1

# Release the (potentially large) set of seen rows.
del observedFeatures
print(f"Removed {numDuplicates} duplicates")

Removed 25975 duplicates


In [14]:
# Trim out unnecessary columns: keeps indices 0, 3, 4, 5, 7 and 8 of every
# row and drops indices 1, 2 and 6.
# NOTE(review): the original comment named the dropped columns as
# "titleType (1), originalTitle (2), endYear (6)". In the published IMDb
# title.basics layout index 2 is primaryTitle and index 3 is originalTitle, so
# this keeps originalTitle and drops primaryTitle - confirm that is intended.
import os

inpath = "data/title_unique.tsv"
outpath = "data/processed/movies.tsv"

# This is the first step that writes into data/processed; create it so the
# step also works on a fresh checkout.
os.makedirs(os.path.dirname(outpath), exist_ok=True)

# newline="" is what the csv module expects for both reading and writing.
with open(inpath, "r", newline="") as infile, \
     open(outpath, "w", newline="") as outfile:
  reader = CSVReader(infile, delimiter="\t", quotechar=None)
  # Write with the same "no quoting" dialect the reader uses; the default
  # writer dialect would wrap and double any '"' inside a title, corrupting
  # it for the next stage, which reads with quotechar=None.
  writer = CSVWriter(outfile, delimiter="\t", quotechar=None)

  for line in reader:
    l = list(line)
    writer.writerow([l[0], l[3], l[4], l[5], l[7], l[8]])

In [15]:
# Drop movies that do not contain a runtime length value
#
# In the column-trimmed file the runtime is column 4; rows where it holds the
# missing-value marker "\N" are skipped and counted, all others are copied.

inpath = "data/processed/movies.tsv"
outpath = "data/processed/moviesRuntimes.tsv"

numRemoved = 0

# newline="" is what the csv module expects for both reading and writing.
with open(inpath, "r", newline="") as infile, \
     open(outpath, "w", newline="") as outfile:
  reader = CSVReader(infile, delimiter="\t", quotechar=None)
  # Write with the same "no quoting" dialect the reader uses; the default
  # writer dialect would wrap and double any '"' inside a title, corrupting
  # it for any later step that reads with quotechar=None.
  writer = CSVWriter(outfile, delimiter="\t", quotechar=None)

  for line in reader:
    l = list(line)
    if l[4] == "\\N":
      numRemoved += 1
    else:
      writer.writerow(l)

print(f"Removed {numRemoved} entries")

Removed 97840 entries
