# Reading Movies

In [36]:
import ast
import csv
# Load every row of results.csv into `movies`, one dict per movie.
movies = []
# newline='' is what the csv module asks for, so that line breaks inside
# quoted fields are handed to the parser untouched.
with open('results.csv', newline='') as f:
    for item in csv.DictReader(f):
        # The "query_results" column holds a Python literal; literal_eval
        # parses it without executing arbitrary code.
        item["query_results"] = ast.literal_eval(item["query_results"])
        # Shortlist: the first ten entries of the parsed results.
        item["top_10_results"] = item["query_results"][:10]
        movies.append(item)

# Build `exclude_channels` from the first column of two CSV files: the
# hand-maintained list followed by the programmatically generated one.
# NOTE(review): lookups against this list use channel.lower(), so the entries
# are presumably stored lowercase -- confirm against the CSV files.
exclude_channels = []
for excluded_path in ("Hand_Excluded_Channels.csv",
                      "Programmatically_Excluded_Channels.csv"):
    # newline='' is the csv module's documented way to open its input files.
    with open(excluded_path, newline='') as f:
        for row in csv.reader(f):
            # csv.reader yields [] for a blank line; skip it instead of
            # failing on row[0].
            if row:
                exclude_channels.append(row[0])

# Build `known_channels` from the first column of Hand_Known_Channels.csv.
# NOTE(review): lookups against this list use channel.lower(), so the entries
# are presumably stored lowercase -- confirm against the CSV file.
known_channels = []
# newline='' is the csv module's documented way to open its input files.
with open("Hand_Known_Channels.csv", newline='') as f:
    for row in csv.reader(f):
        # csv.reader yields [] for a blank line; skip it instead of failing
        # on row[0].
        if row:
            known_channels.append(row[0])

In [37]:
len(movies)

1976

In [38]:
import re
def translate(to_translate):
    """Replace the lowercase accented vowels á, é, í, ó, ú with a, e, i, o, u.

    Every other character of *to_translate* is returned unchanged.
    """
    accent_map = str.maketrans(u'áéíóú', u'aeiou')
    return to_translate.translate(accent_map)

def remove_non_alphanumeric(result):
    return re.sub(r'[^a-zA-Z0-9]', '', result)

def normalize(token):
    """Lowercase *token*, unaccent its vowels and drop non-alphanumeric characters.

    The three steps run in that order: lower(), translate(),
    remove_non_alphanumeric().
    """
    return remove_non_alphanumeric(translate(token.lower()))

In [39]:
from datetime import date

def diff_dates(date1, date2):
    """Return the absolute distance between the two dates, in whole days."""
    gap = date2 - date1
    return abs(gap).days


def anterior(date1, date2):
    """Return True when *date1* falls on or before *date2*."""
    elapsed = date2 - date1
    return elapsed.days >= 0

def remove_results_by_date(release_start, records):
    """Keep the records uploaded on or before the movie's release date.

    Parameters:
        release_start: release date as "month/day/year" text, or None / "".
        records: iterable of ten-field search results; the fourth field is
            the upload date as "month/day/year" text, or None / "".

    Returns:
        A list with the records, in their original order, whose upload date
        is not later than *release_start*. A record is also kept when either
        date is missing, because no comparison can be made.

    Raises:
        ValueError: if a record does not have exactly ten fields, or a date
            is not three "/"-separated integers forming a valid date.
    """
    # Imported locally on purpose: other cells of this notebook unpack a
    # result field into a module-level variable named `date`, which would
    # replace the class and break this function when it is run again.
    from datetime import date

    def parse_month_day_year(text):
        # Dates are written month first, e.g. "2/11/2016".
        month, day, year = text.split("/")
        return date(int(year), int(month), int(day))

    anterior_query_results = []
    for query_result in records:
        # Unpacking all ten fields keeps the shape check on every record.
        (_title, _channel, _video_id, result_date_str, _premiered, _stream,
         _subscribers, _views, _likes, _verified) = query_result

        if not release_start or not result_date_str:
            # A missing date (None or "") cannot be compared: keep the record.
            is_anterior = True
        else:
            movie_date = parse_month_day_year(release_start)
            result_date = parse_month_day_year(result_date_str)
            is_anterior = anterior(result_date, movie_date)

        if is_anterior:
            anterior_query_results.append(query_result)
    return anterior_query_results

In [40]:
def remove_results_by_channel_known_or_verified(records):
    """Keep results from a known channel or carrying the "verified" marker.

    A result is kept when its lowercased channel name is in `known_channels`,
    or when its tenth field equals "verified". Order is preserved.
    """
    def is_trusted(entry):
        (_title, channel, _video_id, _date_str, _premiered, _stream,
         _subscribers, _views, _likes, verified) = entry
        return channel.lower() in known_channels or verified == "verified"

    return [entry for entry in records if is_trusted(entry)]

In [41]:
def exclude_by_channel(records):
    """Drop every result whose lowercased channel name is in `exclude_channels`.

    The surviving results are returned in their original order.
    """
    kept = []
    for entry in records:
        (_title, channel, _video_id, _date_str, _premiered, _stream,
         _subscribers, _views, _likes, _verified) = entry
        if channel.lower() in exclude_channels:
            continue
        kept.append(entry)
    return kept
  

In [42]:
import requests

# Raw gist that, going by its name, holds NLTK's English stop-word list
# (presumably one word per line -- the text is split on "\n" below).
url = "https://gist.githubusercontent.com/sebleier/554280/raw/7e0e4a1ce04c2bb7bd41089c9821dbcf6d0c786c/NLTK's%2520list%2520of%2520english%2520stopwords"
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)
# Stop on an HTTP error instead of silently using an error page as stop words.
response.raise_for_status()
empty_words = response.text.split("\n")

import re
def translate(to_translate):
    """Return *to_translate* with á, é, í, ó, ú turned into a, e, i, o, u.

    Only those five lowercase accented vowels are affected.
    """
    replacements = {
        ord(accented): plain
        for accented, plain in zip(u'áéíóú', u'aeiou')
    }
    return to_translate.translate(replacements)

def remove_non_alphanumeric(result):
    """Delete each character of *result* that is not an ASCII letter or digit."""
    pattern = r'[^a-zA-Z0-9]'
    replacement = ''
    return re.sub(pattern, replacement, result)

def normalize_token(token):
    """Lowercase a single word, unaccent its vowels and strip non-alphanumerics."""
    cleaned = token.lower()
    # Apply the remaining clean-up steps in order.
    for step in (translate, remove_non_alphanumeric):
        cleaned = step(cleaned)
    return cleaned

def normalize(sentence):
    """Split *sentence* on whitespace and return its normalized tokens.

    Each word goes through normalize_token(); tokens that end up with fewer
    than two characters are discarded.
    """
    cleaned_words = (normalize_token(raw_word) for raw_word in sentence.split())
    return [cleaned for cleaned in cleaned_words if len(cleaned) > 1]

def all_empty_tokens(tokens):
    """Return True when every token is listed in `empty_words`.

    An empty *tokens* sequence yields True.
    """
    return all(token in empty_words for token in tokens)

def remove_empty_words(tokens):
    """Return the tokens that are not listed in `empty_words`, order preserved."""
    return [token for token in tokens if token not in empty_words]
    

def contains_at_least_one_word(movie_tokens, result_title_tokens):
    """Return True when at least one movie token also occurs in the result title tokens."""
    return any(candidate in result_title_tokens for candidate in movie_tokens)

def remove_results_by_title(movie_name, records):
    """Keep the results whose title shares at least one token with the movie name.

    Both the movie name and each result title are tokenized with normalize().
    Stop words (`empty_words`) are removed from both sides, unless every token
    of the movie name is itself a stop word, in which case nothing is removed.
    """
    movie_tokens = normalize(movie_name)

    # Only strip stop words when that leaves something of the movie name.
    strip_stop_words = not all_empty_tokens(movie_tokens)
    if strip_stop_words:
        movie_tokens = remove_empty_words(movie_tokens)

    matching = []
    for entry in records:
        (title, _channel, _video_id, _date_str, _premiered, _stream,
         _subscribers, _views, _likes, _verified) = entry
        title_tokens = normalize(title)
        if strip_stop_words:
            title_tokens = remove_empty_words(title_tokens)

        if contains_at_least_one_word(movie_tokens, title_tokens):
            matching.append(entry)
    return matching

In [43]:
def add_3_relevant(records, actual_candidates):
    """Return the leading part of *records* up to its third non-candidate.

    Records are copied in order; copying stops right after the third record
    that is not in *actual_candidates* has been copied. Records that are in
    *actual_candidates* are copied too, but only if they are reached before
    that point. Candidates positioned later in *records*, or absent from it,
    are not part of the returned list.
    """
    selected = []
    still_to_add = 3
    for entry in records:
        selected.append(entry)
        if entry in actual_candidates:
            continue
        still_to_add -= 1
        if still_to_add == 0:
            break
    return selected

In [44]:
# Run the filter chain for every movie and store the outcome under
# movie["candidates"]. Movies with no result on or before their release date
# are skipped and get no "candidates" key.
# The "Empty Count / Non Empty Count" remarks are the figures the author
# noted next to each stage.
empty_count = 0
non_empty_count = 0
for movie in movies:
    release_start = movie["release_start"]
    top_10 = movie["top_10_results"]

    # Date filter on the shortlist first; fall back to the full result list.
    # Empty Count: 922, Non Empty Count: 1054
    date_filtered = remove_results_by_date(release_start, top_10)
    if not date_filtered:
        # Empty Count: 886, Non Empty Count: 1090
        date_filtered = remove_results_by_date(release_start, movie["query_results"])
    if not date_filtered:
        continue

    # Empty Count: 9, Non Empty Count: 1081
    survivors = exclude_by_channel(date_filtered)
    # Empty Count: 209, Non Empty Count: 881
    survivors = remove_results_by_channel_known_or_verified(survivors)
    # Empty Count: 216, Non Empty Count: 874
    survivors = remove_results_by_title(movie["movie_name"], survivors)

    if survivors:
        non_empty_count += 1
    else:
        empty_count += 1

    # With fewer than three survivors, the list is rebuilt from the shortlist.
    candidates = add_3_relevant(top_10, survivors) if len(survivors) < 3 else survivors
    movie["candidates"] = candidates

print("Empty Count: {}, Non Empty Count: {}".format(empty_count, non_empty_count))

Empty Count: 216, Non Empty Count: 874


In [45]:
# Sanity check: a movie flagged no_prerelease == '0' must not have an empty
# candidate list.
for movie in movies:
    if movie["no_prerelease"] == '0' and movie["candidates"] == []:
        print("Error")
            
# Sanity check: a movie flagged no_prerelease == '1' must not carry a
# "candidates" entry at all.
for movie in movies:
    if movie["no_prerelease"] == '1' and "candidates" in movie:
        print("Error")
            
# Report candidates whose `stream` or `premiered` field is truthy, for the
# movies flagged no_prerelease == '0'.
for movie in movies:
    if movie["no_prerelease"] != '0':
        continue
    for trailer in movie["candidates"]:
        # The fourth field is bound to `upload_date`, not `date`: this loop
        # runs at module level, and a variable called `date` would replace
        # the `date` class imported from datetime.
        (title, channel, video_id, upload_date, premiered, stream,
         subscribers, views, likes, verified) = trailer
        if stream:
            print("Stream in candidates. Movie: {}".format(movie["movie_name"]))
        if premiered:
            print("Premiered in candidates. Movie: {}".format(movie["movie_name"]))

Stream in candidates. Movie: Avengers: Infinity War
Premiered in candidates. Movie: It Chapter Two
Premiered in candidates. Movie: The Amazing Spider-Man 2
Premiered in candidates. Movie: Sonic the Hedgehog
Stream in candidates. Movie: Fantastic Beasts: The Secrets of Dumbledore
Premiered in candidates. Movie: The Lego Movie 2: The Second Part
Premiered in candidates. Movie: Hustlers
Premiered in candidates. Movie: Space Jam: A New Legacy
Premiered in candidates. Movie: Space Jam: A New Legacy
Premiered in candidates. Movie: A Christmas Carol
Premiered in candidates. Movie: Terminator: Dark Fate
Premiered in candidates. Movie: About Last Night
Premiered in candidates. Movie: Pet Sematary
Stream in candidates. Movie: Fahrenheit 9/11
Premiered in candidates. Movie: Goosebumps
Premiered in candidates. Movie: Jackass Forever
Premiered in candidates. Movie: It's Complicated
Premiered in candidates. Movie: Cruella
Premiered in candidates. Movie: Law Abiding Citizen
Premiered in candidates. M

In [46]:
# Creating manual file

In [47]:
# Spot check: print each candidate stored for the movie named "Criminal".
for movie in movies:
    if movie["movie_name"] != "Criminal":
        continue
    for candidate in movie["candidates"]:
        print(candidate)

['Criminal (2016 Movie) Official Trailer – “Remember”', 'Lionsgate Movies', '3bvnoqsvY-M', '2/11/2016', False, False, '1.86M', '4,513,150', '8,228', 'verified']
['CRIMINAL Official Trailer (2016)', 'KinoCheck.com', 'aHghr4qTpJI', '2/15/2016', False, False, '4.22M', '213,509', '1,359', 'verified']
['Criminal ( Official Trailer ) Neeru Bajwa | Dheeraj Kumar | Prince Kanwaljit | Raghveer Boli', 'Humble Motion Pictures', 'McqMEmRdy7E', '9/7/2022', False, False, '127K', '8,881,240', '226,321', 'verified']
['Michael Jackson - Smooth Criminal (Official Video)', 'Michael Jackson', 'h_D3VFfhvs4', '11/19/2010', False, False, '26.9M', '787,139,786', '8,228,361', 'not verified']


In [48]:
# Write file.csv: one row per candidate trailer of every movie flagged
# no_prerelease == '0', an empty separator row after each movie, and a
# `colour` value that alternates between 0 and 1 from one movie to the next.
#
# csv.writer takes care of quoting, so a title containing a double quote no
# longer corrupts the row. newline="" lets the writer emit "\r\n" itself
# instead of having text mode translate it a second time on Windows.
with open("file.csv", "w", newline="") as f:
    writer = csv.writer(f, lineterminator="\r\n")
    writer.writerow(["movie_name", "video_url", "video_title", "channel",
                     "official_trailer", "colour"])
    colour = 0
    for movie in movies:
        if movie["no_prerelease"] != '0':
            continue
        # Commas are still stripped so the text of each field stays the same
        # as in files produced by the earlier version of this cell.
        movie_name = movie["movie_name"].replace(",", "")
        for trailer in movie["candidates"]:
            # `upload_date` rather than `date`: a module-level `date` variable
            # would replace the class imported from datetime.
            (title, channel, video_id, upload_date, premiered, stream,
             subscribers, views, likes, verified) = trailer
            youtube_url = "https://www.youtube.com/watch?v={}".format(video_id)
            title = title.replace(",", "")
            channel = channel.replace(",", "")
            # official_trailer is always written as 0.
            writer.writerow([movie_name, youtube_url, title, channel, 0, colour])
        # Separator row: six empty fields, one per column.
        writer.writerow([""] * 6)
        colour = 1 - colour

In [49]:
# Debug dump: every entry of movie["query_results"] for every movie, with an
# empty separator row after each movie.
#
# csv.writer takes care of quoting, so a title containing a double quote no
# longer corrupts the row. newline="" lets the writer emit "\r\n" itself
# instead of having text mode translate it a second time on Windows.
with open("./Debug_Files/query_results.csv", "w", newline="") as f:
    writer = csv.writer(f, lineterminator="\r\n")
    writer.writerow(["movie_name", "video_url", "video_title", "channel"])
    for movie in movies:
        # Commas are still stripped so the text of each field stays the same
        # as in files produced by the earlier version of this cell.
        movie_name = movie["movie_name"].replace(",", "")
        for result in movie["query_results"]:
            # `upload_date` rather than `date`: a module-level `date` variable
            # would replace the class imported from datetime.
            (title, channel, video_id, upload_date, premiered, stream,
             subscribers, views, likes, verified) = result
            youtube_url = "https://www.youtube.com/watch?v={}".format(video_id)
            title = title.replace(",", "")
            channel = channel.replace(",", "")
            writer.writerow([movie_name, youtube_url, title, channel])
        # Separator row: four empty fields to match the four-column header
        # (the previous version wrote five).
        writer.writerow([""] * 4)

In [50]:
# Debug dump: every entry of movie["top_10_results"] for every movie, with an
# empty separator row after each movie.
#
# csv.writer takes care of quoting, so a title containing a double quote no
# longer corrupts the row. newline="" lets the writer emit "\r\n" itself
# instead of having text mode translate it a second time on Windows.
with open("./Debug_Files/top_10_results.csv", "w", newline="") as f:
    writer = csv.writer(f, lineterminator="\r\n")
    writer.writerow(["movie_name", "video_url", "video_title", "channel"])
    for movie in movies:
        # Commas are still stripped so the text of each field stays the same
        # as in files produced by the earlier version of this cell.
        movie_name = movie["movie_name"].replace(",", "")
        for result in movie["top_10_results"]:
            # `upload_date` rather than `date`: a module-level `date` variable
            # would replace the class imported from datetime.
            (title, channel, video_id, upload_date, premiered, stream,
             subscribers, views, likes, verified) = result
            youtube_url = "https://www.youtube.com/watch?v={}".format(video_id)
            title = title.replace(",", "")
            channel = channel.replace(",", "")
            writer.writerow([movie_name, youtube_url, title, channel])
        # Separator row: four empty fields to match the four-column header
        # (the previous version wrote five).
        writer.writerow([""] * 4)

In [51]:
# Disabled variant of the file.csv export, kept as a bare string literal so
# that none of it executes. Compared with the active export it additionally
# writes a placeholder row (None fields, colour 2) for a movie whose
# candidate list is empty, and it names the tenth result field `score_result`.
"""
with open("file.csv", "w") as f:
    f.write("movie_name,video_url,video_title,channel,official_trailer,colour")
    f.write("\r\n")
    colour = 0
    for movie in movies:
        if movie["no_prerelease"] == '0':
            if movie["candidates"] == []:
                movie_name = movie["movie_name"].replace(",","")
                f.write("{},{},{},{},{},{}".format(movie_name, None, None, None, 0, 2))
                f.write("\r\n")
            for trailer in movie["candidates"]:
                title, channel, video_id, date, premiered, stream, subscribers, views, likes, score_result = trailer
                youtube_url = "https://www.youtube.com/watch?v={}".format(video_id)
                movie_name = movie["movie_name"].replace(",","")
                title = title.replace(",","")
                channel = channel.replace(",","")
                f.write("{},{},{},{},{},{}".format(movie_name, youtube_url, title, channel, 0, colour))
                f.write("\r\n")
            f.write(",,,,,")
            f.write("\r\n")
            if colour == 0:
                colour = 1
            else:
                colour = 0
"""

'\nwith open("file.csv", "w") as f:\n    f.write("movie_name,video_url,video_title,channel,official_trailer,colour")\n    f.write("\r\n")\n    colour = 0\n    for movie in movies:\n        if movie["no_prerelease"] == \'0\':\n            if movie["candidates"] == []:\n                movie_name = movie["movie_name"].replace(",","")\n                f.write("{},{},{},{},{},{}".format(movie_name, None, None, None, 0, 2))\n                f.write("\r\n")\n            for trailer in movie["candidates"]:\n                title, channel, video_id, date, premiered, stream, subscribers, views, likes, score_result = trailer\n                youtube_url = "https://www.youtube.com/watch?v={}".format(video_id)\n                movie_name = movie["movie_name"].replace(",","")\n                title = title.replace(",","")\n                channel = channel.replace(",","")\n                f.write("{},{},{},{},{},{}".format(movie_name, youtube_url, title, channel, 0, colour))\n                f.writ