# Q3

*   We will develop an NER (named-entity recognition) system specific to a single category of names: the titles of the top 1000 movies on IMDB.

*   We will evaluate the system on a collection of text likely to contain instances of these named entities.

In [1]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [2]:
import re
import csv
import math
import nltk
nltk.download('brown')
nltk.download('movie_reviews')
from nltk.corpus import brown, movie_reviews
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.


In [50]:
def get_top_1000_list(csv_path='IMDB-top-1000.csv'):
    """
    Read movie titles from an IMDB top-1000 CSV file and split each into tokens.

    Args:
        csv_path (str): Path of the CSV file. The first row is treated as a
            header; the title is read from the second column of every other
            row. Defaults to 'IMDB-top-1000.csv' in the working directory.

    Returns:
        list: One list of whitespace-separated tokens per distinct title,
        ordered from the title with the most tokens to the one with the
        fewest. Titles with the same token count keep their file order.

    Comments:
        - Titles are split on whitespace only, so punctuation stays attached
          to the neighbouring word (e.g. 'Rings:').
        - NOTE(review): the article text is tokenized with word_tokenize,
          which presumably separates punctuation differently, so titles that
          contain punctuation may never match -- confirm and consider
          tokenizing titles the same way.
        - Rows without a second column and blank titles are skipped, because
          an empty token list cannot be matched against text.
        - A title that appears more than once in the file is returned once.
    """
    collected_titles = []
    seen_titles = set()  # tuples of tokens, since lists are not hashable

    # newline='' lets the csv module interpret line endings itself, which is
    # required for quoted fields that contain embedded newlines.
    with open(csv_path, 'r', encoding='utf-8', newline='') as file:
        reader = csv.reader(file)
        next(reader, None)  # Skip the header; tolerate a completely empty file
        for row in reader:
            if len(row) < 2:
                continue  # blank line or row without a title column
            title_tokens = row[1].split()
            if not title_tokens:
                continue  # empty title
            title_key = tuple(title_tokens)
            if title_key in seen_titles:
                continue
            seen_titles.add(title_key)
            collected_titles.append(title_tokens)

    # Longest titles first, so a matcher that takes the first hit prefers the
    # longest title. The sort is stable, so ties keep their file order.
    collected_titles.sort(key=len, reverse=True)

    return collected_titles

In [51]:
# Preview the first 40 entries of the title list (ordered longest title first)
get_top_1000_list()[:40]

[['Dr.',
  'Strangelove',
  'or:',
  'How',
  'I',
  'Learned',
  'to',
  'Stop',
  'Worrying',
  'and',
  'Love',
  'the',
  'Bomb'],
 ['The', 'Lord', 'of', 'the', 'Rings:', 'The', 'Return', 'of', 'the', 'King'],
 ['The',
  'Lord',
  'of',
  'the',
  'Rings:',
  'The',
  'Fellowship',
  'of',
  'the',
  'Ring'],
 ['Pirates',
  'of',
  'the',
  'Caribbean:',
  'The',
  'Curse',
  'of',
  'the',
  'Black',
  'Pearl'],
 ['Star', 'Wars:', 'Episode', 'V', '-', 'The', 'Empire', 'Strikes', 'Back'],
 ['Star', 'Wars:', 'Episode', 'III', '-', 'Revenge', 'of', 'the', 'Sith'],
 ['The', 'Naked', 'Gun:', 'From', 'the', 'Files', 'of', 'Police', 'Squad!'],
 ['The', 'Lord', 'of', 'the', 'Rings:', 'The', 'Two', 'Towers'],
 ['Harry', 'Potter', 'and', 'the', 'Deathly', 'Hallows:', 'Part', '2'],
 ['Star', 'Wars:', 'Episode', 'VII', '-', 'The', 'Force', 'Awakens'],
 ['Harry', 'Potter', 'and', 'the', 'Deathly', 'Hallows:', 'Part', '1'],
 ['Fried', 'Green', 'Tomatoes', 'at', 'the', 'Whistle', 'Stop', 'Cafe']

In [52]:
def label_BIO(_tokens, _NE):
    """
    Generates BIO (Beginning, Inside, Outside) tags for movie titles in the given tokens.

    Args:
        _tokens (list): List of tokens representing words in a text.
        _NE (list): List of named entities, where each entity is represented as a list of tokens.

    Returns:
        list: List of (token, tag) tuples, one per input token and in input order.

    Comments:
        - The first token of a matched title is tagged 'B-MOV', the remaining tokens of
          that title 'I-MOV', and every token outside a matched title 'O'.
        - Matching is exact and case-sensitive, token by token.
        - At each position the entities are tried in the order they appear in _NE and the
          first one that matches wins; pass the entities longest first to prefer the
          longest title. After a match, scanning resumes right after the matched title,
          so matches never overlap.
        - Empty entities are ignored (they cannot match anything).
        - Entities are indexed by their first token so that a position is only compared
          against titles that start with that token; tokens must therefore be hashable
          (strings are).
    """
    # first token -> entities starting with it, in their original _NE order
    entities_by_first_token = {}
    for named_entity in _NE:
        if len(named_entity) == 0:
            continue
        entities_by_first_token.setdefault(named_entity[0], []).append(named_entity)

    BIO_for_samples = []
    n_tokens = len(_tokens)
    i = 0
    while i < n_tokens:
        token = _tokens[i]

        # Length of the first entity that matches at position i (0 = no match).
        matched_length = 0
        for named_entity in entities_by_first_token.get(token, ()):
            entity_length = len(named_entity)
            if i + entity_length > n_tokens:
                continue  # the title would run past the end of the text
            if all(_tokens[i + j] == named_entity[j] for j in range(1, entity_length)):
                matched_length = entity_length
                break

        if matched_length:
            BIO_for_samples.append((token, 'B-MOV'))
            for j in range(1, matched_length):
                BIO_for_samples.append((_tokens[i + j], 'I-MOV'))
            i += matched_length
        else:
            BIO_for_samples.append((token, 'O'))
            i += 1

    return BIO_for_samples

In [17]:
# Don't change this cell
def print_BIO_res(_BIO):
    """
    Print one line of surrounding context for every entry tagged 'B-MOV'.

    Args:
        _BIO: Indexable sequence of (token, tag) pairs, e.g. the output of label_BIO.

    Comments:
        - For a 'B-MOV' entry at index i, the entries at indices i - 7 .. i + 6 are
          printed on a single line, separated by spaces.
        - Entries tagged 'O' are printed as the bare token; every other entry is
          printed as the whole (token, tag) pair.
        - NOTE(review): the window is not bounds-checked. If _BIO is a list, a 'B-MOV'
          within the first 7 positions produces negative indices (which read from the
          end of the list), and one within the last 6 positions raises IndexError.
    """
    for i in range(len(_BIO)):
        if _BIO[i][1] == 'B-MOV':
            # window: 7 entries before the title start, the start itself, 6 after
            for j in range(i - 7, i + 7):
                if _BIO[j][1] == 'O':
                    print(_BIO[j][0], end=" ")
                else:
                    print(_BIO[j], end=" ")
            print("")

In [18]:
# Don't change this cell
def get_data_from_file(_fn):
    """
    Read a text file and return its content with line breaks flattened.

    Args:
        _fn: Path of the file to read.

    Returns:
        str: The file content with every newline character replaced by a space.
    """
    # No encoding is passed, so open() falls back to the platform default.
    with open(_fn, 'r') as file:
        data = file.read().replace('\n', ' ')
    return data

In [53]:
# token lists of the IMDB top-1000 titles, ordered longest title first
titles_top_1000 = get_top_1000_list()

# get text data from a text file (newlines are replaced by spaces)
# data = get_data_from_file("data/article-about-a-genre.txt")
data = get_data_from_file("article-about-a-genre.txt")
# tokenize text data
tokens = word_tokenize(data)
# tag with BIO using the IMDB top 1000 movie title list
BIO = label_BIO(tokens, titles_top_1000)

# print a window of context around every detected title start ('B-MOV')
print_BIO_res(BIO)

Ten Rings is shaping up to overtake ('Black', 'B-MOV') Widow as the biggest film of 
. With films like Chan ’ s ('Rush', 'B-MOV') Hour ( 1998 ) and Shanghai 
to find its way into hits like ('The', 'B-MOV') ('Matrix', 'I-MOV') ( 1999 ) and Kill 
the trend . Jet Li ’ s ('Hero', 'B-MOV') ( 2002 ) and Fearless ( 
comedies Shaolin Soccer ( 2001 ) and ('Kung', 'B-MOV') ('Fu', 'I-MOV') ('Hustle', 'I-MOV') ( 2004 ) , 
) , and Donnie Yen ’ s ('Ip', 'B-MOV') ('Man', 'I-MOV') ( 2008 ) . Shang-Chi 
