# Q3

*   We will develop a NER system specific to the category of names of the top 1000 movie titles from IMDB.

*   We will evaluate the system on a collection of text likely to contain instances of these named entities.

In [1]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [2]:
import re
import csv
import math
import nltk
nltk.download('brown')
nltk.download('movie_reviews')
from nltk.corpus import brown, movie_reviews
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.


In [3]:
def get_top_1000_list():
    """
    Function to extract movie titles from a IMDB-top-1000.csv file.

    Returns:
        list: A list of unique titles of the top 1000 movies
    """
    titles = []

    # Read the CSV file
    with open("data/IMDB-top-1000.csv", newline="", encoding="utf-8") as csvfile:
        reader = csv.reader(csvfile)
        next(reader)  # Skip the header row
        for row in reader:
            # Assuming the title is in the first column
            title = row[1].strip()  # Remove leading/trailing whitespaces
            titles.append(title)
    # Remove duplicate titles and return the list
    collected_titles = list(set(titles))

    return collected_titles

In [4]:
def label_BIO(_tokens, _NE):
    """
    Generates BIO (Beginning, Inside, Outside) tags for movie titles in the given tokens.

    Args:
        _tokens (list): List of tokens representing words in a sentence.
        _NE (list): List of named entities, where each entity is represented as a list of tokens.

    Returns:
        list: List of tuples containing tokens and their corresponding BIO tags.

    Comments:
        - This function searches for movie titles in the tokens and labels them using BIO notation.
        - A movie title is considered to be a named entity, where the first word is labeled as 'B-MOV'
          (Beginning of a movie title) and subsequent words are labeled as 'I-MOV' (Inside a movie title).
        - Non-movie title tokens are labeled as 'O' (Outside any named entity).
        - The function iterates through each token in the tokens list, searching for matches in the named entity list.
          If a match is found, the corresponding tokens are labeled accordingly in the BIO format.
        - It returns a list of tuples, each containing a token and its corresponding BIO tag.
    """
    BIO_for_samples = []
    named_entities = [title.split() for title in _NE]
    # find words in NE to look up faster
    named_entity_tokens = set(token for ne in named_entities for token in ne)

    for token in _tokens:
        if token in named_entity_tokens:
            # find if token is part of NE
            if any(token == ne[0] for ne in named_entities):
                # if the token is first part of NE
                BIO_for_samples.append((token, "B-MOV"))
            else:
                # if token is inside NE
                BIO_for_samples.append((token, "I-MOV"))
        else:
            # if token is not in NE
            BIO_for_samples.append((token, "O"))

    return BIO_for_samples


In [5]:
# Don't change this cell
def print_BIO_res(_BIO):
    for i in range(len(_BIO)):
        if _BIO[i][1] == 'B-MOV':
            for j in range(i - 7, i + 7):
                if _BIO[j][1] == 'O':
                    print(_BIO[j][0], end=" ")
                else:
                    print(_BIO[j], end=" ")
            print("")

In [6]:
# Don't change this cell
def get_data_from_file(_fn):
    with open(_fn, 'r') as file:
        data = file.read().replace('\n', ' ')
    return data

In [7]:
titles_top_1000 = get_top_1000_list()

# get text data from a text file
data = get_data_from_file("data/article-about-a-genre.txt")
# tokenize text data
tokens = word_tokenize(data)
# tag with BIO using the IMDB top 1000 movie title list
BIO = label_BIO(tokens, titles_top_1000)

print_BIO_res(BIO)

generation ('of', 'I-MOV') kung fu cinema fans . ('From', 'B-MOV') Bruce Lee ('to', 'I-MOV') Shang-Chi : ('a', 'I-MOV') 
('Ten', 'I-MOV') Rings is shaping up ('to', 'I-MOV') overtake ('Black', 'B-MOV') Widow ('as', 'I-MOV') ('the', 'I-MOV') biggest film ('of', 'I-MOV') 
('the', 'I-MOV') biggest film ('of', 'I-MOV') ('the', 'I-MOV') pandemic . ('A', 'B-MOV') hit ('with', 'I-MOV') critics ('and', 'I-MOV') audience alike 
history almost ('as', 'I-MOV') long cinema itself . ('This', 'B-MOV') history is ('on', 'I-MOV') exciting display ('in', 'I-MOV') 
heroes ('with', 'I-MOV') supernatural martial arts abilities . ('Fight', 'B-MOV') scenes ('in', 'I-MOV') these early films emphasised 
rarely showcased actual martial arts skills . ('This', 'B-MOV') changed ('with', 'I-MOV') ('the', 'I-MOV') transformation ('of', 'I-MOV') Hong 
Five Deadly Venoms ( 1978 ) ('and', 'I-MOV') ('The', 'B-MOV') 36th Chamber ('of', 'I-MOV') Shaolin ( 1978 
style , ('as', 'I-MOV') shown ('in', 'I-MOV') films like ('T