# Q3

*   We will develop a NER system specific to the category of names of the top 1000 movie titles from IMDB.

*   We will evaluate the system on a collection of text likely to contain instances of these named entities.

In [44]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [45]:
import re
import csv
import math
import nltk
nltk.download('brown')
nltk.download('movie_reviews')
from nltk.corpus import brown, movie_reviews
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


In [46]:
def get_top_1000_list():
    """
    Function to extract movie titles from a IMDB-top-1000.csv file.

    Returns:
        list: A list of unique titles of the top 1000 movies
    """
    movie_titles = []
    csv_file_path = 'IMDB-top-1000.csv'
    with open(csv_file_path, 'r', encoding='utf-8') as file:
        reader = csv.reader(file)
        next(reader)
        for row in reader:
          # print (type(row))
          # print("Row is: ", row)
            title = row[1]
            movie_titles.append(title.split())

    # collected_titles =  list(set(movie_titles))  # Return unique movie titles
    collected_titles = movie_titles
    return collected_titles

In [47]:
# Example usage
movie_titles = get_top_1000_list()
print(movie_titles)

[['The', 'Shawshank', 'Redemption'], ['The', 'Godfather'], ['The', 'Dark', 'Knight'], ['The', 'Lord', 'of', 'the', 'Rings:', 'The', 'Return', 'of', 'the', 'King'], ["Schindler's", 'List'], ['The', 'Godfather', 'Part', 'II'], ['12', 'Angry', 'Men'], ['Jai', 'Bhim'], ['Pulp', 'Fiction'], ['Inception'], ['The', 'Lord', 'of', 'the', 'Rings:', 'The', 'Two', 'Towers'], ['Fight', 'Club'], ['The', 'Lord', 'of', 'the', 'Rings:', 'The', 'Fellowship', 'of', 'the', 'Ring'], ['Forrest', 'Gump'], ['The', 'Good,', 'the', 'Bad', 'and', 'the', 'Ugly'], ['Soorarai', 'Pottru'], ['The', 'Matrix'], ['Goodfellas'], ['Star', 'Wars:', 'Episode', 'V', '-', 'The', 'Empire', 'Strikes', 'Back'], ['One', 'Flew', 'Over', 'the', "Cuckoo's", 'Nest'], ['Top', 'Gun:', 'Maverick'], ['Interstellar'], ['City', 'of', 'God'], ['Spirited', 'Away'], ['Saving', 'Private', 'Ryan'], ['The', 'Green', 'Mile'], ['Life', 'Is', 'Beautiful'], ['Seven'], ['Terminator', '2:', 'Judgment', 'Day'], ['The', 'Silence', 'of', 'the', 'Lambs'],

In [48]:
def label_BIO(_tokens, _NE):
    """
    Generates BIO (Beginning, Inside, Outside) tags for movie titles in the given tokens.

    Args:
        _tokens (list): List of tokens representing words in a sentence.
        _NE (list): List of named entities, where each entity is represented as a list of tokens.

    Returns:
        list: List of tuples containing tokens and their corresponding BIO tags.

    Comments:
        - This function searches for movie titles in the tokens and labels them using BIO notation.
        - A movie title is considered to be a named entity, where the first word is labeled as 'B-MOV'
          (Beginning of a movie title) and subsequent words are labeled as 'I-MOV' (Inside a movie title).
        - Non-movie title tokens are labeled as 'O' (Outside any named entity).
        - The function iterates through each token in the tokens list, searching for matches in the named entity list.
          If a match is found, the corresponding tokens are labeled accordingly in the BIO format.
        - It returns a list of tuples, each containing a token and its corresponding BIO tag.
    """
    BIO_for_samples = []

    for token in _tokens:
        found_NE = False  # Flag to check if the token is part of a named entity

        for ne in _NE:
            if token in ne:
                # for i, ne_token in enumerate(ne):
                #     if i == 0:
                #         BIO_for_samples.append((ne_token, 'B-MOV'))  # First token in the movie title
                #     else:
                #         BIO_for_samples.append((ne_token, 'I-MOV'))  # Subsequent tokens in the movie title
                if ne.index(token) == 0:
                  BIO_for_samples.append((token, 'B-MOV'))  # First token in the movie title
                else:
                  BIO_for_samples.append((token, 'I-MOV'))  # Subsequent tokens in the movie title
                found_NE = True
                break

        if not found_NE:
            BIO_for_samples.append((token, 'O'))  # Token is not part of any named entity

    return BIO_for_samples

In [49]:
# Don't change this cell
def print_BIO_res(_BIO):
    for i in range(len(_BIO)):
        if _BIO[i][1] == 'B-MOV':
            for j in range(i - 7, i + 7):
                if _BIO[j][1] == 'O':
                    print(_BIO[j][0], end=" ")
                else:
                    print(_BIO[j], end=" ")
            print("")

In [50]:
# Don't change this cell
def get_data_from_file(_fn):
    with open(_fn, 'r') as file:
        data = file.read().replace('\n', ' ')
    return data

In [51]:
titles_top_1000 = get_top_1000_list()

# get text data from a text file
data = get_data_from_file("article-about-a-genre.txt")
# tokenize text data
tokens = word_tokenize(data)
# tag with BIO using the IMDB top 1000 movie title list
BIO = label_BIO(tokens, titles_top_1000)

print_BIO_res(BIO)

('Ten', 'I-MOV') Rings is shaping up ('to', 'I-MOV') overtake ('Black', 'B-MOV') Widow ('as', 'I-MOV') ('the', 'I-MOV') biggest film ('of', 'I-MOV') 
('the', 'I-MOV') biggest film ('of', 'I-MOV') ('the', 'I-MOV') pandemic . ('A', 'B-MOV') hit ('with', 'I-MOV') critics ('and', 'I-MOV') audience alike 
history almost ('as', 'I-MOV') long cinema itself . ('This', 'B-MOV') history is ('on', 'I-MOV') exciting display ('in', 'I-MOV') 
heroes ('with', 'I-MOV') supernatural martial arts abilities . ('Fight', 'B-MOV') scenes ('in', 'I-MOV') these early films emphasised 
rarely showcased actual martial arts skills . ('This', 'B-MOV') changed ('with', 'I-MOV') ('the', 'I-MOV') transformation ('of', 'I-MOV') Hong 
Five Deadly Venoms ( 1978 ) ('and', 'I-MOV') ('The', 'B-MOV') 36th Chamber ('of', 'I-MOV') Shaolin ( 1978 
style , ('as', 'I-MOV') shown ('in', 'I-MOV') films like ('The', 'B-MOV') ('Big', 'I-MOV') Boss ( 1971 ) ('and', 'I-MOV') 
('The', 'B-MOV') ('Big', 'I-MOV') Boss ( 1971 ) ('and', 'I