In [157]:
import nltk
from nltk.corpus import stopwords
import pandas as pd
import re
import spacy

# Load the spaCy English language model
nlp = spacy.load("en_core_web_sm")

# Download necessary NLTK data packages
nltk.download('stopwords')
nltk.download('punkt')

# Load stop words
stop_words = set(stopwords.words('english'))

# Load JSON data into DataFrame
df = pd.read_json("/content/gg2013.json")

# Optimized text cleaning function
def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'https?:\S+|www\.\S+', '', text)  # Remove URLs
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'\brt\b', '', text)  # Remove "RT"
    text = re.sub(r'@\w+', '', text)  # Remove mentions
    text = re.sub(r'#\w+', '', text)  # Remove hashtags
    text = re.sub(r'\s+', ' ', text)  # Remove extra whitespace
    text = re.sub(r'[!"#$%&\'()*+,\-./:;<=>?@[\\\]^_`{|}~]', ' ', text)  # Remove punctuation
    return text.strip()

# Process text data and add a new column for cleaned text
df['cleaned_text'] = df['text'].apply(clean_text)

# Function to extract proper nouns (potential names) using spaCy
def get_proper_nouns(text):
    spacy_doc = nlp(text)
    names = []
    temp_name = []

    for token in spacy_doc:
        if token.tag_ == "NNP" and not token.is_stop and token.text not in stop_words and len(token.text) > 1:
            temp_name.append(token.text)
        else:
            if temp_name:
                names.append(" ".join(temp_name))
                temp_name = []

    if temp_name:
        names.append(" ".join(temp_name))

    return names

# Function to extract award names
def extract_award_name(text):
    pattern = re.compile(r'best (?:actress|actor|screenplay|motion picture|director|screenplay|original score|original song|animated feature film)(?: (?:in a )?(?:motion picture|mini-series|tv|movie|film|drama|comedy or musical))?', re.IGNORECASE)
    matches = pattern.findall(text)
    return ', '.join(set(matches)) if matches else None

# Apply function to DataFrame to extract award names
df['award_names'] = df['cleaned_text'].apply(extract_award_name)

# Filter rows with extracted award names
df_with_awards = df[df['award_names'].notna()]

# Display value counts of award names
award_name_counts = df_with_awards['award_names'].value_counts()

# Function to extract entities based on keyword and return value counts
def extract_entities(find_str, df):
    filtered_df = df[df["cleaned_text"].str.contains(find_str, case=False)].reset_index(drop=True)
    entities_list = filtered_df["cleaned_text"].apply(lambda x: get_proper_nouns(x))
    all_entities = [entity for sublist in entities_list for entity in sublist]
    entity_counts_df = pd.DataFrame(all_entities, columns=['entity'])
    return entity_counts_df['entity'].value_counts()

# Function to fetch hosts
def fetch_hosts(df):
    host_counts = extract_entities("host|Host", df)
    top_hosts = host_counts[:2].index.tolist()
    return ' and '.join(top_hosts)

# Function to fetch nominees for each award
def fetch_nominees(df_with_awards):
    nominees = {}
    for award in df_with_awards['award_names'].unique():
        award_counts = extract_entities(award, df)
        top_nominees = award_counts[:5].index.tolist()
        nominees[award] = ', '.join(top_nominees)
    return nominees

# Function to fetch presenters
def fetch_presenters(df):
    presenter_counts = extract_entities("presenter|Presenter", df)
    top_presenters = presenter_counts[:5].index.tolist()
    return ' and '.join(top_presenters)

# Function to fetch winners
def fetch_winners(df_with_awards):
    winners = {}
    for award in df_with_awards['award_names'].unique():
        award_counts = extract_entities(f"wins {award}|won {award}", df)
        if not award_counts.empty:
            winners[award] = award_counts.index[0]
    return winners

# Function to fetch most mentioned name
def fetch_most_mentioned_name(df):
    name_counts = extract_entities("name|Name", df)
    name_counts = name_counts[name_counts > 1]  # Filter out single occurrences
    return name_counts.index[0] if not name_counts.empty else None

# Function to fetch most mentioned movie
def fetch_most_mentioned_movie(df):
    movie_counts = extract_entities("movie|Movie|film|Film", df)
    return movie_counts.index[0] if not movie_counts.empty else None

# Function to fetch most deserved winner
def fetch_most_deserved_name(df):
    deserved_counts = extract_entities("deserved|Deserved", df)
    return deserved_counts.index[0] if not deserved_counts.empty else None

# Function to aggregate results and print them
def print_results(df, df_with_awards):
    results = {}
    hosts = fetch_hosts(df)
    results["Host"] = hosts

    nominees = fetch_nominees(df_with_awards)
    winners = fetch_winners(df_with_awards)
    presenters = fetch_presenters(df)

    for award in df_with_awards['award_names'].unique():
        results[award] = {
            "Presenters": presenters.split(' and '),
            "Nominees": nominees.get(award, "").split(', '),
            "Winner": winners.get(award, "Unknown")
        }

    most_mentioned_name = fetch_most_mentioned_name(df)
    most_mentioned_movie = fetch_most_mentioned_movie(df)
    most_deserved_name = fetch_most_deserved_name(df)

    results["Most Mentioned Name"] = most_mentioned_name
    results["Most Mentioned Movie"] = most_mentioned_movie
    results["Most Deserved Winner"] = most_deserved_name

    # Print human-readable results
    for award, details in results.items():
        if isinstance(details, dict):
            print(f'"{award}" : {{')
            print(f'    "Presenters" : {details["Presenters"]},')
            print(f'    "Nominees" : {details["Nominees"]},')
            print(f'    "Winner" : "{details["Winner"]}"')
            print('},')
        else:
            print(f'{award}: {details}')

# Execute functions and print results
print_results(df, df_with_awards)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Host: tina fey and tina
"best actress" : {
    "Presenters" : ['bill clinton', 'united states', 'nd president', 'amp', 'tommy lee jones'],
    "Nominees" : ['jennifer lawrence', 'jessica chastain', 'julianne moore', 'silver', 'lena dunham'],
    "Winner" : "jennifer lawrence"
},
"best actor" : {
    "Presenters" : ['bill clinton', 'united states', 'nd president', 'amp', 'tommy lee jones'],
    "Nominees" : ['hugh', 'daniel', 'lewis', 'cheadle', 'kevin costner'],
    "Winner" : "hugh"
},
"best actress drama" : {
    "Presenters" : ['bill clinton', 'united states', 'nd president', 'amp', 'tommy lee jones'],
    "Nominees" : ['jessica chastain', 'jennifer lawrence', 'watts', 'lea michele', 'tan overdose'],
    "Winner" : "jessica chastain"
},
"best screenplay" : {
    "Presenters" : ['bill clinton', 'united states', 'nd president', 'amp', 'tommy lee jones'],
    "Nominees" : ['quentin', 'django', 'robert pattinson', 'amanda', 'quentin tarantino'],
    "Winner" : "quentin"
},
"best directo