In [None]:
import requests
from bs4 import BeautifulSoup
import re

def scrape_wikipedia_movie_page(url, timeout=10):
    """
    Scrapes the text content of a Wikipedia movie page and preprocesses it.

    The text of every <p> element is concatenated, bracketed spans such as
    citation markers ([1], [2]) are removed, every character other than ASCII
    letters, digits, whitespace, '.' and ',' is dropped, and runs of
    whitespace are collapsed to single spaces.

    Args:
        url (str): URL of the Wikipedia movie page.
        timeout (float): Seconds to wait for the server before giving up.
    Returns:
        str: Cleaned and extracted text from the page, or None when the page
        could not be fetched (the error is printed in that case).
    """
    # Only the network call is guarded, so a failure to fetch the page is
    # reported without also hiding programming errors in the parsing below.
    try:
        # Pass an explicit timeout so an unresponsive server cannot stall
        # the caller forever.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Turn HTTP 4xx/5xx into an exception
    except requests.RequestException as e:
        print(f"An error occurred: {e}")
        return None

    # Parse the HTML and collect the text of every paragraph in one pass
    soup = BeautifulSoup(response.text, 'html.parser')
    page_text = "".join(para.get_text() for para in soup.find_all('p'))

    # Preprocess the text: remove citations and special characters
    cleaned_text = re.sub(r'\[.*?\]', '', page_text)  # Remove citations like [1], [2]
    # NOTE: this also strips apostrophes, hyphens and all non-ASCII letters.
    cleaned_text = re.sub(r'[^a-zA-Z0-9\s.,]', '', cleaned_text)
    return re.sub(r'\s+', ' ', cleaned_text).strip()  # Normalize whitespace

# Example usage: point `url` at the Wikipedia article of the movie to analyse.
url = "https://en.wikipedia.org/wiki/Kaho_Na..._Pyaar_Hai"
movie_text = scrape_wikipedia_movie_page(url)

# The scraper hands back None on failure, so only preview real text.
if movie_text:
    # Show just the first 1000 characters to keep the output short.
    print("Extracted Movie Text:", movie_text[:1000], sep="\n")


In [None]:
import spacy
from collections import Counter

# Load the pre-trained "en_core_web_sm" spaCy pipeline once, at module level;
# analyze_gender_stereotypes() reuses this shared `nlp` object on every call.
nlp = spacy.load("en_core_web_sm")

def analyze_gender_stereotypes(text):
    """
    Analyzes gender roles and stereotypes in the text.

    Pronouns are matched as whole tokens (case-insensitively), both for the
    counts and for picking the sample sentences, so words that merely contain
    a pronoun ("the", "there", "this", "other") are not treated as hits.

    Args:
        text (str): Cleaned movie text. None or "" is treated as empty text.
    Returns:
        dict: Gender stereotype analysis results with the keys
            "male_pronoun_count" / "female_pronoun_count": Counter mapping
                each matched pronoun to its number of occurrences.
            "professions_mentioned": texts of the named entities labelled
                PERSON, ORG or TITLE (entity names, not job titles as such).
            "male_contexts" / "female_contexts": up to five sentences that
                contain at least one pronoun of that group.
    """
    # "hers" is included so both groups cover the same grammatical forms
    # ("his" already serves as the male counterpart of both "her" and "hers").
    male_pronouns = frozenset(('he', 'him', 'his'))
    female_pronouns = frozenset(('she', 'her', 'hers'))

    def _mentions(sent, pronouns):
        # True when any token of the sentence is one of the given pronouns.
        return any(token.text.lower() in pronouns for token in sent)

    # Process the text with SpaCy; a None input is analysed as empty text
    doc = nlp(text or "")

    # Pronoun analysis: lowercase every token once, then count per group
    lowered = [token.text.lower() for token in doc]
    male_count = Counter(word for word in lowered if word in male_pronouns)
    female_count = Counter(word for word in lowered if word in female_pronouns)

    # Named Entity Recognition
    # NOTE(review): the stock English pipeline does not appear to emit a
    # 'TITLE' label, so in practice this selects PERSON and ORG - confirm.
    professions = [
        ent.text for ent in doc.ents if ent.label_ in ('PERSON', 'ORG', 'TITLE')
    ]

    # Context analysis: extract sentences with gendered pronouns
    sentences = list(doc.sents)
    male_contexts = [s.text for s in sentences if _mentions(s, male_pronouns)]
    female_contexts = [s.text for s in sentences if _mentions(s, female_pronouns)]

    # Results
    return {
        "male_pronoun_count": male_count,
        "female_pronoun_count": female_count,
        "professions_mentioned": professions,
        "male_contexts": male_contexts[:5],  # Limit output for readability
        "female_contexts": female_contexts[:5],  # Limit output for readability
    }

# Example usage: analyse the scraped text, skipping the step when there is none.
if movie_text:
    analysis_results = analyze_gender_stereotypes(movie_text)

    # Headline figures, one per line.
    print(
        "Gender Stereotype Analysis:",
        f"Male Pronoun Counts: {analysis_results['male_pronoun_count']}",
        f"Female Pronoun Counts: {analysis_results['female_pronoun_count']}",
        f"Professions Mentioned: {analysis_results['professions_mentioned']}",
        sep="\n",
    )

    # Sample sentences for each pronoun group, one sentence per line.
    print("\nSample Male Contexts:")
    print(*analysis_results['male_contexts'], sep="\n")
    print("\nSample Female Contexts:")
    print(*analysis_results['female_contexts'], sep="\n")
