In [None]:


# Step 1: Install required libraries
%pip install transformers datasets nltk spacy langdetect textblob googletrans==4.0.0-rc1 matplotlib seaborn scikit-learn ipywidgets
%python -m spacy download en_core_web_sm
%python -m spacy download de_core_news_sm
%python -m nltk.downloader punkt stopwords vader_lexicon

# Step 2: Import all necessary libraries
import random
import pandas as pd
import numpy as np
import spacy
import nltk
from langdetect import detect
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from transformers import pipeline
from textblob import TextBlob
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import HTML, display
import ipywidgets as widgets
from googletrans import Translator

# Load spaCy models
# Module-level pipelines shared by preprocess_text() and extract_entities():
# nlp_en handles language code 'en', nlp_de handles 'de'. Both model packages
# are the ones requested by the `spacy download` setup commands above.
nlp_en = spacy.load('en_core_web_sm')
nlp_de = spacy.load('de_core_news_sm')

# Step 3: Define all necessary functions

# Function for language detection
def detect_language(text):
    """Return the language code of *text*, restricted to 'en' or 'de'.

    The rest of the file only has models for English and German, so every
    other detected language is mapped to 'en'. English is also returned when
    *text* is blank, is not a string, or the detector raises.

    Args:
        text: The text whose language should be detected.

    Returns:
        'de' if the detector reports German, otherwise 'en'.
    """
    # Blank or non-string input carries no signal; skip the detector entirely
    if not isinstance(text, str) or not text.strip():
        return 'en'

    try:
        lang = detect(text)
    except Exception:
        # Best-effort: detection failure defaults to English.
        # `Exception` (not a bare except) lets KeyboardInterrupt through.
        return 'en'

    # For simplicity, map all languages to either 'en' or 'de'
    return lang if lang in ('en', 'de') else 'en'

# Tokenization and normalization
def preprocess_text(text, language):
    """Turn *text* into a list of normalized tokens.

    For 'en' and 'de' the matching spaCy pipeline is used: stop words and
    non-alphabetic tokens are dropped and the lowercased lemma of each
    remaining token is returned. For any other language code the text is
    tokenized with NLTK and the lowercased alphabetic tokens are returned
    (no lemmatization, no stop-word removal).

    Args:
        text: The raw text to process.
        language: Language code; 'en' and 'de' select a spaCy pipeline.

    Returns:
        A list of token strings.
    """
    if language not in ('en', 'de'):
        # No spaCy model for this language: fall back to NLTK
        return [word.lower() for word in word_tokenize(text) if word.isalpha()]

    model = nlp_en if language == 'en' else nlp_de

    lemmas = []
    for tok in model(text):
        if tok.is_stop or not tok.is_alpha:
            continue
        lemmas.append(tok.lemma_.lower())
    return lemmas

# Named entity recognition
def extract_entities(text, language):
    """Return the named entities spaCy finds in *text*.

    Args:
        text: The text to scan.
        language: Language code; only 'en' and 'de' have a loaded pipeline.

    Returns:
        A list of ``(entity_text, entity_label)`` tuples, or an empty list
        when *language* is neither 'en' nor 'de'.
    """
    if language not in ('en', 'de'):
        return []

    model = nlp_en if language == 'en' else nlp_de

    found = []
    for span in model(text).ents:
        found.append((span.text, span.label_))
    return found

# Simple entity linking (using spaCy NER)
def link_entities(text, language):
    """Return the entities of *text*; a placeholder for real entity linking.

    No knowledge-base lookup happens here: the result is exactly what
    extract_entities() returns for the same arguments.

    Args:
        text: The text to scan.
        language: Language code passed through to extract_entities().

    Returns:
        A list of ``(entity_text, entity_label)`` tuples.
    """
    # A real project would resolve these against Wikidata or another KB
    return extract_entities(text, language)

# Simple sentiment analysis using TextBlob
def analyze_sentiment_textblob(text, language):
    """Return the TextBlob polarity of *text*, or 0 when it cannot be scored.

    Both 'en' and 'de' go through plain ``TextBlob`` (TextBlobDE is avoided
    on purpose to keep dependencies down).
    NOTE(review): German text is therefore scored by TextBlob's default
    analyzer, which is presumably English-oriented -- treat 'de' polarities
    as rough.

    Args:
        text: The text to score.
        language: Language code; anything other than 'en'/'de' yields 0.

    Returns:
        The polarity reported by TextBlob, or 0 for unsupported languages
        and for any scoring failure.
    """
    if language not in ('en', 'de'):
        return 0

    try:
        return TextBlob(text).sentiment.polarity
    except Exception:
        # Best-effort metric: a failure counts as neutral. `Exception`
        # (not a bare except) keeps KeyboardInterrupt/SystemExit working.
        return 0

# Setup sentiment analyzer using transformers
def setup_sentiment_analyzer():
    """Build the sentiment analyzer used by the pipeline.

    Tries to load the multilingual transformers model first. If that fails
    for any reason, the failure is printed and a TextBlob-based substitute
    with the same call shape is returned instead.

    Returns:
        A callable taking a text and returning a one-element list of
        ``{'label': ..., 'score': ...}`` dicts. The fallback emits the
        labels 'POSITIVE', 'NEGATIVE' and 'NEUTRAL'.
    """
    # Load multilingual sentiment analysis model
    model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
    try:
        return pipeline("sentiment-analysis", model=model_name)
    except Exception as exc:
        # Report why the model is unavailable instead of failing silently;
        # `Exception` (not a bare except) lets KeyboardInterrupt through.
        print(f"Could not load sentiment model '{model_name}': {exc}")

    # Fallback function if transformer model fails
    def fallback_analyzer(text):
        polarity = TextBlob(text).sentiment.polarity
        # Polarities within +/-0.1 of zero are treated as neutral
        if polarity > 0.1:
            return [{'label': 'POSITIVE', 'score': polarity}]
        if polarity < -0.1:
            return [{'label': 'NEGATIVE', 'score': abs(polarity)}]
        return [{'label': 'NEUTRAL', 'score': 0.5}]

    return fallback_analyzer

# Initialize sentiment analyzer
# Built once at module level; cultural_analysis_pipeline() calls it as
# sentiment_analyzer(text)[0].
sentiment_analyzer = setup_sentiment_analyzer()

# Simple bias detection based on word frequencies
def detect_bias(texts, language, bias_words=None):
    """Count occurrences of bias-related words in *texts*, per category.

    Args:
        texts: Iterable of raw text documents.
        language: Language code; selects the built-in word lists ('en' or
            'de') when *bias_words* is not given.
        bias_words: Optional mapping ``category -> list of lowercase words``
            that replaces the built-in lists.

    Returns:
        A dict mapping each category to the total number of occurrences (a
        plain ``int``) of its words across all texts. Every category scores
        0 when there is nothing to count or vectorization fails. With no
        *bias_words* and an unsupported language, the default categories
        ('gender', 'race', 'age') are returned with score 0.
    """
    # Default bias words (simplified example)
    default_bias_words = {
        'en': {
            'gender': ['he', 'she', 'man', 'woman', 'boy', 'girl'],
            'race': ['black', 'white', 'asian', 'hispanic'],
            'age': ['young', 'old', 'elderly', 'teen'],
        },
        'de': {
            'gender': ['er', 'sie', 'mann', 'frau', 'junge', 'mädchen'],
            'race': ['schwarz', 'weiß', 'asiatisch'],
            'age': ['jung', 'alt', 'ältere', 'jugendlich'],
        },
    }

    if bias_words is None:
        bias_words = default_bias_words.get(language)
    if bias_words is None:
        # Unsupported language and no custom lists: nothing to look for
        return {'gender': 0, 'race': 0, 'age': 0}

    # The fallback mirrors the categories actually requested, so custom
    # bias_words never come back with the default category names
    zero_scores = {category: 0 for category in bias_words}
    if not texts:
        return zero_scores

    # Create a document-term matrix
    try:
        vectorizer = CountVectorizer(lowercase=True)
        dtm = vectorizer.fit_transform(texts)
    except Exception as exc:
        # e.g. ValueError for an empty vocabulary (blank / stop-word-only text)
        print(f"Bias detection error: {exc}")
        return zero_scores

    # vocabulary_ maps term -> column index: one dict lookup per word
    # instead of scanning the feature-name array
    vocabulary = vectorizer.vocabulary_
    column_totals = np.asarray(dtm.sum(axis=0)).ravel()

    return {
        category: int(sum(
            column_totals[vocabulary[word]]
            for word in words
            if word in vocabulary
        ))
        for category, words in bias_words.items()
    }

# Initialize translator
# Single module-level googletrans client; translate_text() reuses it for
# every request.
translator = Translator()

# Translation helper
def translate_text(text, source_lang, target_lang):
    """Translate *text* from *source_lang* to *target_lang*.

    Args:
        text: The text to translate.
        source_lang: Language code of *text*.
        target_lang: Language code to translate into.

    Returns:
        The translated text. If the translator raises, the error is printed
        and the literal string "Translation failed" is returned instead.
    """
    try:
        # Reading `.text` stays inside the try so a malformed response is
        # handled the same way as a failed request
        return translator.translate(text, src=source_lang, dest=target_lang).text
    except Exception as err:
        print(f"Translation error: {err}")
    return "Translation failed"

# Function for simple translation quality assessment
def assess_translation_quality(original, translation, target_lang):
    """Score a translation with three cheap heuristics.

    The source language is inferred from *target_lang*: 'en' when the
    target is 'de', otherwise 'de'.

    Args:
        original: The source text.
        translation: The translated text.
        target_lang: Language code of *translation*.

    Returns:
        A dict with:
          * 'length_ratio': len(translation) / len(original), or 0 for an
            empty original.
          * 'entity_preservation': number of entities found in the
            translation divided by the number found in the original, or 1
            when the original has none. May exceed 1.
          * 'sentiment_difference': absolute TextBlob polarity difference.
          * 'overall_quality': mean of the three component scores, each
            clamped to [0, 1] so the result always lies in [0, 1].
        If anything raises, the error is printed and every value is 0.5.
    """
    source_lang = 'en' if target_lang == 'de' else 'de'

    try:
        # 1. Length ratio (very simple metric)
        length_ratio = len(translation) / len(original) if len(original) > 0 else 0

        # 2. Entity preservation (check if entities are preserved)
        original_entities = extract_entities(original, source_lang)
        translation_entities = extract_entities(translation, target_lang)
        if original_entities:
            entity_preservation = len(translation_entities) / len(original_entities)
        else:
            entity_preservation = 1

        # 3. Sentiment preservation
        original_sentiment = analyze_sentiment_textblob(original, source_lang)
        translation_sentiment = analyze_sentiment_textblob(translation, target_lang)
        sentiment_diff = abs(original_sentiment - translation_sentiment)

        # Combine metrics (simple average). Each component is clamped to
        # [0, 1]: extra entities, a translation over twice as long, or a
        # polarity flip would otherwise push the average outside that range.
        components = (
            1 - abs(1 - length_ratio),
            entity_preservation,
            1 - sentiment_diff,
        )
        quality_score = sum(min(1.0, max(0.0, part)) for part in components) / 3

        return {
            'length_ratio': length_ratio,
            'entity_preservation': entity_preservation,
            'sentiment_difference': sentiment_diff,
            'overall_quality': quality_score
        }
    except Exception as exc:
        # Fallback if assessment fails; report the cause instead of hiding it
        print(f"Translation quality assessment error: {exc}")
        return {
            'length_ratio': 0.5,
            'entity_preservation': 0.5,
            'sentiment_difference': 0.5,
            'overall_quality': 0.5
        }

# Main pipeline function
def cultural_analysis_pipeline(text):
    """Run the full analysis on *text* and return the collected results.

    Steps: language detection, preprocessing, sentiment (TextBlob and the
    module-level ``sentiment_analyzer``), entity extraction, bias word
    counting, translation to the other supported language (en <-> de) and
    a heuristic quality score for that translation. Each intermediate
    result is also printed.

    Args:
        text: The text to analyze.

    Returns:
        A dict with the keys 'language', 'tokens', 'sentiment' (with
        'textblob' and 'transformer'), 'entities', 'bias' and 'translation'
        (with 'text' and 'quality'). If any step raises, the error is
        printed and a default structure with language 'unknown' is returned.
        When translation fails, 'quality' holds all-zero metrics rather
        than a score computed on the failure message.
    """
    # Quality metrics reported whenever there is no usable translation
    no_quality = {
        'length_ratio': 0,
        'entity_preservation': 0,
        'sentiment_difference': 0,
        'overall_quality': 0
    }

    try:
        # Step 1: Language detection
        language = detect_language(text)
        print(f"Detected language: {language}")

        # Step 2: Preprocessing
        tokens = preprocess_text(text, language)
        print(f"Preprocessed tokens: {tokens[:10]}...")

        # Step 3: Sentiment analysis
        sentiment_tb = analyze_sentiment_textblob(text, language)
        sentiment_tf = sentiment_analyzer(text)[0]
        print(f"Sentiment (TextBlob): {sentiment_tb}")
        print(f"Sentiment (Transformer): {sentiment_tf}")

        # Step 4: Entity extraction and linking
        entities = link_entities(text, language)
        print(f"Entities: {entities}")

        # Step 5: Bias detection
        bias_scores = detect_bias([text], language)
        print(f"Bias scores: {bias_scores}")

        # Step 6: Translation into the other supported language
        target_lang = 'de' if language == 'en' else 'en'
        translation = translate_text(text, language, target_lang)
        print(f"Translation: {translation}")

        # Step 7: Translation quality assessment
        # translate_text() signals failure with this sentinel string;
        # scoring the sentinel as if it were a translation is meaningless.
        if translation == "Translation failed":
            quality = dict(no_quality)
        else:
            quality = assess_translation_quality(text, translation, target_lang)
        print(f"Translation quality: {quality}")

        # Return comprehensive results
        return {
            'language': language,
            'tokens': tokens,
            'sentiment': {
                'textblob': sentiment_tb,
                'transformer': sentiment_tf
            },
            'entities': entities,
            'bias': bias_scores,
            'translation': {
                'text': translation,
                'quality': quality
            }
        }
    except Exception as e:
        print(f"Error in pipeline: {e}")
        # Return a default result structure if the pipeline fails
        return {
            'language': 'unknown',
            'tokens': [],
            'sentiment': {
                'textblob': 0,
                'transformer': {'label': 'NEUTRAL', 'score': 0.5}
            },
            'entities': [],
            'bias': {'gender': 0, 'race': 0, 'age': 0},
            'translation': {
                'text': 'Translation failed',
                'quality': dict(no_quality)
            }
        }

# Build the notebook UI
def create_simple_ui():
    """Display a small ipywidgets front end for the analysis pipeline.

    Shows a title, a text area, an "Analyze Text" button and an output
    area. Clicking the button clears the output area, runs
    cultural_analysis_pipeline() on the current text and prints a summary
    of the returned results into the output area.
    """
    # Everything printed while handling a click is captured here
    output = widgets.Output()

    box_layout = widgets.Layout(width='100%', height='100px')
    text_input = widgets.Textarea(
        value='Enter text to analyze',
        placeholder='Type something',
        description='Text:',
        disabled=False,
        layout=box_layout
    )

    analyze_button = widgets.Button(
        description='Analyze Text',
        disabled=False,
        button_style='success',
        tooltip='Click to analyze',
        icon='check'
    )

    def handle_click(_button):
        """Run the pipeline on the current input and print a summary."""
        with output:
            output.clear_output()
            print("Analyzing text...")
            report = cultural_analysis_pipeline(text_input.value)

            sentiment = report['sentiment']
            translated = report['translation']

            # Display results
            print(f"Language: {report['language']}")
            print(f"Sentiment (TextBlob): {sentiment['textblob']}")
            print(f"Sentiment (Transformer): {sentiment['transformer']}")
            print(f"Entities: {report['entities']}")
            print(f"Bias scores: {report['bias']}")
            print(f"Translation: {translated['text']}")
            print(f"Translation quality: {translated['quality']}")

    analyze_button.on_click(handle_click)

    # Title first, then the widgets in reading order
    display(HTML('Cultural Analysis Tool'))
    display(text_input, analyze_button, output)

# Call the function to create and display the UI
# (runs as soon as the cell executes; the widgets appear in the cell output)
create_simple_ui()

     
