<a href="https://colab.research.google.com/github/FennecLadd/multilingual_content_generator/blob/main/multilingual_content_generator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install required packages.
# NOTE(review): versions are unpinned, so a fresh run may resolve different
# releases than the ones this notebook was last executed with.
!pip install transformers nltk spacy fastapi uvicorn pyngrok tensorflow torch
# Small English spaCy pipeline; EntityExtractor loads it by name ("en_core_web_sm").
!python -m spacy download en_core_web_sm

# Download necessary NLTK data
import nltk
nltk.download('punkt')
# Lexicon required by nltk.sentiment.SentimentIntensityAnalyzer (VADER).
nltk.download('vader_lexicon')

# Create project directory structure.
# models/ and utils/ receive the modules written by the %%writefile cells below.
!mkdir -p models api utils data config

# Import commonly used libraries
import os
import numpy as np
import tensorflow as tf
import torch
from transformers import pipeline, MarianMTModel, MarianTokenizer
import spacy
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

Collecting fastapi
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn
  Downloading uvicorn-0.34.0-py3-none-any.whl.metadata (6.5 kB)
Collecting pyngrok
  Downloading pyngrok-7.2.3-py3-none-any.whl.metadata (8.7 kB)
Collecting starlette<0.47.0,>=0.40.0 (from fastapi)
  Downloading starlette-0.46.1-py3-none-any.whl.metadata (6.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-c

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


In [None]:
%%writefile models/content_generator.py
from transformers import pipeline, set_seed
import torch

class ContentGenerator:
    """Thin wrapper around a Hugging Face ``text-generation`` pipeline."""

    # GPT-2's end-of-text token id. Used only as a fallback when the loaded
    # tokenizer does not expose an ``eos_token_id`` of its own.
    _FALLBACK_PAD_TOKEN_ID = 50256

    def __init__(self, model_name="gpt2"):
        """
        Initialize the content generator with a pre-trained model

        Args:
            model_name: Model identifier handed to ``transformers.pipeline``
                (default: "gpt2")
        """
        self.generator = pipeline('text-generation', model=model_name)
        set_seed(42)  # For reproducibility

    def _resolve_pad_token_id(self):
        """Return the pad token id matching the loaded model's tokenizer."""
        tokenizer = getattr(self.generator, "tokenizer", None)
        eos_token_id = getattr(tokenizer, "eos_token_id", None)
        if eos_token_id is None:
            return self._FALLBACK_PAD_TOKEN_ID
        return eos_token_id

    def generate_content(self, prompt, max_length=250, num_return_sequences=1):
        """
        Generate content based on the given prompt

        Args:
            prompt: Text the model continues
            max_length: Forwarded to the pipeline as ``max_length``
            num_return_sequences: Number of continuations to request

        Returns:
            List of generated strings (one per returned sequence), or an empty
            list if generation failed; the error is printed, not raised.
        """
        try:
            generated_texts = self.generator(
                prompt,
                max_length=max_length,
                num_return_sequences=num_return_sequences,
                # Pad with the model's own EOS token instead of a hard-coded
                # GPT-2 id, so a non-default ``model_name`` is padded correctly.
                pad_token_id=self._resolve_pad_token_id()
            )

            return [item['generated_text'] for item in generated_texts]
        except Exception as e:
            # Best-effort contract: callers check for an empty list.
            print(f"Error generating content: {str(e)}")
            return []

Overwriting models/content_generator.py


In [None]:
%%writefile models/local_translator.py
from transformers import MarianMTModel, MarianTokenizer
import torch
import os

class LocalTranslator:
    """Translate English text with lazily loaded Helsinki-NLP MarianMT models."""

    def __init__(self):
        """Initialize the local translator"""
        # Per-target-language caches, filled on first use.
        self.models = {}
        self.tokenizers = {}

        # Map of language codes to model names
        self.language_map = {
            "es": "en-es",
            "fr": "en-fr",
            "de": "en-de",
            "it": "en-it",
            "ru": "en-ru",
            "zh": "en-zh"
        }

    def _load_model_for_language(self, target_language):
        """
        Load the model for the specified language pair if not already loaded

        Args:
            target_language: Target language code (e.g. 'es')

        Returns:
            (model, tokenizer) tuple for the language. For languages other
            than es/fr/de whose model cannot be loaded, the Spanish pair is
            returned instead.

        Raises:
            Exception: re-raised from the loader when es, fr or de fail to load.
        """
        if target_language not in self.models:
            # Map language code to model name
            lang_pair = self.language_map.get(target_language, f"en-{target_language}")
            model_name = f"Helsinki-NLP/opus-mt-{lang_pair}"

            try:
                # Load into locals first so a failure half-way through never
                # leaves a tokenizer cached without its model.
                tokenizer = MarianTokenizer.from_pretrained(model_name)
                model = MarianMTModel.from_pretrained(model_name)

                # Move to GPU if available
                if torch.cuda.is_available():
                    model.to('cuda')
            except Exception as e:
                print(f"Error loading model for {target_language}: {str(e)}")
                # Fallback to a similar language or generic model
                # NOTE(review): the caller then receives Spanish text for a
                # non-Spanish request; confirm this is the intended behavior.
                if target_language not in ["es", "fr", "de"]:
                    print("Falling back to Spanish model")
                    return self._load_model_for_language("es")
                raise

            self.tokenizers[target_language] = tokenizer
            self.models[target_language] = model

        return self.models[target_language], self.tokenizers[target_language]

    def translate_text(self, text, source_language="en", target_language="es"):
        """
        Translate text to the target language

        Args:
            text: The text to translate
            source_language: The source language code (default: en). Currently
                ignored: every model this class loads is an English-to-target pair.
            target_language: The target language code (e.g., 'es' for Spanish)

        Returns:
            The translated text. Blank input is returned unchanged. On failure
            a "[Translation Error: ...]" string is returned instead of raising.
        """
        try:
            # Nothing to translate; avoid feeding an empty sequence to the model.
            if isinstance(text, str) and not text.strip():
                return text

            # Load the appropriate model
            model, tokenizer = self._load_model_for_language(target_language)

            # Prepare the text inputs. Truncate to the tokenizer's maximum
            # length so over-long text cannot exceed the model's position limit.
            inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)

            # Put the inputs on whatever device the model actually lives on.
            device = model.device
            inputs = {k: v.to(device) for k, v in inputs.items()}

            # Generate translation
            output = model.generate(**inputs)

            # Decode the output
            translated = tokenizer.decode(output[0], skip_special_tokens=True)

            return translated
        except Exception as e:
            print(f"Error translating text: {str(e)}")
            return f"[Translation Error: {str(e)}]"

Overwriting models/local_translator.py


In [None]:
%%writefile models/sentiment.py
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
from nltk.sentiment import SentimentIntensityAnalyzer

class SentimentAnalyzer:
    """Score text sentiment on a 0-1 scale with NLTK VADER or a Keras BiLSTM."""

    def __init__(self, use_nltk=True):
        """
        Initialize the sentiment analyzer

        Args:
            use_nltk: If True, use NLTK's VADER for sentiment analysis instead of TensorFlow
        """
        self.use_nltk = use_nltk

        # Define every attribute in both modes so that reading one never
        # raises AttributeError, whichever backend was selected.
        self.sia = SentimentIntensityAnalyzer() if use_nltk else None
        self.max_words = 10000
        self.max_sequence_length = 200
        self.tokenizer = None
        self.model = None

    def prepare_data(self, texts, labels):
        """
        Prepare text data for training a TensorFlow model

        Args:
            texts: Iterable of raw training strings
            labels: Labels aligned with ``texts``

        Returns:
            (padded_sequences, labels_array), or (None, None) in NLTK mode
        """
        if self.use_nltk:
            print("Using NLTK - no data preparation needed")
            return None, None

        # Create and fit tokenizer
        self.tokenizer = Tokenizer(num_words=self.max_words)
        self.tokenizer.fit_on_texts(texts)

        # Convert texts to sequences
        sequences = self.tokenizer.texts_to_sequences(texts)

        # Pad sequences
        padded_sequences = pad_sequences(sequences, maxlen=self.max_sequence_length)

        return padded_sequences, np.array(labels)

    def build_model(self):
        """
        Build the TensorFlow sentiment analysis model

        Returns:
            The compiled Keras model, or None in NLTK mode
        """
        if self.use_nltk:
            print("Using NLTK - no model building needed")
            return None

        self.model = tf.keras.Sequential([
            # Explicit Input layer replaces Embedding's deprecated
            # ``input_length`` argument while still fixing the sequence length.
            tf.keras.Input(shape=(self.max_sequence_length,)),
            tf.keras.layers.Embedding(self.max_words, 128),
            tf.keras.layers.SpatialDropout1D(0.2),
            tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, dropout=0.2, recurrent_dropout=0.2)),
            tf.keras.layers.Dense(32, activation='relu'),
            tf.keras.layers.Dropout(0.5),
            tf.keras.layers.Dense(1, activation='sigmoid')
        ])

        self.model.compile(loss='binary_crossentropy',
                          optimizer='adam',
                          metrics=['accuracy'])

        return self.model

    def train(self, X_train, y_train, epochs=5, batch_size=64, validation_split=0.2):
        """
        Train the TensorFlow sentiment analysis model

        Args:
            X_train: Padded sequences, as returned by ``prepare_data``
            y_train: Training labels
            epochs: Number of training epochs
            batch_size: Mini-batch size
            validation_split: Fraction of the data held out for validation

        Returns:
            The Keras training history, or None in NLTK mode
        """
        if self.use_nltk:
            print("Using NLTK - no training needed")
            return None

        # Build lazily so callers may skip build_model().
        if self.model is None:
            self.build_model()

        history = self.model.fit(
            X_train, y_train,
            epochs=epochs,
            batch_size=batch_size,
            validation_split=validation_split
        )

        return history

    def predict_sentiment(self, text):
        """
        Predict sentiment for the given text

        Args:
            text: Text to analyze

        Returns:
            Sentiment score (0-1 where higher is more positive)

        Raises:
            ValueError: in TensorFlow mode when no model or tokenizer is available
        """
        if self.use_nltk:
            # Use NLTK's VADER
            scores = self.sia.polarity_scores(text)
            # Convert VADER's compound score (-1 to 1) to 0-1 scale
            return (scores['compound'] + 1) / 2

        # Use TensorFlow model
        if self.model is None or self.tokenizer is None:
            raise ValueError("Model not trained. Call train first or use NLTK.")

        # Preprocess the text
        sequence = self.tokenizer.texts_to_sequences([text])
        padded = pad_sequences(sequence, maxlen=self.max_sequence_length)

        # Make prediction; verbose=0 keeps the per-call progress bar out of logs.
        prediction = self.model.predict(padded, verbose=0)[0][0]

        return float(prediction)

Overwriting models/sentiment.py


In [None]:
%%writefile utils/nlp_utils.py
import spacy
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

class EntityExtractor:
    """spaCy-backed helper that pulls named entities and keywords out of text."""

    def __init__(self, model="en_core_web_sm"):
        """Load the spaCy pipeline named by ``model``."""
        self.nlp = spacy.load(model)

    def extract_entities(self, text):
        """
        Run the pipeline over ``text`` and describe each named entity found.

        Args:
            text: Text to analyze

        Returns:
            One dict per entity with the keys "name", "type", "start_char",
            "end_char" and "description" (spaCy's explanation of the label).
        """
        return [
            {
                "name": span.text,
                "type": span.label_,
                "start_char": span.start_char,
                "end_char": span.end_char,
                "description": spacy.explain(span.label_),
            }
            for span in self.nlp(text).ents
        ]

    def extract_keywords(self, text, top_n=5):
        """
        Rank lower-cased noun phrases and named entities by weighted frequency.

        Args:
            text: Text to analyze
            top_n: Number of keywords to return

        Returns:
            Up to ``top_n`` keyword strings, highest weight first; ties keep
            first-seen order.
        """
        parsed = self.nlp(text)

        # Each noun-chunk occurrence counts 1, each entity occurrence counts 2.
        weighted_sources = ((parsed.noun_chunks, 1), (parsed.ents, 2))

        weights = {}
        for spans, weight in weighted_sources:
            for span in spans:
                key = span.text.lower()
                weights[key] = weights.get(key, 0) + weight

        # sorted() is stable, so equal weights stay in insertion order.
        ranked = sorted(weights, key=weights.get, reverse=True)
        return ranked[:top_n]

Overwriting utils/nlp_utils.py


In [None]:
%%writefile app.py
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from typing import List, Optional
import uvicorn

# Import our project components
from models.content_generator import ContentGenerator
from models.local_translator import LocalTranslator
from models.sentiment import SentimentAnalyzer
from utils.nlp_utils import EntityExtractor

# Create FastAPI app
app = FastAPI(title="Multilingual Content Generator")

# Initialize components.
# These are module-level singletons created at import time and shared by every
# request. ContentGenerator builds its text-generation pipeline and
# EntityExtractor loads its spaCy model right here; LocalTranslator only loads
# a translation model the first time a given target language is requested.
content_generator = ContentGenerator()
translator = LocalTranslator()
# use_nltk=True selects the NLTK VADER backend rather than the TensorFlow model.
sentiment_analyzer = SentimentAnalyzer(use_nltk=True)
entity_extractor = EntityExtractor()

# Define request/response models
class ContentRequest(BaseModel):
    """Request body accepted by the /generate endpoint."""
    # Prompt handed to the content generator.
    prompt: str
    # Forwarded to the generator as its max_length argument.
    max_length: int = 250
    # Language codes to translate the generated text into, one result per code.
    target_languages: List[str] = ["es", "fr", "de"]
    # When True, a sentiment score is attached to each translation result.
    analyze_sentiment: bool = True
    # When True, named entities are extracted from the generated text.
    enhance_content: bool = False

class TranslationResult(BaseModel):
    """One translation of the generated content."""
    # Target language code this entry was requested for.
    language: str
    # Text returned by the translator for that language.
    translated_text: str
    # Sentiment score; left as None when sentiment analysis was not requested.
    sentiment_score: Optional[float] = None

class ContentResponse(BaseModel):
    """Response body returned by the /generate endpoint."""
    # First text returned by the content generator.
    original_content: str
    # One entry per requested target language, in request order.
    translations: List[TranslationResult]
    # Entity dicts from the entity extractor; None unless enhance_content was set.
    entities: Optional[List[dict]] = None

@app.post("/generate", response_model=ContentResponse)
async def generate_content(request: ContentRequest):
    """
    Generate content based on prompt and translate to target languages

    Args:
        request: Prompt, length limit, target languages and feature flags

    Returns:
        ContentResponse with the generated text, one TranslationResult per
        requested language and, when ``enhance_content`` is set, the entities
        found in the generated text.

    Raises:
        HTTPException: status 500 when generation yields nothing or any
            downstream step fails.
    """
    # NOTE(review): the model calls below are synchronous and run inside an
    # async handler, so they hold the event loop while they execute.
    try:
        # Generate content
        generated_texts = content_generator.generate_content(
            request.prompt,
            max_length=request.max_length
        )

        if not generated_texts:
            raise HTTPException(status_code=500, detail="Failed to generate content")

        original_content = generated_texts[0]

        # Score sentiment once, on the English source text. The analyzer is
        # configured with NLTK VADER, whose lexicon is English, so scoring the
        # translated text would not give meaningful values.
        sentiment_score = None
        if request.analyze_sentiment:
            sentiment_score = sentiment_analyzer.predict_sentiment(original_content)

        # Process translations
        translations = []
        for lang in request.target_languages:
            translated_text = translator.translate_text(original_content, target_language=lang)

            translations.append(
                TranslationResult(
                    language=lang,
                    translated_text=translated_text,
                    sentiment_score=sentiment_score
                )
            )

        # Process entities if requested
        entities = None
        if request.enhance_content:
            entities = entity_extractor.extract_entities(original_content)

        return ContentResponse(
            original_content=original_content,
            translations=translations,
            entities=entities
        )

    except HTTPException:
        # Deliberate HTTP errors raised above must pass through untouched;
        # the generic handler below would otherwise replace their detail.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

# Script entry point: only runs when app.py is executed directly, not when the
# module is imported (the notebook starts the server through uvicorn instead).
if __name__ == "__main__":
    # Listen on all interfaces, port 6060, with auto-reload enabled.
    uvicorn.run("app:app", host="0.0.0.0", port=6060, reload=True)

Overwriting app.py


In [None]:
# Smoke-test each component written by the %%writefile cells above.
from models.content_generator import ContentGenerator
from models.local_translator import LocalTranslator
from models.sentiment import SentimentAnalyzer
from utils.nlp_utils import EntityExtractor

# Test the content generator
generator = ContentGenerator()
print("Testing content generator...")
generated = generator.generate_content("Write a short paragraph about machine learning")
# generate_content() reports failures by returning an empty list, so fail with
# a clear message here instead of an IndexError on generated[0].
if not generated:
    raise RuntimeError("Content generation returned no text; see the error printed above.")
content = generated[0]
print(f"Generated content: {content}\n")

# Test the translator
translator = LocalTranslator()
print("Testing translator...")
spanish = translator.translate_text(content, target_language="es")
print(f"Spanish translation: {spanish}\n")

# Test sentiment analysis
analyzer = SentimentAnalyzer(use_nltk=True)
print("Testing sentiment analysis...")
sentiment = analyzer.predict_sentiment(content)
print(f"Sentiment score: {sentiment}\n")

# Test entity extraction
extractor = EntityExtractor()
print("Testing entity extraction...")
entities = extractor.extract_entities(content)
print(f"Entities: {entities}")

Device set to use cpu
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Testing content generator...
Generated content: Write a short paragraph about machine learning:

This is an extremely important question—how can we go from a system based on a single, "one-to-many" approach to a whole system based on multiple, "tens of thousands" approaches? Consider the following examples. Let's assume that there are no such thing as single, two- to three-dimensional structures: this will help with both learning and training. Each of them will be able to generate thousands or even millions of learning objects and can perform thousands of repetitions of training in just a single session.

Imagine there were an artificial intelligence platform based on two types of information processing models: one (an "all-purpose" training product) and one (one-to-many) training product. This system could be modeled as an input model from another training platform, as well as an input model as an input to a model, but the model would need some sort of ability to learn the specific pa



Spanish translation: Consideremos un breve párrafo sobre el aprendizaje automático: Esta es una pregunta extremadamente importante: ¿cómo podemos pasar de un sistema basado en un enfoque único, "uno a muchos" a un sistema completo basado en enfoques múltiples, "decenas de miles"? Consideremos los siguientes ejemplos. Supongamos que no hay tal cosa como estructuras únicas, de dos a tres dimensiones: esto ayudará con el aprendizaje y la formación. Cada uno de ellos será capaz de generar miles o incluso millones de objetos de aprendizaje y puede realizar miles de repeticiones de formación en una sola sesión. Imaginemos que hubo una plataforma de inteligencia artificial basada en dos tipos de modelos de procesamiento de información: uno (un producto de formación "todo" y uno (uno a muchos) producto de formación. Este sistema podría modelarse como un modelo de entrada de otra plataforma de formación, así como un modelo de entrada como una aportación a un modelo, pero el modelo necesitaría a

In [None]:
import os
from getpass import getpass

from pyngrok import ngrok

# SECURITY: never commit the ngrok auth token to the notebook. The token that
# was previously hardcoded here should be treated as leaked and revoked in the
# ngrok dashboard. Read it from the NGROK_AUTH_TOKEN environment variable, or
# prompt for it without echoing when the variable is not set.
ngrok_auth_token = os.environ.get("NGROK_AUTH_TOKEN") or getpass("ngrok auth token: ")
ngrok.set_auth_token(ngrok_auth_token)


In [None]:
# Import necessary modules
from pyngrok import ngrok
import nest_asyncio
import uvicorn
import socket
import threading
import time

# Port shared by the uvicorn server and the ngrok tunnel.
PORT = 6060

# Apply nest_asyncio to make asyncio work in Colab
nest_asyncio.apply()

# Start the FastAPI app in a separate thread
def run_app():
    """Serve app:app on all interfaces; blocks until the server stops."""
    uvicorn.run("app:app", host="0.0.0.0", port=PORT)

def wait_for_server(port, timeout=60.0, poll_interval=0.5):
    """Return True once something accepts TCP connections on localhost:port.

    Gives up and returns False after ``timeout`` seconds.
    """
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            with socket.create_connection(("127.0.0.1", port), timeout=1):
                return True
        except OSError:
            time.sleep(poll_interval)
    return False

# Start the thread (daemon, so it does not keep the kernel alive on shutdown)
thread = threading.Thread(target=run_app, daemon=True)
thread.start()

# Importing app:app loads the models, which can take longer than a fixed
# sleep; poll until the server is actually listening.
if not wait_for_server(PORT):
    print(f"Warning: server did not start listening on port {PORT} in time")

# Create a tunnel. ngrok.connect() returns an NgrokTunnel object; the usable
# address is its .public_url attribute, not the object's string form.
public_url = ngrok.connect(PORT)
print(f"FastAPI app is running at: {public_url.public_url}")

print(f"You can access the API documentation at {public_url.public_url}/docs")


INFO:     Started server process [13095]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://0.0.0.0:6060 (Press CTRL+C to quit)


FastAPI app is running at: NgrokTunnel: "https://f441-35-229-240-69.ngrok-free.app" -> "http://localhost:6060"
You can access the API documentation at NgrokTunnel: "https://f441-35-229-240-69.ngrok-free.app" -> "http://localhost:6060"/docs
