In [None]:
!pip install transformers textblob sentencepiece



In [None]:
# Fetch the NLTK data packages pulled in by TextBlob's download_corpora helper
# (brown, punkt_tab, wordnet, ... as listed in the log below).
!python -m textblob.download_corpora

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package conll2000 to /root/nltk_data...
[nltk_data]   Unzipping corpora/conll2000.zip.
[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.
Finished.


### Sentiment Analysis

In [None]:
from textblob import TextBlob

def analyze_sentiment(text):
    """Label ``text`` as "Positive", "Negative" or "Neutral".

    The label follows the sign of TextBlob's polarity score for the text:
    above zero is positive, below zero is negative, anything else is neutral.
    """
    score = TextBlob(text).sentiment.polarity

    if score > 0:
        return "Positive"
    if score < 0:
        return "Negative"
    return "Neutral"

# Demo: a review that mixes praise ("love") with criticism ("boring")
text = "I love this product! It's boring."
result = analyze_sentiment(text)
print("Sentiment:", result)

Sentiment: Negative


### Text Classification

In [None]:
from transformers import pipeline

# Load a text-classification pipeline backed by the
# textattack/distilbert-base-uncased-ag-news checkpoint (an AG News model, per its name);
# the model files are downloaded on first use, as the progress output below shows.
classifier = pipeline("text-classification", model="textattack/distilbert-base-uncased-ag-news")

# Translate the classifier's raw label codes into readable topic names
label_map = dict(
    zip(
        ("LABEL_0", "LABEL_1", "LABEL_2", "LABEL_3"),
        ("World", "Sports", "Business", "Sci/Tech"),
    )
)

def classify_genre(text):
    """Classify ``text`` into one of the news topics listed in ``label_map``.

    Args:
        text: The passage to classify.

    Returns:
        A string of the form ``"Predicted Genre: <label>, Confidence: <score>"``.
        ``<label>`` is looked up in ``label_map`` ("Unknown" when the model
        emits a code that is not in the map) and ``<score>`` is the model's
        score rounded to two decimals.
    """
    # truncation=True clips inputs longer than the model's maximum sequence
    # length instead of letting the forward pass fail on long passages.
    top_prediction = classifier(text, truncation=True)[0]
    label = label_map.get(top_prediction['label'], "Unknown")
    score = round(top_prediction['score'], 2)
    return f"Predicted Genre: {label}, Confidence: {score}"

# Demo: classify a one-sentence headline and print the formatted prediction
text = "The government passed a new law today"
print(classify_genre(text))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/680 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Device set to use cpu


Predicted Genre: World, Confidence: 0.77


### Text Summarization

In [1]:
from transformers import pipeline

# Load the summarization pipeline with the facebook/bart-large-cnn checkpoint.
# No token limit is configured here; output length limits are passed per call in summarize_text.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

def summarize_text(text, max_chunk=800, max_length=180, min_length=60):
    """Summarize ``text``, splitting it into sentence-aligned chunks first.

    The text is split on ``". "`` boundaries and sentences are packed into
    chunks of at most ``max_chunk`` words. Each chunk is summarized on its
    own and the partial summaries are joined with single spaces.

    Args:
        text: The passage to summarize.
        max_chunk: Maximum number of whitespace-separated words per chunk.
            Words are only a proxy for model tokens, so the summarizer is
            also asked to truncate any chunk that is still too long.
        max_length: ``max_length`` forwarded to the summarizer for each chunk.
        min_length: ``min_length`` forwarded to the summarizer for each chunk.

    Returns:
        The concatenated chunk summaries, or ``""`` when ``text`` contains
        no non-whitespace characters.
    """
    # Collapse newlines and runs of whitespace so sentence splitting is predictable
    text = " ".join(text.split())
    if not text:
        return ""

    # Splitting on ". " drops the period from every piece except the last,
    # so restore it there only; this avoids a doubled period at the end.
    pieces = text.split(". ")
    sentences = [piece + "." for piece in pieces[:-1]] + [pieces[-1]]

    chunks = []
    current_sentences = []
    current_words = 0
    for sentence in sentences:
        sentence_words = len(sentence.split())
        # Compare word counts with word counts, and never emit an empty chunk:
        # an oversized first sentence simply becomes a chunk of its own.
        if current_sentences and current_words + sentence_words > max_chunk:
            chunks.append(" ".join(current_sentences))
            current_sentences = []
            current_words = 0
        current_sentences.append(sentence)
        current_words += sentence_words
    if current_sentences:
        chunks.append(" ".join(current_sentences))

    # Summarize each chunk; truncation=True guards against chunks that still
    # exceed the model's input limit once tokenized.
    summary = [
        summarizer(
            chunk,
            max_length=max_length,
            min_length=min_length,
            do_sample=False,
            truncation=True,
        )[0]['summary_text']
        for chunk in chunks
    ]

    return ' '.join(summary)

# Demo input: a three-sentence news snippet to run through summarize_text
long_text = """
India's space agency ISRO successfully launched the Chandrayaan-3 mission today, aiming to explore the Moon’s south pole.
The mission includes a lander and a rover, with the goal of gathering data about the lunar surface and testing key technologies.
This marks a major milestone in India’s space program, following the partial success of Chandrayaan-2.
"""

print("Summary:")
print(summarize_text(long_text))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cpu
Your max_length is set to 180, but your input_length is only 79. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=39)


Summary:
Chandrayaan-3 mission aims to explore the Moon’s south pole. Mission includes a lander and a rover, with the goal of gathering data about the lunar surface and testing key technologies. This marks a major milestone in India's space program, following the partial success of Chandraysaan-2.


### Text Translation

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

# Load the tokenizer and seq2seq model for the facebook/nllb-200-distilled-600M
# checkpoint once; translate_text reuses both module-level objects on every call.
model_name = "facebook/nllb-200-distilled-600M"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Supported languages: friendly lowercase name -> language code passed to the tokenizer
LANGUAGES = dict([
    ("english", "eng_Latn"),
    ("hindi", "hin_Deva"),
    ("tamil", "tam_Taml"),
    ("telugu", "tel_Telu"),
    ("malayalam", "mal_Mlym"),
    ("kannada", "kan_Knda"),
    ("japanese", "jpn_Jpan"),
    ("spanish", "spa_Latn"),
    ("french", "fra_Latn"),
])

# Translation function
def translate_text(text, source_lang, target_lang):
    """Translate ``text`` between two of the languages listed in ``LANGUAGES``.

    Args:
        text: The passage to translate.
        source_lang: Name of the language ``text`` is written in. It must be
            a key of ``LANGUAGES``; matching ignores case and surrounding
            whitespace.
        target_lang: Name of the language to translate into (same matching).

    Returns:
        The translated string; ``"Unsupported language"`` when either
        language name is not in ``LANGUAGES``; or ``""`` when ``text`` is
        ``None``, empty or whitespace-only.
    """
    def _normalize(name):
        # Non-string names pass through unchanged so the membership test
        # below rejects them exactly as before.
        return name.strip().lower() if isinstance(name, str) else name

    source_lang = _normalize(source_lang)
    target_lang = _normalize(target_lang)
    if source_lang not in LANGUAGES or target_lang not in LANGUAGES:
        return "Unsupported language"

    # Nothing to translate: skip the model call instead of letting it
    # generate text for an empty prompt.
    if text is None or (isinstance(text, str) and not text.strip()):
        return ""

    src_code = LANGUAGES[source_lang]
    tgt_code = LANGUAGES[target_lang]

    # The source language must be set on the tokenizer before encoding
    tokenizer.src_lang = src_code
    encoded = tokenizer(text, return_tensors="pt")

    # Force generation to begin with the target-language token
    target_lang_token_id = tokenizer.convert_tokens_to_ids(tgt_code)
    generated_tokens = model.generate(**encoded, forced_bos_token_id=target_lang_token_id)
    output = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
    return output[0]

# ✅ TEST CASES: (sentence, source language, target language)
translation_examples = [
    ("India is a beautiful country", "english", "hindi"),
    ("भारत एक सुंदर देश है", "hindi", "french"),
    ("España es un país hermoso", "spanish", "french"),
    ("これは日本語の文です", "japanese", "english"),
    ("India is a beautiful country", "english", "tamil"),
]

for position, (sentence, source, target) in enumerate(translation_examples):
    # Blank line between examples, but not before the first one
    separator = "\n" if position else ""
    print(f"{separator}🔁 {source} ➡ {target}")
    print("✅ Translated Text:", translate_text(sentence, source, target))


tokenizer_config.json:   0%|          | 0.00/564 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/3.55k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/846 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

🔁 english ➡ hindi
✅ Translated Text: भारत एक सुंदर देश है

🔁 hindi ➡ french
✅ Translated Text: L'Inde est un pays magnifique.

🔁 spanish ➡ french
✅ Translated Text: L'Espagne est un pays magnifique

🔁 japanese ➡ english
✅ Translated Text: This is a Japanese sentence.

🔁 english ➡ tamil
✅ Translated Text: இந்தியா ஒரு அழகான நாடு
