<a href="https://colab.research.google.com/github/Anjali-K-S25/Philological-analysis-/blob/main/PatuAI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import ipywidgets as widgets
from IPython.display import display

In [None]:
!pip install transformers torch scikit-learn



In [None]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score
from transformers import MarianMTModel, MarianTokenizer, pipeline, BartForConditionalGeneration, BartTokenizer

In [None]:
# Toy corpus for the stylometry classifiers: 20 short sentences, 10 per class.
# Order matters — texts[i] is labelled by labels[i].
texts = [
    # --- Sanskrit (Devanagari script), indices 0-9 ---
    "रामः वनं गच्छति",
    "अहम् पाठशालां गच्छामि",
    "सीता फलम् खादति",
    "शिवः तत्र स्थितः",
    "गुरुः पाठयति",
    "अहम् पठामि",
    "बालकः खेलति",
    "अहम् पठामि पुस्तकं",
    "रामस्य मित्रः आगच्छति",
    "सीता गीतं गायति",
    # --- "Proto-Dravidian"-labelled synthetic sentences (Tamil script), indices 10-19 ---
    "நான் பள்ளிக்குச் செல்வேன்",
    "அவன் ஓடுகிறான்",
    "நான் புத்தகம் படிக்கிறேன்",
    "அவள் பாடம் படிக்கிறாள்",
    "நாம் வீட்டிற்கு செல்கிறோம்",
    "அவர் பாடம் கற்றுக்கொள்கிறார்",
    "நான் ஓடுகிறேன்",
    "அவர் பாடம் படிக்கிறார்",
    "நான் உணவு சாப்பிடுகிறேன்",
    "அவன் விளையாடுகிறான்",
]

# One label per sentence, in the same order as `texts`:
# the first ten are Sanskrit, the last ten are Proto-Dravidian.
labels = ["Sanskrit"] * 10 + ["Proto-Dravidian"] * 10

In [None]:
# Translation function (Sanskrit/Proto-Dravidian → English)

# Already-loaded (tokenizer, model) pairs keyed by checkpoint name, so that
# repeated calls do not reload the weights every time.
_MARIAN_CACHE = {}


def _load_marian(model_name):
    """Return the cached ``(tokenizer, model)`` pair for ``model_name``, loading it on first use."""
    if model_name not in _MARIAN_CACHE:
        _MARIAN_CACHE[model_name] = (
            MarianTokenizer.from_pretrained(model_name),
            MarianMTModel.from_pretrained(model_name),
        )
    return _MARIAN_CACHE[model_name]


def translate_text(text, model_name="Helsinki-NLP/opus-mt-hi-en"):
    """Translate a single string with a MarianMT checkpoint.

    Args:
        text: The source-language string to translate.
        model_name: Name of the Marian checkpoint to load. The default is the
            "hi-en" OPUS-MT checkpoint.
            NOTE(review): judging by its name this is a Hindi→English model, not
            one trained for Sanskrit or Tamil — confirm it is the intended choice.

    Returns:
        The decoded translation of ``text`` with special tokens stripped.
    """
    tokenizer, model = _load_marian(model_name)
    # truncation=True keeps over-long inputs within the model's maximum length
    # instead of letting generation fail on them.
    inputs = tokenizer([text], return_tensors="pt", padding=True, truncation=True)
    translated = model.generate(**inputs)
    return tokenizer.decode(translated[0], skip_special_tokens=True)

In [None]:
# NLP analysis: NER + Sentiment

# Checkpoints are named explicitly so the analysis does not silently change if
# the library's default model changes. The sentiment checkpoint and revision
# are the ones the unnamed "sentiment-analysis" pipeline previously fell back to.
_NER_MODEL = "dslim/bert-base-NER"
_SENTIMENT_MODEL = "distilbert/distilbert-base-uncased-finetuned-sst-2-english"
_SENTIMENT_REVISION = "714eb0f"

# Built pipelines keyed by (task, model, revision); constructing one loads the
# model weights, so each is built once and then reused.
_PIPELINE_CACHE = {}


def _get_pipeline(task, model, revision=None):
    """Return the cached pipeline for ``(task, model, revision)``, building it on first use."""
    key = (task, model, revision)
    if key not in _PIPELINE_CACHE:
        _PIPELINE_CACHE[key] = pipeline(task, model=model, revision=revision)
    return _PIPELINE_CACHE[key]


def analyze_text(text):
    """Run named-entity recognition and sentiment analysis on ``text``.

    Args:
        text: The string to analyse. In this notebook it is the English output
            of the translation step.

    Returns:
        A dict with two keys: ``"entities"`` (the raw output of the NER
        pipeline) and ``"sentiment"`` (the raw output of the sentiment
        pipeline).
    """
    nlp_ner = _get_pipeline("ner", _NER_MODEL)
    sentiment = _get_pipeline("sentiment-analysis", _SENTIMENT_MODEL, _SENTIMENT_REVISION)
    return {"entities": nlp_ner(text), "sentiment": sentiment(text)}

In [None]:
# Text restoration

# Already-loaded (tokenizer, model) pairs keyed by checkpoint name, so that
# repeated calls do not reload the weights every time.
_BART_CACHE = {}


def _load_bart(model_name):
    """Return the cached ``(tokenizer, model)`` pair for ``model_name``, loading it on first use."""
    if model_name not in _BART_CACHE:
        _BART_CACHE[model_name] = (
            BartTokenizer.from_pretrained(model_name),
            BartForConditionalGeneration.from_pretrained(model_name),
        )
    return _BART_CACHE[model_name]


def restore_text(partial_text, model_name="facebook/bart-base", gap_marker=None):
    """Generate a completed version of a damaged/partial text with BART.

    Args:
        partial_text: The damaged input string.
        model_name: Name of the BART checkpoint to load.
        gap_marker: Optional substring that marks a lacuna (for example
            ``"..."``). When given, every occurrence is replaced by the
            tokenizer's mask token before generation. When ``None`` (the
            default) the text is passed to the model unchanged, which is the
            original behaviour.
            NOTE(review): BART's infilling is driven by its mask token (see the
            Transformers BART docs); without one, the recorded notebook run
            simply echoed the input back.

    Returns:
        The decoded generation for ``partial_text`` with special tokens stripped.
    """
    tokenizer, model = _load_bart(model_name)
    if gap_marker:
        partial_text = partial_text.replace(gap_marker, tokenizer.mask_token)
    inputs = tokenizer([partial_text], return_tensors="pt", padding=True, truncation=True)
    # Pass the whole encoding (input ids AND attention mask) rather than the
    # ids alone, so padded positions are never attended to.
    outputs = model.generate(**inputs, max_length=100, num_beams=5, early_stopping=True)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
# Stylometry with multiple algorithms + epochs + cross-validation
def stylometry_multi_algo(texts, labels, test_text, epochs=50):
    """Classify ``test_text`` with four classifiers trained on TF-IDF features.

    Each classifier is fitted on all of ``texts``/``labels``, used to predict
    the label of ``test_text``, and scored with 5-fold cross-validation on the
    same data.

    Args:
        texts: Training documents (sequence of strings).
        labels: Class label for each entry of ``texts``, in the same order.
        test_text: The single document to classify.
        epochs: Passed as ``max_iter`` to LogisticRegression and MLPClassifier.

    Returns:
        A dict keyed by ``"RandomForest"``, ``"SVM"``, ``"LogisticRegression"``
        and ``"MLPClassifier"`` (in that order). Each value is a dict with
        ``"prediction"`` (predicted label for ``test_text``) and
        ``"CV_accuracy"`` (mean of the 5 cross-validation scores).
    """
    # Word uni- and bi-gram TF-IDF features.
    # NOTE(review): the vectorizer is fitted on every text before the folds are
    # split, so the CV scores below see vocabulary/IDF statistics from the
    # held-out fold. Treat them as optimistic.
    vectorizer = TfidfVectorizer(ngram_range=(1, 2))
    X = vectorizer.fit_transform(texts)
    y = labels
    X_test = vectorizer.transform([test_text])

    classifiers = {
        "RandomForest": RandomForestClassifier(n_estimators=200, random_state=42),
        # Probability estimates were never read, so SVC is built without the
        # extra calibration step that `probability=True` would trigger.
        "SVM": SVC(kernel="linear"),
        # `multi_class` is deprecated in scikit-learn and 'auto' was the default
        # anyway, so it is no longer passed.
        "LogisticRegression": LogisticRegression(max_iter=epochs, solver="lbfgs"),
        "MLPClassifier": MLPClassifier(hidden_layer_sizes=(100,), max_iter=epochs, random_state=42),
    }

    results = {}
    for name, clf in classifiers.items():
        clf.fit(X, y)
        prediction = clf.predict(X_test)[0]
        cv_accuracy = np.mean(cross_val_score(clf, X, y, cv=5))
        results[name] = {"prediction": prediction, "CV_accuracy": cv_accuracy}

    return results

In [None]:
# End-to-end demo on a single Sanskrit sentence.
test_text = "रामः पाठशालां गच्छति"

# Step 1 — machine translation of the sample sentence.
translated = translate_text(test_text)
print("Translated Text:", translated)

# Step 2 — NER and sentiment, run on the translated text rather than the source.
analysis = analyze_text(translated)
for heading, key in (("Named Entities:", "entities"), ("Sentiment:", "sentiment")):
    print(heading, analysis[key])

# Step 3 — stylometric classification of the original (untranslated) sentence.
results = stylometry_multi_algo(texts, labels, test_text, epochs=100)
for algo in results:
    info = results[algo]
    print(f"{algo}: Prediction = {info['prediction']}, CV Accuracy = {round(info['CV_accuracy']*100,2)}%")

# Step 4 — restoration attempt on a sentence with a gap in the middle.
damaged_text = "रामः ... गच्छति"
restored = restore_text(damaged_text)
print("Restored Text:", restored)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/1.06M [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/813k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]



pytorch_model.bin:   0%|          | 0.00/304M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/304M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

Translated Text: Ram: School's high


config.json:   0%|          | 0.00/829 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Device set to use cpu
No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

Device set to use cpu


Named Entities: []
Sentiment: [{'label': 'NEGATIVE', 'score': 0.970949113368988}]




RandomForest: Prediction = Sanskrit, CV Accuracy = 60.0%
SVM: Prediction = Sanskrit, CV Accuracy = 85.0%
LogisticRegression: Prediction = Sanskrit, CV Accuracy = 70.0%
MLPClassifier: Prediction = Sanskrit, CV Accuracy = 85.0%


vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

Restored Text: रामः ... गच्छति
