<a href="https://colab.research.google.com/github/Anjali-K-S25/Philological-analysis-/blob/main/philological_analysis_tool_using_AI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install gradio pandas numpy scikit-learn tensorflow openpyxl matplotlib



In [2]:
import gradio as gr
import pandas as pd
import numpy as np
import os
import uuid
import io
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from collections import Counter

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [3]:
from google.colab import files
uploaded = files.upload()

DATA_PATH = list(uploaded.keys())[0]
print("Uploaded:", DATA_PATH)

Saving Philological_7525.xlsx to Philological_7525 (2).xlsx
Uploaded: Philological_7525 (2).xlsx


In [4]:
df = pd.read_excel(DATA_PATH)

required_cols = [
    'original_text', 'language', 'corrupted_text',
    'restored_text', 'english_meaning'
]

missing = [c for c in required_cols if c not in df.columns]
if missing:
    raise ValueError(f"Dataset missing required columns: {missing}")

df = df.dropna(subset=required_cols).reset_index(drop=True)

optional_cols = ['author', 'year', 'topic', 'transliteration', 'confidence']
for c in optional_cols:
    if c not in df.columns:
        df[c] = np.nan

df.head()


Unnamed: 0,id,language,original_text,corrupted_text,restored_text,transliteration,author,year,topic,english_meaning,confidence
0,1,sanskrit,राजा धर्मं पालयति,राजए धर्फं पालयति,राजा धर्मं पालयति,rājā dharmaṃ pālayati,Kautilya,1300,administration,King upholds dharma,
1,2,sanskrit,रामो वनं गच्छति,बामो वषं गच्छति,रामो वनं गच्छति,rāmo vanaṃ gacchati,Kalidasa,1500,literature,Rama goes to the forest,
2,3,proto_dravidian,puḷḷi maṇi,puḷḷए गaबi,puḷḷi maṇi,puḷḷi maṇi,Ancient Malayalam,1050,education,The bell rings,
3,4,sanskrit,रामो वनं गच्छति,तामो वनं गएउछति,रामो वनं गच्छति,rāmo vanaṃ gacchati,Kalidasa,1500,literature,Rama goes to the forest,
4,5,sanskrit,विद्या मूलं धनं,एिद्या मूलं धनं,विद्या मूलं धनं,vidyā mūlaṃ dhanaṃ,Panini,1200,philosophy,Knowledge is the root of wealth,


In [5]:
# Language classifier
vec1 = CountVectorizer(max_features=5000)
X_lang_vec = vec1.fit_transform(df['original_text'])
le_lang = LabelEncoder()
y_lang_enc = le_lang.fit_transform(df['language'])

X_train1, X_test1, y_train1, y_test1 = train_test_split(
    X_lang_vec, y_lang_enc, test_size=0.2, random_state=42
)

lang_clf = MLPClassifier(hidden_layer_sizes=(128,64), max_iter=300, random_state=42)
lang_clf.fit(X_train1, y_train1)


In [6]:
tokenizer = Tokenizer(num_words=5000, oov_token="<OOV>")
tokenizer.fit_on_texts(df['corrupted_text'])

seq_X = tokenizer.texts_to_sequences(df['corrupted_text'])
padded_X = pad_sequences(seq_X, maxlen=50)

seq_y = tokenizer.texts_to_sequences(df['restored_text'])
padded_y = pad_sequences(seq_y, maxlen=50)

X_train2, X_test2, y_train2, y_test2 = train_test_split(
    padded_X, padded_y, test_size=0.2, random_state=42
)

rnn_model = Sequential([
    Embedding(5000, 64, input_length=50),
    LSTM(64, dropout=0.2, recurrent_dropout=0.2),
    Dense(64, activation='relu'),
    Dense(5000, activation='softmax')
])

rnn_model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')
rnn_model.fit(X_train2, np.expand_dims(y_train2[:,0], -1), epochs=3, batch_size=32)


Epoch 1/3




[1m189/189[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 160ms/step - loss: 3.4683
Epoch 2/3
[1m189/189[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 156ms/step - loss: 1.2601e-04
Epoch 3/3
[1m189/189[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 160ms/step - loss: 1.0740e-04


<keras.src.callbacks.history.History at 0x7e27f1ae2180>

In [7]:
vec3 = CountVectorizer(max_features=5000)
X_mean_vec = vec3.fit_transform(df['restored_text'])

le_mean = LabelEncoder()
y_mean_enc = le_mean.fit_transform(df['english_meaning'])

X_train3, X_test3, y_train3, y_test3 = train_test_split(
    X_mean_vec, y_mean_enc, test_size=0.2, random_state=42
)

mean_clf = MLPClassifier(hidden_layer_sizes=(128,64), max_iter=300, random_state=42)
mean_clf.fit(X_train3, y_train3)


In [8]:
# ---- Cell 8: Dataset Explorer Utility Functions ----

def get_unique_sorted(col):
    vals = df[col].dropna().astype(str).unique().tolist()
    vals = sorted(vals)
    return ["-- Any --"] + vals

LANG_OPTIONS = get_unique_sorted('language')
TOPIC_OPTIONS = get_unique_sorted('topic')
AUTHOR_OPTIONS = get_unique_sorted('author')

def filter_and_page(search, language, topic, author, year_min, year_max, sort_by, sort_dir, page, page_size):
    d = df.copy()

    # text search
    if search and str(search).strip():
        s = str(search).lower()
        mask = (
            d['original_text'].str.lower().str.contains(s, na=False) |
            d['corrupted_text'].str.lower().str.contains(s, na=False) |
            d['restored_text'].str.lower().str.contains(s, na=False) |
            d['english_meaning'].str.lower().str.contains(s, na=False)
        )
        d = d.loc[mask]

    # dropdown filters
    if language != "-- Any --":
        d = d.loc[d['language'].astype(str) == language]
    if topic != "-- Any --":
        d = d.loc[d['topic'].astype(str) == topic]
    if author != "-- Any --":
        d = d.loc[d['author'].astype(str) == author]

    # year filtering
    d['year'] = pd.to_numeric(d['year'], errors='coerce')
    if year_min is not None:
        d = d.loc[(d['year'].isna()) | (d['year'] >= float(year_min))]
    if year_max is not None:
        d = d.loc[(d['year'].isna()) | (d['year'] <= float(year_max))]

    # sorting
    if sort_by and sort_by in d.columns:
        asc = True if sort_dir == "Ascending" else False
        d = d.sort_values(by=sort_by, ascending=asc)

    total = len(d)
    total_pages = max(1, int(np.ceil(total / page_size)))
    page = max(1, min(page, total_pages))

    start = (page - 1) * page_size
    end = start + page_size

    page_df = d.iloc[start:end].reset_index(drop=True)
    return page_df, total, total_pages


In [9]:
# ---- Cell 9: Export + Stats Functions ----

def export_to_csv(selected_indices, current_filtered_df):
    if selected_indices is None or len(selected_indices) == 0:
        export_df = current_filtered_df.copy()
    else:
        export_df = current_filtered_df.iloc[selected_indices].copy()

    fname = f"/tmp/export_{uuid.uuid4().hex}.csv"
    export_df.to_csv(fname, index=False)
    return fname

def compute_dataset_stats(filtered_df):
    stats = {
        "total_rows": len(df),
        "filtered_rows": len(filtered_df)
    }

    lang_counts = filtered_df['language'].fillna("Unknown").value_counts().to_dict()
    stats['language_counts'] = lang_counts

    years = pd.to_numeric(filtered_df['year'], errors='coerce').dropna()
    if len(years) > 0:
        stats['year_min'] = int(years.min())
        stats['year_max'] = int(years.max())
        stats['year_mean'] = round(float(years.mean()), 2)
    else:
        stats['year_min'] = stats['year_max'] = stats['year_mean'] = None

    return stats

from collections import Counter

def top_word_freq(series, top_n=25):
    text = " ".join(series.dropna().astype(str).tolist()).lower()
    tokens = [t for t in text.split() if len(t) > 1]
    counter = Counter(tokens)
    data = counter.most_common(top_n)
    if not data:
        return [], []

    words, counts = zip(*data)
    return list(words), list(counts)


In [10]:
# ---- Cell 10: Plotting Functions ----

import matplotlib.pyplot as plt

def plot_language_distribution(filtered_df):
    counts = filtered_df['language'].fillna("Unknown").value_counts()
    fig, ax = plt.subplots(figsize=(6,4))
    counts.plot(kind="bar", ax=ax)
    ax.set_title("Language Distribution")
    ax.set_xlabel("Language")
    ax.set_ylabel("Count")
    plt.tight_layout()
    return fig

def plot_year_histogram(filtered_df):
    years = pd.to_numeric(filtered_df['year'], errors='coerce').dropna()
    fig, ax = plt.subplots(figsize=(6,4))
    if len(years) > 0:
        ax.hist(years, bins=20)
    else:
        ax.text(0.5, 0.5, "No year data", ha="center")
    ax.set_title("Year Distribution")
    ax.set_xlabel("Year")
    ax.set_ylabel("Count")
    plt.tight_layout()
    return fig

def plot_wordfreq(words, counts, title="Top Words"):
    fig, ax = plt.subplots(figsize=(6,6))
    ax.barh(words[::-1], counts[::-1])
    ax.set_title(title)
    ax.set_xlabel("Frequency")
    plt.tight_layout()
    return fig


In [12]:
custom_theme = gr.themes.Base(
    primary_hue="purple",
    secondary_hue="blue",
    neutral_hue="gray",
).set(
    body_background_fill="linear-gradient(135deg, #1d1f29, #343746)",
    body_text_color="#ffffff",
    block_background_fill="#2b2d3c",
    block_shadow="0px 0px 20px rgba(0,0,0,0.4)",
    button_primary_background_fill="#7d4bff",
    button_primary_text_color="white"
)


In [None]:
import gradio as gr

# Use the same custom_theme you defined earlier
# (or redefine if not in scope)
custom_theme = gr.themes.Base(
    primary_hue="purple",
    secondary_hue="blue",
    neutral_hue="gray",
).set(
    body_background_fill="linear-gradient(135deg, #1d1f29, #343746)",
    body_text_color="#ffffff",
    block_background_fill="#2b2d3c",
    block_shadow="0px 0px 20px rgba(0,0,0,0.4)",
    button_primary_background_fill="#7d4bff",
    button_primary_text_color="white"
)

# -------------------------
# Gradio UI
# -------------------------
with gr.Blocks(theme=custom_theme, css="body {font-family: 'Poppins', sans-serif;}") as demo:

    gr.Markdown("<h1 style='text-align:center;'>📘 Philological AI Ensemble — Pro Dataset Explorer</h1>")

    with gr.Tabs():
        with gr.Tab("📘 Instructions"):
            gr.Markdown("Instructions here...")

        with gr.Tab("🈴 Translation"):
            gr.Markdown("Translation UI goes here...")

        with gr.Tab("📂 File Evaluation"):
            gr.Markdown("File evaluation UI goes here...")

        with gr.Tab("📊 Dataset Browser (Explorer)"):
            gr.Markdown("Dataset browser UI goes here...")

# ---------------------------------
# LAUNCH the app
# ---------------------------------
demo.launch(debug=True)  # <-- Must be OUTSIDE the 'with' block


  with gr.Blocks(theme=custom_theme, css="body {font-family: 'Poppins', sans-serif;}") as demo:
  with gr.Blocks(theme=custom_theme, css="body {font-family: 'Poppins', sans-serif;}") as demo:


It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://123aae626bb3cd6b4c.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
