In [None]:
import pandas as pd
from pathlib import Path
import torch
from transformers import MarianMTModel, MarianTokenizer
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn
import math

# ---------- CONFIG ----------
lexicon_csv = Path.cwd() / 'marathi_lexicon_correct_pos.csv'
output_csv = Path.cwd() / 'marathi_sentiwordnet_mariante.csv'
batch_size = 32

# ---------- LOAD LEXICON ----------
df = pd.read_csv(lexicon_csv)
print(f"Loaded {len(df)} Marathi words")

# ---------- LOAD MARIANMT TRANSLATOR ----------
model_name = "Helsinki-NLP/opus-mt-mr-en"  # Marathi → English
tokenizer = MarianTokenizer.from_pretrained(model_name)
# Try to load weights using safetensors (avoids torch.load safety restriction on older torch)
try:
    model = MarianMTModel.from_pretrained(model_name, use_safetensors=True)
    print("Loaded model weights using safetensors.")
except Exception as e:
    print("Warning: failed to load weights with safetensors; falling back to default loader.")
    print("If you get a security-related ValueError: upgrade torch to >=2.6 or install safetensors and ensure the model provides safetensors weights.")
    model = MarianMTModel.from_pretrained(model_name)

# use GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

# ---------- TRANSLATE IN BATCHES (prints 'Batch X complete' and saves after each batch) ----------
marathi_words = df['marathi_word'].astype(str).tolist()
num_batches = math.ceil(len(marathi_words) / batch_size)
# initialize english list with empty strings so we can assign by slice
english_words = [''] * len(marathi_words)

for i in range(num_batches):
    start = i * batch_size
    end = min((i + 1) * batch_size, len(marathi_words))
    batch = marathi_words[start:end]

    # tokenize and move to device
    inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=128).to(device)
    with torch.no_grad():
        try:
            generated = model.generate(**inputs, max_new_tokens=64, num_beams=1, early_stopping=True)
            outputs = tokenizer.batch_decode(generated, skip_special_tokens=True)
        except Exception as e:
            # fallback: generate per item if batch generation fails
            outputs = []
            for w in batch:
                enc = tokenizer([w], return_tensors="pt", padding=True, truncation=True, max_length=128).to(device)
                gen = model.generate(**enc, max_new_tokens=64, num_beams=1)
                txt = tokenizer.batch_decode(gen, skip_special_tokens=True)[0]
                outputs.append(txt)

    # assign outputs into the full list
    english_words[start:end] = outputs

    # save progress after batch
    try:
        df_temp = df.copy()
        df_temp['english_word'] = english_words
        df_temp.to_csv(output_csv, index=False, encoding='utf-8')
    except Exception as e:
        print(f"Warning: failed to write partial output after batch {i+1}: {e}")

    print(f"Batch {i+1}/{num_batches} complete — translated {end} / {len(marathi_words)} words")

# Final assignment to dataframe
df['english_word'] = english_words

# ---------- GET SENTIWORDNET SCORES ----------
pos_map = {'noun':'n', 'adjective':'a', 'verb':'v', 'adverb':'r'}
pos_scores, neg_scores, obj_scores = [], [], []

for idx, row in df.iterrows():
    word = row['english_word']
    pos = pos_map.get(row['pos'], None)
    if not pos:
        pos_scores.append(0)
        neg_scores.append(0)
        obj_scores.append(0)
        continue
    
    # get synsets
    synsets = wn.synsets(word, pos=pos)
    if not synsets:
        pos_scores.append(0)
        neg_scores.append(0)
        obj_scores.append(0)
        continue
    
    # average sentiwordnet scores
    pos_sum = neg_sum = obj_sum = 0
    valid_count = 0
    for syn in synsets:
        try:
            swn_syn = swn.senti_synset(syn.name())
            pos_sum += swn_syn.pos_score()
            neg_sum += swn_syn.neg_score()
            obj_sum += swn_syn.obj_score()
            valid_count += 1
        except:
            continue
    
    if valid_count > 0:
        pos_scores.append(round(pos_sum/valid_count, 3))
        neg_scores.append(round(neg_sum/valid_count, 3))
        obj_scores.append(round(obj_sum/valid_count, 3))
    else:
        pos_scores.append(0)
        neg_scores.append(0)
        obj_scores.append(0)

df['positive'] = pos_scores
df['negative'] = neg_scores
df['objective'] = obj_scores

# ---------- SAVE CSV ----------
df.to_csv(output_csv, index=False, encoding='utf-8')
print(f"✅ Saved Marathi SentiWordNet with MarianMT translation to: {output_csv}")
print(df.head(20))


Loaded 52325 Marathi words




Loaded model weights using safetensors.


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Batch 1/1636 complete — translated 32 / 52325 words
Batch 2/1636 complete — translated 64 / 52325 words
Batch 2/1636 complete — translated 64 / 52325 words
Batch 3/1636 complete — translated 96 / 52325 words
Batch 3/1636 complete — translated 96 / 52325 words
Batch 4/1636 complete — translated 128 / 52325 words
Batch 4/1636 complete — translated 128 / 52325 words
Batch 5/1636 complete — translated 160 / 52325 words
Batch 5/1636 complete — translated 160 / 52325 words
Batch 6/1636 complete — translated 192 / 52325 words
Batch 6/1636 complete — translated 192 / 52325 words
Batch 7/1636 complete — translated 224 / 52325 words
Batch 7/1636 complete — translated 224 / 52325 words
Batch 8/1636 complete — translated 256 / 52325 words
Batch 8/1636 complete — translated 256 / 52325 words
Batch 9/1636 complete — translated 288 / 52325 words
Batch 9/1636 complete — translated 288 / 52325 words
Batch 10/1636 complete — translated 320 / 52325 words
Batch 10/1636 complete — translated 320 / 52325 wo

In [10]:
import torch
print(torch.cuda.is_available())
print(torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU found")


True
NVIDIA GeForce RTX 4050 Laptop GPU


In [20]:
import pandas as pd
from pathlib import Path
from googletrans import Translator
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn
import time
import math

# ---------- CONFIG ----------
lexicon_csv = Path.cwd() / 'marathi_lexicon_correct_pos.csv'
output_csv = Path.cwd() / 'marathi_sentiwordnet_google.csv'
batch_size = 500  # batch size for translate()

# ---------- LOAD LEXICON ----------
df = pd.read_csv(lexicon_csv)
print(f"Loaded {len(df)} Marathi words")

# ---------- SETUP TRANSLATOR ----------
translator = Translator()
lang = 'en'

# ---------- TRANSLATE IN BATCHES ----------
marathi_words = df['marathi_word'].astype(str).tolist()
english_out = [''] * len(marathi_words)

num_batches = math.ceil(len(marathi_words) / batch_size)
for i in range(num_batches):
    start = i * batch_size
    end = min((i + 1) * batch_size, len(marathi_words))
    batch = marathi_words[start:end]

    try:
        res = translator.translate(batch, src='mr', dest=lang)
        for j, tr in enumerate(res):
            english_out[start + j] = tr.text.lower()
    except Exception as e:
        print(f"Batch {i+1} failed, retrying individually...")
        for j, w in enumerate(batch):
            try:
                english_out[start + j] = translator.translate(w, src='mr', dest=lang).text.lower()
            except:
                english_out[start + j] = ''
            time.sleep(0.05)

    print(f"Translated batch {i+1}/{num_batches}")
    df.loc[start:end-1, 'english_word'] = english_out[start:end]
    df.to_csv(output_csv, index=False, encoding='utf-8')

df['english_word'] = english_out

# ---------- GET SENTIWORDNET SCORES ----------
pos_map = {'noun':'n', 'adjective':'a', 'verb':'v', 'adverb':'r'}
pos_scores = []
neg_scores = []
obj_scores = []

for _, row in df.iterrows():
    w_en = row['english_word']
    pos_tag = pos_map.get(row['pos'], None)
    if not pos_tag:
        pos_scores.append(0)
        neg_scores.append(0)
        obj_scores.append(0)
        continue

    syns = wn.synsets(w_en, pos=pos_tag)
    if not syns:
        pos_scores.append(0)
        neg_scores.append(0)
        obj_scores.append(0)
        continue

    ps, ns, os = 0, 0, 0
    count = 0
    for s in syns:
        try:
            sn = swn.senti_synset(s.name())
            ps += sn.pos_score()
            ns += sn.neg_score()
            os += sn.obj_score()
            count += 1
        except:
            pass

    if count:
        pos_scores.append(round(ps/count, 3))
        neg_scores.append(round(ns/count, 3))
        obj_scores.append(round(os/count, 3))
    else:
        pos_scores.append(0)
        neg_scores.append(0)
        obj_scores.append(0)

df['positive'] = pos_scores
df['negative'] = neg_scores
df['objective'] = obj_scores

# ---------- SAVE FINAL CSV ----------
df.to_csv(output_csv, index=False, encoding='utf-8')
print(f"✅ Saved final lexicon: {output_csv}")
print(df.head())


Loaded 52325 Marathi words
Batch 1 failed, retrying individually...
Batch 1 failed, retrying individually...


  english_out[start + j] = ''


Translated batch 1/105


  res = translator.translate(batch, src='mr', dest=lang)
  english_out[start + j] = ''


Batch 2 failed, retrying individually...
Translated batch 2/105
Translated batch 2/105


  res = translator.translate(batch, src='mr', dest=lang)
  english_out[start + j] = ''


Batch 3 failed, retrying individually...


KeyboardInterrupt: 