In [5]:
import os
import re
import pandas as pd
from pathlib import Path
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn

# ---------- CONFIG ----------
DB_FOLDER_NAME = "MarathiWN_1_3/database"
POS_FILES = {
    'adjective': 'adjective.txt',
    'noun': 'noun.txt',
    'verb': 'verb.txt',
    'adverb': 'adverb.txt'
}
DATA_FILE = 'data.txt'  # contains glosses

# POS mapping
POS_MAP = {'01':'n', '02':'v', '03':'a', '04':'r'}

# ---------- LOCATE DATABASE ----------
cwd = Path.cwd()
db_folder = cwd / DB_FOLDER_NAME
if not db_folder.exists():
    raise FileNotFoundError(f"Database folder not found: {db_folder}")

# ---------- PARSE DATA.TXT GLOSSES ----------
gloss_dict = {}  # key = synset_id, value = gloss
data_path = db_folder / DATA_FILE
with open(data_path, 'r', encoding='utf-8', errors='ignore') as f:
    for line in f:
        line = line.strip()
        if not line or line.startswith('#'): 
            continue
        # Split at first "|"
        parts = line.split('|', maxsplit=1)
        if len(parts) < 2: 
            continue
        left, gloss = parts
        left_parts = left.split()
        synset_id = left_parts[-1]  # last field
        gloss_dict[synset_id] = gloss.strip()

# ---------- PARSE POS FILES ----------
records = []

for pos_name, pos_file in POS_FILES.items():
    pos_path = db_folder / pos_file
    if not pos_path.exists(): 
        continue
    with open(pos_path, 'r', encoding='utf-8', errors='ignore') as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith('#'): 
                continue
            parts = line.split()
            if len(parts) < 2: 
                continue
            word = parts[0].lstrip('-')
            pos_code = parts[1]
            synset_ids = parts[4:]  # last column(s) = synset ids
            glosses = [gloss_dict.get(sid, '') for sid in synset_ids]
            
            # Map to English WordNet synset and SentiWordNet
            en_words, pos_scores, neg_scores, obj_scores = [], [], [], []
            for sid in synset_ids:
                try:
                    pos_char = POS_MAP.get(pos_code, None)
                    if pos_char is None:
                        continue
                    offset = int(sid)
                    syn = wn.synset_from_pos_and_offset(pos_char, offset)
                    en_words.append(syn.name())
                    swn_syn = swn.senti_synset(syn.name())
                    pos_scores.append(swn_syn.pos_score())
                    neg_scores.append(swn_syn.neg_score())
                    obj_scores.append(swn_syn.obj_score())
                except:
                    continue

            records.append({
                'marathi_word': word,
                'pos': pos_name,
                'synset_ids': ','.join(synset_ids),
                'gloss': ' || '.join(glosses),
                'english_synsets': ','.join(en_words),
                'positive': round(sum(pos_scores)/len(pos_scores), 3) if pos_scores else 0,
                'negative': round(sum(neg_scores)/len(neg_scores), 3) if neg_scores else 0,
                'objective': round(sum(obj_scores)/len(obj_scores), 3) if obj_scores else 0
            })

# ---------- SAVE CSV ----------
out_csv = Path.cwd() / 'marathi_sentiwordnet.csv'
pd.DataFrame(records).to_csv(out_csv, index=False, encoding='utf-8')
print(f"Saved structured Marathi SentiWordNet to: {out_csv}")


FileNotFoundError: [Errno 2] No such file or directory: "c:\\LLM's_for_SA\\MarathiWN_1_3\\database\\data.txt"