In [None]:
!pip install numpy scipy gensim googletrans fitz pypdf pandas pyldavis nltk spacy

In [None]:
!pip install --upgrade googletrans

In [None]:
import gensim
from gensim import corpora
from gensim.models import LdaModel, CoherenceModel
import random, numpy as np
import pandas as pd
import ast

# Fix the global seeds of Python's `random` module and NumPy's legacy RNG
# so that anything drawing from them is repeatable across runs.
random.seed(a=42)
np.random.seed(seed=42)

In [None]:
# Read the two input tables from the Kaggle input directories.
# Later cells read the 'tokens', 'text', 'filename' and 'country' columns of
# the documents table and index into the results table by row position.
df_translated_documents = pd.read_csv(
    filepath_or_buffer="/kaggle/input/final-docs/final_docs_english (1).csv"
)
df_params = pd.read_csv(
    filepath_or_buffer="/kaggle/input/train-results/results.csv"
)

In [None]:
# Positional (0-based, .iloc) row of df_params holding the parameter set to use.
# NOTE(review): 69 is hard-coded; presumably the best run of the search — confirm.
best_params_index = 69

# Pull that single row out as a Series keyed by column name.
best_params = df_params.iloc[best_params_index, :]

# Bare last expression so the notebook renders the selected row.
best_params

In [None]:
# Inspect the type of every selected hyper-parameter in ONE output.
# A notebook cell only displays its last bare expression, so listing the
# fields one per line would silently discard all but the final check.
param_fields = ['num_topics', 'passes', 'iterations', 'alpha', 'eta', 'coherence', 'perplexity']

# NumPy scalars expose `.dtype`; plain Python values (e.g. a string) do not,
# so fall back to the Python type instead of raising AttributeError.
{
    field: getattr(best_params[field], 'dtype', type(best_params[field]))
    for field in param_fields
}

In [None]:
# The 'tokens' column comes back from CSV as the string repr of a list;
# parse it into a real Python list.
# Only strings are parsed so the cell is idempotent: on a second run the
# values are already lists and ast.literal_eval would raise on them.
df_translated_documents['tokens'] = df_translated_documents['tokens'].apply(
    lambda value: ast.literal_eval(value) if isinstance(value, str) else value
)

In [None]:
from sklearn.model_selection import train_test_split
# --- Build the Gensim dictionary and bag-of-words corpus -------------------
# Materialise the token lists once so the dictionary, corpus and coherence
# computation all iterate over the same plain list.
tokenized_docs = list(df_translated_documents['tokens'])

dictionary = gensim.corpora.Dictionary(tokenized_docs)
# Drop tokens appearing in fewer than 2 documents or in more than 50% of them.
dictionary.filter_extremes(no_below=2, no_above=0.5)
corpus = [dictionary.doc2bow(tokens) for tokens in tokenized_docs]

# --- Hold out 5% of the documents for the likelihood-bound evaluation ------
train_corpus, heldout_corpus = train_test_split(corpus, test_size=0.05, random_state=42)

# --- Train LDA with the selected hyper-parameters --------------------------
# best_params is a row taken from a DataFrame, so its values arrive as
# NumPy scalars (and as floats if pandas upcast the row). The counts below
# must be integers, so cast them explicitly.
# NOTE(review): alpha/eta are learned ('auto') rather than read from
# best_params['alpha'] / best_params['eta'] — confirm this is intended.
model = LdaModel(
    corpus=train_corpus,
    id2word=dictionary,
    num_topics=int(best_params['num_topics']),
    passes=int(best_params['passes']),
    iterations=int(best_params['iterations']),
    alpha='auto',
    eta='auto',
    random_state=42,
)

# --- Evaluate ---------------------------------------------------------------
# c_v coherence computed against the full set of tokenized documents.
coherence = CoherenceModel(
    model=model,
    texts=tokenized_docs,
    dictionary=dictionary,
    coherence='c_v',
).get_coherence()

# LdaModel.log_perplexity returns the per-word likelihood BOUND on the given
# chunk, not the perplexity itself; the actual perplexity is 2 ** (-bound).
# The variable keeps its original name and value.
perplexity = model.log_perplexity(heldout_corpus)

In [None]:
# Print the 20 highest-weighted terms of every topic, one line per topic.
# Labels are 1-based for readability; the model's own topic ids are 0-based.
for topic_idx in range(model.num_topics):
    top_terms = model.show_topic(topic_idx, topn=20)
    term_list = ", ".join(word for word, _ in top_terms)
    print(f"Topic {topic_idx + 1}: " + term_list)

In [None]:
# Persist the trained artefacts to the working directory:
#   - the LDA model
#   - the filtered dictionary
#   - the training corpus (only the train split, not the held-out part)
from gensim import corpora

model.save("best_ai_enviro_lda.model")
dictionary.save("best_ai_enviro.dict")

# The corpus is written in Matrix Market format.
mm_path = "best_ai_enviro.mm"
corpora.MmCorpus.serialize(mm_path, train_corpus)

In [None]:
from nltk import sent_tokenize

# One dict per paragraph chunk with keys:
# filename, country, chunk_id, text_snippet, topic_id, topic_prob
mappings = []

# NOTE(review): `preprocess` is not defined in the visible cells. It must be
# in scope before this cell runs and should presumably tokenize the same way
# the 'tokens' column was produced, so chunks map onto the dictionary — confirm.
for _, row in df_translated_documents.iterrows():
    text = row['text']
    if not isinstance(text, str):
        # A missing text cell is not a string (pandas reads it as NaN) and
        # has no .split; skip the row instead of crashing the whole loop.
        continue

    # 1) Break into paragraph chunks on blank lines, dropping empty pieces.
    paragraphs = [p.strip() for p in text.split('\n\n') if p.strip()]
    for pid, para in enumerate(paragraphs):
        tokens = preprocess(para)
        bow = dictionary.doc2bow(tokens)
        # 2) Full topic distribution for this chunk. The trained model is
        #    bound to `model`; no `best_model` is defined in this notebook.
        dist = model.get_document_topics(bow, minimum_probability=0)
        # 3) Keep only the highest-probability topic (0-based topic id).
        top_topic, top_prob = max(dist, key=lambda x: x[1])
        mappings.append({
            'filename':     row['filename'],
            'country':      row['country'],
            'chunk_id':     pid,
            'text_snippet': para[:200],   # first 200 chars
            'topic_id':     top_topic,
            'topic_prob':   top_prob
        })

In [None]:
# Collect the per-chunk records into a single table.
chunk_df = pd.DataFrame(mappings)

# Topic ids treated as environment-related.
# NOTE(review): `topic_id` values are the model's 0-based ids, while the topic
# print-out earlier in the notebook labels topics starting at 1 — verify these
# numbers were not copied from the 1-based labels.
env_topics = [0,2,12,15,20,21]
is_env_chunk = chunk_df['topic_id'].isin(env_topics)
env_chunks = chunk_df[is_env_chunk]

# Write both the full mapping and the environment-only subset, without the index.
chunk_df.to_csv('chunk_topic_map.csv', index=False)
env_chunks.to_csv('chunk_topic_map_environmental.csv', index=False)