In [31]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [32]:
import os
project_path = '/content/drive/MyDrive/NLP 11'
os.chdir(project_path)

In [33]:
!pip install gensim wordcloud nltk scikit-learn matplotlib




In [34]:
"""
How to use:
1. Save this file as solution.py
2. Put QTL_text.json and Trait_dictionary.txt in the same folder
3. Run in Spyder (or terminal): python solution.py
4. Results will be saved in ./outputs folder
"""

import os, json, re, string, logging
from collections import Counter

import nltk
# Minimal, correct downloads
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("averaged_perceptron_tagger")
from nltk.corpus import stopwords
from nltk import sent_tokenize, word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from gensim.models.phrases import Phrases, Phraser
from wordcloud import WordCloud

logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
INPUT_JSON = "QTL_text.json"
TRAIT_DICT = "Trait_dictionary.txt"
OUT_DIR = "outputs"
os.makedirs(OUT_DIR, exist_ok=True)

EN_STOPWORDS = set(stopwords.words("english"))
PUNCT_CHARS = set(string.punctuation)


def load_qtl(path):
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)
    return [r["Abstract"] for r in data if r.get("Category") == "1"]

def load_traits(path):
    with open(path, "r", encoding="utf-8") as f:
        return {line.strip().lower() for line in f if line.strip()}

def preprocess(texts):
    """Tokenize, lowercase, remove stopwords & punctuation"""
    tokenized_docs, tokenized_sents = [], []
    for abs_text in texts:
        doc_tokens = []
        for sent in sent_tokenize(abs_text):
            tokens = [t.lower() for t in word_tokenize(sent)]
            tokens = [t for t in tokens if re.match(r"[a-z0-9\-]+", t)]
            tokens = [t for t in tokens if t not in EN_STOPWORDS and t not in PUNCT_CHARS]
            if tokens:
                tokenized_sents.append(tokens)
                doc_tokens.extend(tokens)
        if doc_tokens:
            tokenized_docs.append(doc_tokens)
    return tokenized_docs, tokenized_sents

def make_wordcloud(weights, filename):
    wc = WordCloud(width=800, height=800, background_color="white")
    wc.generate_from_frequencies(weights)
    out_path = os.path.join(OUT_DIR, filename)
    wc.to_file(out_path)
    logging.info("Saved %s", out_path)


def task1_wordclouds(tokenized_docs):
    term_freqs = Counter([t for doc in tokenized_docs for t in doc])
    make_wordcloud(term_freqs, "qtl_wordcloud_frequency.png")

    joined_texts = [" ".join(doc) for doc in tokenized_docs]
    tfidf_vectorizer = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b")
    tfidf_matrix = tfidf_vectorizer.fit_transform(joined_texts)
    term_scores = tfidf_matrix.sum(axis=0).A1
    tfidf_scores = dict(zip(tfidf_vectorizer.get_feature_names_out(), term_scores))
    make_wordcloud(tfidf_scores, "qtl_wordcloud_tfidf.png")
    return tfidf_scores

# Task 2: Word2Vec
def task2_word2vec(tokenized_sents, tfidf_scores):
    w2v_model = Word2Vec(
        sentences=tokenized_sents,
        vector_size=100,
        window=5,
        min_count=10,
        workers=4
    )
    top10_terms = sorted(tfidf_scores.items(), key=lambda x: x[1], reverse=True)[:10]
    out_path = os.path.join(OUT_DIR, "qtl_similar_words_raw.txt")
    with open(out_path, "w", encoding="utf-8") as f:
        for word, _ in top10_terms:
            f.write(f"== {word} ==\n")
            if word in w2v_model.wv:
                for w, s in w2v_model.wv.most_similar(word, topn=20):
                    f.write(f"{w}\t{s:.4f}\n")
            else:
                f.write("(not in vocab)\n")
            f.write("\n")
    logging.info("Saved %s", out_path)

def task3_phrases(tokenized_sents, tokenized_docs, trait_set):

    bigram_phr = Phraser(Phrases(tokenized_sents, min_count=5, threshold=5.0))
    # Apply bigrams first
    sents_bi = [bigram_phr[s] for s in tokenized_sents]
    docs_bi = [bigram_phr[d] for d in tokenized_docs]

    # Learn trigrams
    trigram_phr = Phraser(Phrases(sents_bi, min_count=5, threshold=5.0))
    # Apply trigrams
    sents_bi_tri = [trigram_phr[s] for s in sents_bi]
    docs_bi_tri = [trigram_phr[d] for d in docs_bi]

    logging.info("Bigrams and trigrams applied to corpus.")

    # Word clouds
    term_freqs_phr = Counter([t for doc in docs_bi_tri for t in doc])
    make_wordcloud(term_freqs_phr, "qtl_phrase_wordcloud_frequency.png")

    joined_texts_phr = [" ".join(doc) for doc in docs_bi_tri]
    tfidf_vectorizer_phr = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b")
    tfidf_matrix_phr = tfidf_vectorizer_phr.fit_transform(joined_texts_phr)
    term_scores_phr = tfidf_matrix_phr.sum(axis=0).A1
    tfidf_scores_phr = dict(zip(tfidf_vectorizer_phr.get_feature_names_out(), term_scores_phr))
    make_wordcloud(tfidf_scores_phr, "qtl_phrase_wordcloud_tfidf.png")

    # Word2Vec on phrase-applied sentences
    w2v_model_phr = Word2Vec(
        sentences=sents_bi_tri,
        vector_size=100,
        window=5,
        min_count=10,
        workers=4
    )
    top10_terms_phr = sorted(tfidf_scores_phr.items(), key=lambda x: x[1], reverse=True)[:10]
    out_sim_path = os.path.join(OUT_DIR, "qtl_similar_words_phrases.txt")
    with open(out_sim_path, "w", encoding="utf-8") as f:
        for word, _ in top10_terms_phr:
            f.write(f"== {word} ==\n")
            if word in w2v_model_phr.wv:
                for w, s in w2v_model_phr.wv.most_similar(word, topn=20):
                    f.write(f"{w}\t{s:.4f}\n")
            else:
                f.write("(not in vocab)\n")
            f.write("\n")

    # Dictionary match (normalize underscores -> spaces)
    extracted_phrases_set = {t for doc in docs_bi_tri for t in doc if "_" in t}
    normalized_phrases_set = {p.replace("_", " ").lower() for p in extracted_phrases_set}
    dictionary_matches = [p for p in normalized_phrases_set if p in trait_set]

    out_match_path = os.path.join(OUT_DIR, "qtl_trait_dictionary_match_gensim.txt")
    with open(out_match_path, "w", encoding="utf-8") as f:
        f.write(f"Total extracted phrases: {len(normalized_phrases_set)}\n")
        f.write(f"Matches in trait dictionary: {len(dictionary_matches)}\n")
        f.write("Matched terms:\n")
        for m in sorted(dictionary_matches):
            f.write(m + "\n")
    logging.info("Saved %s", out_match_path)


# Task 4: NP Chunking + dictionary match
def task4_np_chunking(tokenized_sents, trait_set):
    grammar = r"NP: {<DT>?<JJ>{0,2}<NN.*>+}"
    cp = nltk.RegexpParser(grammar)

    np_extracted_phrases = []
    for sent in tokenized_sents:
        tagged = nltk.pos_tag(sent)
        tree = cp.parse(tagged)
        for subtree in tree.subtrees(filter=lambda t: t.label() == 'NP'):
            phrase = " ".join(word for word, pos in subtree.leaves())
            np_extracted_phrases.append(phrase.lower())

    unique_np_phrases = set(np_extracted_phrases)
    np_dictionary_matches = [p for p in unique_np_phrases if p in trait_set]

    out_np_path = os.path.join(OUT_DIR, "qtl_trait_dictionary_match_npchunk.txt")
    with open(out_np_path, "w", encoding="utf-8") as f:
        f.write(f"Total extracted NP phrases: {len(unique_np_phrases)}\n")
        f.write(f"Matches in trait dictionary: {len(np_dictionary_matches)}\n")
        f.write("Matched terms:\n")
        for m in sorted(np_dictionary_matches):
            f.write(m + "\n")

    logging.info("Saved %s", out_np_path)


# Main
def main():
    qtl_abstracts = load_qtl(INPUT_JSON)
    trait_set = load_traits(TRAIT_DICT)
    tokenized_docs, tokenized_sents = preprocess(qtl_abstracts)

    tfidf_scores = task1_wordclouds(tokenized_docs)
    task2_word2vec(tokenized_sents, tfidf_scores)
    task3_phrases(tokenized_sents, tokenized_docs, trait_set)  # now includes trigrams internally
    task4_np_chunking(tokenized_sents, trait_set)

if __name__ == "__main__":
    main()


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
