<a href="https://colab.research.google.com/github/5eunji/Final-project-G3/blob/main/Wordclould_App.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Pre-Listening Activity: Learning New Words**
## 1. Gradio Wordcloud App: Create a word cloud from the text to highlight the most frequent words.

In [None]:
!pip install matplotlib wordcloud nltk translate gradio pandas


import matplotlib.pyplot as plt
from wordcloud import WordCloud
import nltk
from collections import Counter
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
import gradio as gr
import pandas as pd

# Download the NLTK data used below. Recent NLTK releases look up the
# tokenizer and tagger models under new resource names ('punkt_tab',
# 'averaged_perceptron_tagger_eng') while older releases use the original
# names, so both sets are requested to keep the notebook working with
# whichever version `pip install nltk` resolves to.
for resource in (
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'stopwords',
    'wordnet',
):
    nltk.download(resource)

# English stop words are filtered out of the input text in process_text().
stop_words = set(stopwords.words('english'))
# Shared lemmatizer used to recover the base form of inflected verbs.
lemmatizer = WordNetLemmatizer()

# Curated vocabulary entries, keyed by the lower-case word.
# Each value is a 4-tuple:
#   (example sentence, comma-separated synonyms, Korean meaning 1, Korean meaning 2)
# translate_and_get_pos() only emits table rows for words that have an entry
# here; the two Korean meanings are joined into a single "meaning" column.
word_data_examples = {
   "feud": ("The feud between the Montagues and Capulets caused much suffering.", "conflict, quarrel", "불화", "싸움"),
    "family": ("The Montague family was Romeo’s family.", "household, kin", "가족", "가문"),
    "party": ("Romeo secretly attended a Capulet party.", "gathering, celebration", "파티", "모임"),
    "love": ("Their love was pure and strong.", "affection, passion", "사랑", "애정"),
    "hate": ("The hate between the families was unending.", "anger, hostility", "증오", "미움"),
    "window": ("Romeo stood below Juliet’s window.", "pane, opening", "창문", "유리창"),
    "promise": ("Romeo promised to love Juliet forever.", "vow, pledge", "약속", "맹세"),
    "secret": ("Their love remained a secret.", "hidden, private", "비밀", "숨겨진"),
    "marry": ("They decided to marry despite their families’ feud.", "wed, unite", "결혼하다", "혼인하다"),
    "tragedy": ("Romeo and Juliet is a story of tragedy and love.", "disaster, misfortune", "비극", "참사")
}

# Lower-case tokens that are dropped from both the word cloud and the
# vocabulary table.
exclude_words = {'romeo', 'juliet', 'montague', 'capulet', 'oh', 'verona'}

def get_wordnet_pos(treebank_tag):
    """Translate a Penn Treebank POS tag into the matching WordNet POS constant.

    Tags starting with 'J', 'V', 'N' or 'R' map to ``wordnet.ADJ``,
    ``wordnet.VERB``, ``wordnet.NOUN`` and ``wordnet.ADV`` respectively.
    Any other tag yields ``None``.
    """
    # Attribute names are resolved lazily so ``wordnet`` is only touched for
    # a tag that actually matches one of the prefixes.
    prefix_to_attr = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, attr_name in prefix_to_attr:
        if treebank_tag.startswith(prefix):
            return getattr(wordnet, attr_name)
    return None

def process_text(text):
    """Tokenize *text* and return word frequencies plus POS tags.

    The full token sequence is POS-tagged first, so the tagger sees every
    word in its original sentence context and casing. Only afterwards are
    non-alphanumeric tokens, stop words and excluded words removed.

    Args:
        text: Raw input text.

    Returns:
        A ``(word_freq, pos_tags)`` pair where ``word_freq`` is a ``Counter``
        of the kept lower-cased words and ``pos_tags`` is a list of
        ``(lower-cased word, treebank tag)`` tuples, one per kept token in
        text order.
    """
    tokens = nltk.word_tokenize(text)
    tagged_tokens = nltk.pos_tag(tokens)

    pos_tags = []
    for token, tag in tagged_tokens:
        if not token.isalnum():
            continue
        lowered = token.lower()  # computed once and reused for every check
        if lowered in stop_words or lowered in exclude_words:
            continue
        pos_tags.append((lowered, tag))

    word_freq = Counter(word for word, _ in pos_tags)
    return word_freq, pos_tags

def generate_wordcloud(word_freq):
    """Render a word cloud for *word_freq* and save it as ``wordcloud.png``.

    Args:
        word_freq: Mapping of word -> frequency used to size the words.

    Returns:
        The path of the written image file (``'wordcloud.png'``).

    Any exception raised while building or drawing the cloud propagates to
    the caller; the figure is closed in either case.
    """
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(word_freq)
    figure = plt.figure(figsize=(10, 5))
    try:
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis('off')
        figure.savefig('wordcloud.png')
    finally:
        # pyplot keeps every figure it creates alive until it is closed, so
        # without this each request would leave another figure in memory.
        plt.close(figure)
    return 'wordcloud.png'

def translate_and_get_pos(word_freq, pos_tags):
    """Build the vocabulary table rows for words that have a curated entry.

    Args:
        word_freq: Mapping of word -> number of occurrences.
        pos_tags: Sequence of ``(word, treebank_tag)`` pairs for the same
            words (a word may appear several times with different tags).

    Returns:
        A list of ``(word, POS labels, Korean meanings, example sentence,
        synonyms)`` tuples, sorted by descending frequency. Words without an
        entry in ``word_data_examples``, words in ``exclude_words`` and words
        with no tag known to the label map are skipped.
    """
    pos_map = {
        'NN': 'n.', 'NNS': 'n.', 'NNP': 'n.', 'NNPS': 'n.', 'VB': 'v.', 'VBD': 'v. (과거형)', 'VBG': 'v. (ing형)',
        'VBN': 'v. (과거분사형/수동태)', 'VBP': 'v.', 'VBZ': 'v.', 'JJ': 'adj.', 'JJR': 'adj.', 'JJS': 'adj.',
        'RB': 'adv.', 'RBR': 'adv.', 'RBS': 'adv.', 'IN': 'prep.', 'DT': 'det.', 'CC': 'conj.',
        'UH': 'intj.'
    }
    # Tag prefix -> description of the inflected verb form used in the note.
    verb_form_notes = {
        'VBD': '과거형',
        'VBG': 'ing형',
        'VBN': '과거분사형/수동태',
    }

    # Group the tags per word once instead of rescanning pos_tags per word.
    tags_by_word = {}
    for token, tag in pos_tags:
        tags_by_word.setdefault(token, []).append(tag)

    seen_verbs = set()  # Base forms that have already been processed
    word_data = []
    for word in word_freq:
        if word not in word_data_examples or word in exclude_words:
            continue

        word_tags = tags_by_word.get(word, [])
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # label string is the same on every run (a set has no stable order).
        labels = list(dict.fromkeys(pos_map[tag] for tag in word_tags if tag in pos_map))
        if not labels:
            continue  # No tag we know how to label
        pos_str = ", ".join(labels)

        # For inflected verb forms, note which base verb they derive from.
        for tag in word_tags:
            if get_wordnet_pos(tag) != wordnet.VERB:
                continue
            lemma = lemmatizer.lemmatize(word, wordnet.VERB)
            if lemma == word or lemma in seen_verbs:
                continue
            for prefix, form in verb_form_notes.items():
                if tag.startswith(prefix):
                    pos_str += f" (v. {lemma}의 {form})"
                    break
            seen_verbs.add(lemma)

        entry = word_data_examples[word]
        example_sentence, synonyms = entry[:2]
        translation = f"{entry[2]}, {entry[3]}"
        word_data.append((word, pos_str, translation, example_sentence, synonyms))

    # Most frequent words first; ties keep their order of first appearance.
    word_data.sort(key=lambda row: word_freq[row[0]], reverse=True)

    return word_data

def main(text):
    """Gradio callback: turn *text* into a word-cloud image and a vocabulary table.

    Args:
        text: Raw text entered by the user.

    Returns:
        A ``(image, html)`` pair for the ``["image", "html"]`` outputs: the
        path of the rendered word cloud and an HTML table of the vocabulary
        rows. When no usable word remains after filtering, the image is
        ``None`` and the HTML is a short notice instead.
    """
    word_freq, pos_tags = process_text(text)
    if not word_freq:
        # An empty frequency table cannot be rendered as a word cloud, so
        # answer with a notice instead of letting the request fail.
        return None, "<p>No words to display. Please enter some text.</p>"

    wordcloud_image = generate_wordcloud(word_freq)
    word_data = translate_and_get_pos(word_freq, pos_tags)

    # Create a DataFrame to display the word data in a table format
    df = pd.DataFrame(word_data, columns=["어휘 (Word)", "범주 (Category)", "뜻 (Meaning)", "예문 (Example)", "동의어 (Synonyms)"])
    word_data_table = df.to_html(index=False, justify='center')

    return wordcloud_image, word_data_table

# Custom CSS for the Gradio interface. This is raw CSS without a <style>
# wrapper because it is handed to Gradio through the `css` argument below.
css = """
body {
    background-color: skyblue !important;
}
.gr-button {
    background-color: blue !important;
    border-color: blue !important;
}
table {
    width: 100%;
    border-collapse: collapse;
    text-align: center;
}
th, td {
    padding: 8px;
    border: 1px solid #ddd;
}
th {
    background-color: #f2f2f2;
}
"""

# Gradio interface. The stylesheet is attached here: a gr.HTML component
# created outside the interface after launch() is not part of the app, so
# styles passed that way never reach the page.
# NOTE(review): newer Gradio releases may expect `css` on launch() instead of
# the constructor - confirm against the installed version.
interface = gr.Interface(
    fn=main,
    inputs="text",
    outputs=["image", "html"],
    title="Wordcloud Vocabulary Learning App",
    description="Input text to generate a word cloud and a frequency list with Korean meanings, parts of speech, and example sentences."
     "<br><br><b>The full text:</b><br>"
     """<blockquote>Many years ago, in the city of Verona, Italy, there were two families, the Montagues and the Capulets. These two families were always battling and did not like each other.

One day, Romeo Montague secretly attended a Capulet party. There, he saw Juliet Capulet and instantly fell in love. However, their love was in danger because of their families’ feud. After the party, Romeo went to Juliet’s window, and they promised to love each other forever.

Despite their love, the feud between their families grew worse. Their story is one of love, tragedy, and heartbreak.<br><br><i>Copy and paste to try.</i></blockquote>""",
    css=css,
)

# Launch the interface
interface.launch()

