In [None]:
import re
import requests
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from bs4 import BeautifulSoup
from collections import Counter
import nltk
from nltk.corpus import stopwords

In [None]:
# Make sure the NLTK stopwords corpus is available locally; it is downloaded
# only when the lookup below raises LookupError.
try:
    nltk.data.find('corpora/stopwords.zip')
except LookupError:
    nltk.download('stopwords')

# English stopwords held in a set so the membership test in remove_stopwords
# is a hash lookup rather than a list scan.
stop_words = set(stopwords.words('english'))

In [None]:
def get_content(article_name, timeout=10):
    """Fetch an English Wikipedia article and return its paragraph text.

    Args:
        article_name: Article title as it appears in the URL path
            (e.g. "Ozone_layer").
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests.get may block indefinitely.

    Returns:
        The text of every <p> element found inside the element with
        id "bodyContent", joined by single spaces, or None when the request
        or the parsing fails (the error is printed).
    """
    url = f"https://en.wikipedia.org/wiki/{article_name}"
    try:
        response = requests.get(url, timeout=timeout)
        # Fail on 4xx/5xx so an error page is never analysed as article text.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")
        body = soup.find(id='bodyContent')
        if body is None:
            raise ValueError(f"no 'bodyContent' element found at {url}")
        return ' '.join(p.text for p in body.find_all('p'))
    except Exception as e:
        # Best-effort contract: report the problem and let the caller
        # decide what to do with None.
        print(f"Error: {e}")
        return None

In [None]:
def merge_contents(data):
    """Normalize raw article text into lowercase letters and whitespace.

    Steps, in order:
      1. Drop spans that look like markup: ``[[...]]``, ``{...}`` and
         ``<...>`` (each matched lazily, within a single line).
      2. Drop every character that is not an ASCII letter or whitespace.
      3. Lowercase the result.

    Args:
        data: The text to clean.

    Returns:
        The cleaned, lowercased string. Whitespace is left untouched, so
        removed spans may leave consecutive spaces behind.
    """
    markup_pattern = r'\[\[.*?\]\]|\{.*?\}|<.*?>'
    non_letter_pattern = r'[^a-zA-Z\s]'

    without_markup = re.sub(markup_pattern, '', data)
    letters_only = re.sub(non_letter_pattern, '', without_markup)
    return letters_only.lower()

In [None]:
def tokenize(content):
    """Split text into tokens on runs of whitespace.

    Args:
        content: The string to split.

    Returns:
        A list of the whitespace-separated pieces; empty or
        whitespace-only input yields an empty list.
    """
    tokens = content.split()
    return tokens

In [None]:
def lower_collection(collection):
    """Return a new list with each item of ``collection`` lowercased.

    Args:
        collection: An iterable whose items provide a ``lower()`` method.

    Returns:
        A list of the lowercased items, in the original order.
    """
    lowered = []
    for item in collection:
        lowered.append(item.lower())
    return lowered

In [None]:
def count_word_frequencies(text):
    """Count how many times each item occurs in ``text``.

    Args:
        text: An iterable of hashable items (here, a list of tokens).
            Note that passing a plain string counts its characters.

    Returns:
        A collections.Counter mapping each item to its number of
        occurrences.
    """
    frequencies = Counter()
    frequencies.update(text)
    return frequencies

In [None]:
def remove_stopwords(text, extra_stopwords=("nm", "uv", "cfcs", "also")):
    """Filter stopwords and extra excluded terms out of a token list.

    Matching is case-insensitive for both the module-level ``stop_words``
    set and ``extra_stopwords``, so "The" is dropped just like "the".

    Args:
        text: An iterable of string tokens.
        extra_stopwords: Additional terms to drop on top of ``stop_words``.
            Defaults to the terms this notebook excludes for its article.

    Returns:
        A list of the tokens that were kept, in their original order and
        original casing.
    """
    # Build the combined exclusion set once instead of once per token.
    excluded = stop_words.union(term.lower() for term in extra_stopwords)
    return [
        word for word in text
        # The exact-match test is kept so that any entry of stop_words that
        # is not lowercase still filters as it did before.
        if word not in excluded and word.lower() not in excluded
    ]

In [None]:
def plot_most_frequent_words(word_counts, n=25):
    """Plot the most frequent words as a bar chart.

    Counts go on the x-axis and words on the y-axis.

    Args:
        word_counts: A collections.Counter (or any object with a compatible
            ``most_common`` method) mapping words to counts.
        n: Maximum number of words to show.

    Returns:
        None. The chart is shown with ``plt.show()``; when there are no
        words to plot, a message is printed and nothing is drawn.
    """
    top_words = dict(word_counts.most_common(n))
    if not top_words:
        print("No words to plot.")
        return

    ax = sns.barplot(x=list(top_words.values()), y=list(top_words.keys()), palette="hsv")
    # Report the number of bars actually drawn instead of a fixed "25".
    ax.set_title(f'Top {len(top_words)} Most Frequent Words')
    ax.set_xlabel('Frequency')
    ax.set_ylabel('Words')
    plt.show()

In [None]:
def main(article_name="Ozone_layer"):
    """Fetch an article, clean and tokenize it, and plot its top words.

    Args:
        article_name: Wikipedia article title passed to get_content.
    """
    content = get_content(article_name)
    if not content:
        # Nothing to analyse: get_content returned None or empty text.
        return

    tokens = tokenize(merge_contents(content))
    filtered_word_frequencies = count_word_frequencies(remove_stopwords(tokens))
    plot_most_frequent_words(filtered_word_frequencies)

# Run the analysis only when this code is executed directly, not when it is
# imported as a module.
if __name__ == "__main__":
    main()