In [None]:
# Install dependencies
!pip install numpy scipy scikit-learn nltk spacy matplotlib gensim

# If glove-python is required, install it (may fail on newer Python versions)
!pip install git+https://github.com/maciejkula/glove-python.git

# Download GloVe pre-trained embeddings (e.g., 100-dimensional vectors)
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip -q glove.6B.zip -d glove.6B

# Download NLTK data (if required)
import nltk
nltk.download('punkt')

# Set up SpaCy (if needed)
!python -m spacy download en_core_web_sm


In [None]:
# Import necessary libraries
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.probability import FreqDist
import spacy
from spacy import displacy
import matplotlib.pyplot as plt


In [None]:

# Download NLTK resources.
# Recent NLTK releases load the Punkt tokenizer data from 'punkt_tab' instead
# of the pickled 'punkt' package, so both are fetched; this keeps
# sent_tokenize/word_tokenize working on older and newer NLTK versions alike.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Sample text: a short three-sentence passage that is passed unchanged to the
# NLTK and spaCy functions when the pipeline is run. It mentions company names,
# language-model names and a year. The triple-quoted literal begins and ends
# with a newline character.
text = """
Artificial intelligence (AI) is revolutionizing industries worldwide. Companies like OpenAI and Google are at the forefront, 
developing large language models such as ChatGPT and Bard. In 2024, advancements in AI are expected to enhance medical diagnoses, 
autonomous driving, and personalized education.
"""


In [None]:

# Step 1: Preprocessing with NLTK
def nltk_preprocessing(text):
    """Tokenize ``text`` with NLTK and return its content words.

    The sentence split, the raw token list and the filtered token list are
    each printed as they are produced.

    Args:
        text: The raw string to process.

    Returns:
        A list of the tokens, in their original order and casing, that are
        alphanumeric and whose lowercase form is not in NLTK's English
        stopword list.
    """
    # Sentence-level split (only reported, not used for filtering).
    sentence_list = sent_tokenize(text)
    print("Sentences:", sentence_list)

    # Word-level split.
    tokens = word_tokenize(text)
    print("\nWords:", tokens)

    # Drop punctuation-like tokens and English stopwords.
    english_stopwords = set(stopwords.words("english"))
    kept = []
    for token in tokens:
        if not token.isalnum():
            continue
        if token.lower() in english_stopwords:
            continue
        kept.append(token)
    print("\nFiltered Words:", kept)

    return kept


In [None]:

# Step 2: Word Frequency Distribution with NLTK
def nltk_word_frequency(words):
    """Print the five most common words and plot the ten most common.

    Args:
        words: Iterable of tokens to count.

    Returns:
        None. The frequencies are printed and the plot is shown as side
        effects.
    """
    freq_dist = FreqDist(words)
    print("\nWord Frequency:", freq_dist.most_common(5))

    # Visualize word frequency.
    plt.figure(figsize=(10, 6))
    # The title is handed to FreqDist.plot() rather than set with plt.title()
    # afterwards: on NLTK versions where plot() shows the figure itself, a
    # later plt.title() would land on a new, empty figure.
    freq_dist.plot(10, cumulative=False, title="Word Frequency Distribution")
    plt.show()


In [None]:

# Step 3: Entity Recognition and Visualization with spaCy
def spacy_ner_visualization(text, nlp=None):
    """Print the named entities found in ``text`` and render them with displaCy.

    Args:
        text: The raw string to analyse.
        nlp: Optional, already-loaded spaCy pipeline. When omitted, the
            "en_core_web_sm" model is loaded, as before; passing one avoids
            reloading the model on every call.

    Returns:
        None. Entities are printed and the visualization is displayed as side
        effects.
    """
    # Load spaCy's English model unless the caller supplied a pipeline.
    if nlp is None:
        nlp = spacy.load("en_core_web_sm")
    doc = nlp(text)

    # Display entities
    print("\nNamed Entities:")
    for ent in doc.ents:
        print(f"{ent.text} ({ent.label_})")

    # Visualize entities using spaCy's DisplaCy.
    # The jupyter argument is left at its auto-detecting default: with
    # jupyter=False, render() only returns the markup as a string, which was
    # discarded here, so nothing was ever displayed in the notebook.
    displacy.render(doc, style="ent")



In [None]:
# Step 4: Part-of-Speech Tagging with spaCy
def spacy_pos_analysis(text, nlp=None):
    """Print the part-of-speech tags of ``text`` and render its dependency parse.

    Args:
        text: The raw string to analyse.
        nlp: Optional, already-loaded spaCy pipeline. When omitted, the
            "en_core_web_sm" model is loaded, as before; passing one avoids
            reloading the model on every call.

    Returns:
        None. Tags are printed and the visualization is displayed as side
        effects.
    """
    # Load spaCy's English model unless the caller supplied a pipeline.
    if nlp is None:
        nlp = spacy.load("en_core_web_sm")
    doc = nlp(text)

    # POS tagging: coarse tag (pos_) followed by the fine-grained tag (tag_).
    print("\nPart-of-Speech Tags:")
    for token in doc:
        print(f"{token.text}: {token.pos_} ({token.tag_})")

    # Visualize POS tagging (e.g., dependency parse tree).
    # The jupyter argument is left at its auto-detecting default: with
    # jupyter=False, render() only returns the markup as a string, which was
    # discarded here, so nothing was ever displayed in the notebook.
    displacy.render(doc, style="dep")



In [None]:
# Run the pipeline: NLTK preprocessing and frequency plot first, then the two
# spaCy stages, each announced by its own banner line.
print("\n--- NLTK Preprocessing ---")
filtered_words = nltk_preprocessing(text)
nltk_word_frequency(filtered_words)

# Both spaCy stages take the raw text, so they share one banner-then-call loop.
for banner, stage in (
    ("\n--- spaCy NER Visualization ---", spacy_ner_visualization),
    ("\n--- spaCy POS Analysis ---", spacy_pos_analysis),
):
    print(banner)
    stage(text)
