<a href="https://colab.research.google.com/github/AmirishettyAkhila/2303A51L04/blob/main/Untitled32.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Ensure required NLTK tokenizer data is downloaded.
# word_tokenize / sent_tokenize rely on the Punkt sentence model. Older NLTK
# releases load it from the 'punkt' package, while newer releases (3.9+) look
# for 'punkt_tab' instead and raise LookupError when only 'punkt' is present.
# Fetching both keeps the script working with either generation of NLTK;
# nltk.download reports a failure instead of raising if a package is missing.
nltk.download('punkt')
nltk.download('punkt_tab')

# Initialize the stemmer once at module level; stem_words() reuses it.
stemmer = PorterStemmer()

def tokenize_text(text):
    """Split ``text`` into word tokens and into sentences.

    Returns a ``(words, sentences)`` pair: the result of ``word_tokenize``
    followed by the result of ``sent_tokenize``, both applied to the same
    unmodified input string.
    """
    return word_tokenize(text), sent_tokenize(text)

def stem_words(words):
    """Return a new list holding the stem of each entry in ``words``.

    Stemming is delegated to the module-level ``stemmer``; the input order
    is preserved and the input itself is not modified.
    """
    stems = []
    for token in words:
        stems.append(stemmer.stem(token))
    return stems

def represent_words(texts):
    """Vectorize ``texts`` with raw term counts and with TF-IDF weights.

    A fresh ``CountVectorizer`` and a fresh ``TfidfVectorizer`` are each
    fitted on ``texts`` with default settings.

    Returns a 4-tuple:
        (count feature names, dense count matrix,
         TF-IDF feature names, dense TF-IDF matrix)
    """
    bow_model = CountVectorizer()
    weighted_model = TfidfVectorizer()

    # Fit each model and densify its sparse output right away.
    bow_dense = bow_model.fit_transform(texts).toarray()
    weighted_dense = weighted_model.fit_transform(texts).toarray()

    bow_vocab = bow_model.get_feature_names_out()
    weighted_vocab = weighted_model.get_feature_names_out()
    return bow_vocab, bow_dense, weighted_vocab, weighted_dense

def represent_sentences(texts):
    """Vectorize ``texts`` with raw term counts and with TF-IDF weights.

    The count-based vectorizer is fitted first, then the TF-IDF one, each
    with default settings on the same ``texts``.

    Returns a 4-tuple:
        (count feature names, dense count matrix,
         TF-IDF feature names, dense TF-IDF matrix)
    """
    collected = []
    for vectorizer_cls in (CountVectorizer, TfidfVectorizer):
        model = vectorizer_cls()
        dense = model.fit_transform(texts).toarray()
        # Keep the (feature names, matrix) ordering for each vectorizer.
        collected.append(model.get_feature_names_out())
        collected.append(dense)
    return tuple(collected)

def main():
    """Run the demo: tokenize, stem, then vectorize sample text, printing each result."""
    # --- Tokenization -----------------------------------------------------
    sample = "Hello there! How are you doing today? NLP is fascinating."
    tokens, sents = tokenize_text(sample)
    print("Words:", tokens)
    print("Sentences:", sents)

    # --- Stemming ---------------------------------------------------------
    inflected = ["running", "ran", "runs", "easily", "fairly"]
    stems = stem_words(inflected)
    print("\nOriginal Words:", inflected)
    print("Stemmed Words:", stems)

    # --- Vector representations -------------------------------------------
    corpus = [
        "NLP is fun and interesting.",
        "NLP involves linguistics and computer science.",
    ]

    # Both reports share one layout; only the heading and the function differ.
    sections = (
        ("\nWord Representation:", represent_words),
        ("\nSentence Representation:", represent_sentences),
    )
    for heading, vectorize in sections:
        bow_vocab, bow_dense, weighted_vocab, weighted_dense = vectorize(corpus)
        print(heading)
        print("Count Vectorizer Features:", bow_vocab)
        print("Count Matrix:\n", bow_dense)
        print("TF-IDF Vectorizer Features:", weighted_vocab)
        print("TF-IDF Matrix:\n", weighted_dense)

# Run the demo only when this file is executed as a script (or notebook cell),
# not when it is imported as a module.
if __name__ == "__main__":
    main()


Words: ['Hello', 'there', '!', 'How', 'are', 'you', 'doing', 'today', '?', 'NLP', 'is', 'fascinating', '.']
Sentences: ['Hello there!', 'How are you doing today?', 'NLP is fascinating.']

Original Words: ['running', 'ran', 'runs', 'easily', 'fairly']
Stemmed Words: ['run', 'ran', 'run', 'easili', 'fairli']

Word Representation:
Count Vectorizer Features: ['and' 'computer' 'fun' 'interesting' 'involves' 'is' 'linguistics' 'nlp'
 'science']
Count Matrix:
 [[1 0 1 1 0 1 0 1 0]
 [1 1 0 0 1 0 1 1 1]]
TF-IDF Vectorizer Features: ['and' 'computer' 'fun' 'interesting' 'involves' 'is' 'linguistics' 'nlp'
 'science']
TF-IDF Matrix:
 [[0.35520009 0.         0.49922133 0.49922133 0.         0.49922133
  0.         0.35520009 0.        ]
 [0.31779954 0.44665616 0.         0.         0.44665616 0.
  0.44665616 0.31779954 0.44665616]]

Sentence Representation:
Count Vectorizer Features: ['and' 'computer' 'fun' 'interesting' 'involves' 'is' 'linguistics' 'nlp'
 'science']
Count Matrix:
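 [[1 0 1 1 0 1 0 1 0]
 [1 1 0 0 1 0 1 1 1]]
TF-IDF Vectorizer Features: ['and' 'computer' 'fun' 'interesting' 'involves' 'is' 'linguistics' 'nlp'
 'science']
TF-IDF Matrix:
 [[0.35520009 0.         0.49922133 0.49922133 0.         0.49922133
  0.         0.35520009 0.        ]
 [0.31779954 0.44665616 0.         0.         0.44665616 0.
  0.44665616 0.31779954 0.44665616]]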

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
