In [2]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# Defining the stop words
stop_words = set(stopwords.words('english'))

# Creating an instance of LancasterStemmer
stemmer = PorterStemmer()

def remove_stopwords_and_stem(text):
    tokenized_text = word_tokenize(text)
    filtered_text = [stemmer.stem(word) for word in tokenized_text if not word in stop_words]
    return " ".join(filtered_text)

example_text = "This is a example text to demonstrate the removal of stop words and stemming."
processed_text = remove_stopwords_and_stem(example_text)
print(f"Original Text: {example_text}")
print(f"Processed Text: {processed_text}")

Original Text: This is a example text to demonstrate the removal of stop words and stemming.
Processed Text: thi exampl text demonstr remov stop word stem .


In [3]:
from nltk.stem import PorterStemmer

# List of sample words
example_words = ["connection", "connections", "connective", "connected", "connecting", "connection"]

# Create object of the Porter Stemmer Class
stemmer = PorterStemmer()

# Stem each word in the list of words
stemmed_words = [stemmer.stem(word) for word in example_words]

print("Stemmed words: ", stemmed_words)

Stemmed words:  ['connect', 'connect', 'connect', 'connect', 'connect', 'connect']


In [4]:
# Import necessary libraries
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import re

# Load stop words from NLTK and initialize a stemmer
stop_words = set(stopwords.words("english"))
stemmer = PorterStemmer()

# Define function for text cleaning and stemming
def clean_text(text):
    text = text.lower()  # Convert text to lower case
    text = re.sub(r'\S*@\S*\s?', '', text)  # Remove email addresses
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'\W', ' ', text)  # Remove punctuation and special characters
    text = re.sub(r'\d', ' ', text)  # Remove digits
    text = re.sub(r'\s\s+', ' ', text)  # Remove extra spaces

    tokenized_text = word_tokenize(text)
    filtered_text = [stemmer.stem(word) for word in tokenized_text if not word in stop_words]

    return " ".join(filtered_text)

# Fetching 20 newsgroups dataset and restricting to first 100 records for performance
newsgroups_data = fetch_20newsgroups(subset='all')['data'][:100]

# Clean and preprocess the newsgroup data
cleaned_data = [clean_text(data) for data in newsgroups_data]

# Setup the CountVectorizer to generate unigrams, bigrams, and trigrams
vectorizer = CountVectorizer(ngram_range=(2, 3)) 

# Apply the CountVectorizer on the cleaned data to create n-grams
X = vectorizer.fit_transform(cleaned_data)

# Display the shape of X and some features
print("Shape of X with n-grams: ", X.shape)
features = vectorizer.get_feature_names_out()
print("Features from index 100 to 110: ", features[100:111])

Shape of X with n-grams:  (100, 25111)
Features from index 100 to 110:  ['ac uk mani' 'academ comput' 'academ comput john' 'acceler chipset'
 'acceler chipset isa' 'acceler went' 'acceler went back'
 'accept argument' 'accept argument govern' 'accept notion'
 'accept notion time']


In [5]:
# Import necessary libraries
from sklearn.feature_extraction.text import CountVectorizer

# Simple short text
text = "I love studying data science"

# Setup the CountVectorizer to generate bigrams and trigrams
vectorizer = CountVectorizer(ngram_range=(2, 3)) 

# Apply the CountVectorizer on the text to create n-grams
X = vectorizer.fit_transform([text])

# Display the shape of X and some features
print("Shape of X with n-grams: ", X.shape)
features = vectorizer.get_feature_names_out()
print("Features: ", features)

Shape of X with n-grams:  (1, 5)
Features:  ['data science' 'love studying' 'love studying data' 'studying data'
 'studying data science']


In [21]:
# Import necessary libraries
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

# Define two simple short texts
text1 = "data love data science passion"
text2 = "science love algorithm data passion"

# Setup the CountVectorizer to generate unigrams only
vectorizer = CountVectorizer(ngram_range=(1, 1))

# Apply the CountVectorizer on the text to create n-grams
X = vectorizer.fit_transform([text1, text2])

# Display the shape of X and some features
print("Shape of X with n-grams: ", X.shape)
features = vectorizer.get_feature_names_out()
print("Features: ", np.sort(features))
print("Matrix X: ", X.toarray())

Shape of X with n-grams:  (2, 5)
Features:  ['algorithm' 'data' 'love' 'passion' 'science']
Matrix X:  [[0 2 1 1 1]
 [1 1 1 1 1]]


In [7]:
# Import necessary libraries
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import re

# Load stop words from NLTK and initialize a stemmer
stop_words = set(stopwords.words("english"))
stemmer = PorterStemmer()

# Define function for text cleaning and stemming
def clean_text(text):
    text = text.lower()  # Convert text to lower case
    text = re.sub(r'\S*@\S*\s?', '', text)  # Remove email addresses
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'\W', ' ', text)  # Remove punctuation and special characters
    text = re.sub(r'\d', ' ', text)  # Remove digits
    text = re.sub(r'\s\s+', ' ', text)  # Remove extra spaces

    tokenized_text = word_tokenize(text)
    filtered_text = [stemmer.stem(word) for word in tokenized_text if not word in stop_words]

    return " ".join(filtered_text)

# Simple short sentences
text1 = "I love studying data science"
text2 = "Science is a field I am passionate about"
text3 = "Algorithms fascinate me"
text4 = "It is a joy to work on data algorithms"

# Clean and preprocess the text
cleaned_text_1 = clean_text(text1)
cleaned_text_2 = clean_text(text2)
cleaned_text_3 = clean_text(text3)
cleaned_text_4 = clean_text(text4)

# TODO: Setup the CountVectorizer to generate bigrams only
vectorizer = CountVectorizer(ngram_range=(2,2))
# TODO: Apply the CountVectorizer on the cleaned data to create n-grams
X = vectorizer.fit_transform([cleaned_text_1,cleaned_text_2,cleaned_text_3,cleaned_text_4])
# Display the shape of X and some features
print("Shape of X with n-grams: ", X.shape)
features = vectorizer.get_feature_names_out()
print("Features: ", features)
print("Matrix X: ", X.toarray())

Shape of X with n-grams:  (4, 19)
Features:  ['algorithm' 'algorithm fascin' 'data' 'data algorithm' 'data scienc'
 'fascin' 'field' 'field passion' 'joy' 'joy work' 'love' 'love studi'
 'passion' 'scienc' 'scienc field' 'studi' 'studi data' 'work' 'work data']
Matrix X:  [[0 0 1 0 1 0 0 0 0 0 1 1 0 1 0 1 1 0 0]
 [0 0 0 0 0 0 1 1 0 0 0 0 1 1 1 0 0 0 0]
 [1 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 1 1 0 0 0 0 1 1 0 0 0 0 0 0 0 1 1]]


In [23]:
# Import necessary libraries
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.datasets import fetch_20newsgroups
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import re
import numpy as np

# Load stop words from NLTK and initialize a stemmer
stop_words = set(stopwords.words("english"))
stemmer = PorterStemmer()

# Define function for text cleaning and stemming
def clean_text(text):
    text = text.lower()  # Convert text to lower case
    text = re.sub(r'\S*@\S*\s?', '', text)  # Remove email addresses
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'\W', ' ', text)  # Remove punctuation and special characters
    text = re.sub(r'\d', ' ', text)  # Remove digits
    text = re.sub(r'\s\s+', ' ', text)  # Remove extra spaces

    tokenized_text = word_tokenize(text)
    filtered_text = [stemmer.stem(word) for word in tokenized_text if not word in stop_words]

    return " ".join(filtered_text)

# TODO: Fetch the 20 Newsgroups dataset
newsgroups_data = fetch_20newsgroups(subset='all')
# TODO: For performance consideration, limit to the first 100 documents
newsgroups_data = newsgroups_data['data'][:100]
# TODO: Clean and preprocess the Newsgroup data
cleaned_data = [clean_text(data) for data in newsgroups_data]
# TODO: Setup the CountVectorizer to generate both unigrams and bigrams
vectorizer = CountVectorizer(ngram_range=(1,2))
# TODO: Apply the CountVectorizer on the cleaned data to create n-grams
X = vectorizer.fit_transform(cleaned_data)
# TODO: Display the number of documents, the total number of features, and the last 10 features sorted alphabetically
features = vectorizer.get_feature_names_out()
print("Features: ", np.sort(features[-10:]))
print("The total number of features", len(features))

Features:  ['zero guess' 'zezel' 'zezel phi' 'zod' 'zoolog' 'zoolog depart' 'zur'
 'zur realisierung' 'zurich' 'zurich ch']
The total number of features 16246
