In [1]:
# Import necessary Libraries
import nltk
from nltk.corpus import movie_reviews
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

# Load IMDB Movie Reviews Dataset (quiet=True suppresses the download log)
nltk.download('movie_reviews', quiet=True)

# Working with first 100 reviews
first_100_reviewids = movie_reviews.fileids()[:100]
reviews = [movie_reviews.raw(fileid) for fileid in first_100_reviewids]

# Transform raw data into TF-IDF matrix; English stop words are dropped by the vectorizer
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(reviews)
print(f"Shape of the features matrix before dimensionality reduction: {tfidf_matrix.shape}\n")

# Apply TruncatedSVD for Dimensionality Reduction.
# The default solver is randomized, so random_state is pinned to make the reduced
# features reproducible across kernel restarts.
svd = TruncatedSVD(n_components=50, random_state=42)
features = svd.fit_transform(tfidf_matrix)

# Print shape after dimensionality reduction
print(f"Shape of the features matrix after dimensionality reduction: {features.shape}")

Shape of the features matrix before dimensionality reduction: (100, 8865)

Shape of the features matrix after dimensionality reduction: (100, 50)


In [2]:
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

# Sample sentences: a three-document toy corpus
sentences = ["The dog barked at the mail carrier", 
             "The mail carrier was afraid of the small dog", 
             "The small dog was not ferocious but it did like to bark"]

# Transform sentences into TF-IDF matrix (rows = sentences, columns = vocabulary terms)
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(sentences)
print(f"Shape of the features matrix before dimensionality reduction: {tfidf_matrix.shape}\n")

# Initialize TruncatedSVD with desired number of components.
# random_state is fixed because the default solver is randomized; without a seed the
# printed matrix below is not guaranteed to be identical from run to run.
svd = TruncatedSVD(n_components=2, random_state=42)

# Apply TruncatedSVD on the TF-IDF matrix
features = svd.fit_transform(tfidf_matrix)

# Print shape after dimensionality reduction
print(f"Shape of the features matrix after dimensionality reduction: {features.shape}")
print(f"Reduced features matrix: \n{features}")

Shape of the features matrix before dimensionality reduction: (3, 10)

Shape of the features matrix after dimensionality reduction: (3, 2)
Reduced features matrix: 
[[ 0.81683228 -0.37215178]
 [ 0.87059126 -0.11198387]
 [ 0.455513    0.881375  ]]


In [3]:
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD

# Sample sentences: a three-document toy corpus
sentences = ["The cat meowed at the room window", 
             "The small kitten looked at the window and meowed", 
             "The ginger kitten didn't meow but it looked at the window"]

# Transform sentences into Bag-of-Words matrix (raw term counts, English stop words removed)
vectorizer = CountVectorizer(stop_words='english')
bow_matrix = vectorizer.fit_transform(sentences)
print(f"Shape of the features matrix before dimensionality reduction: {bow_matrix.shape}\n")

# Initialize TruncatedSVD with desired number of components.
# random_state is fixed because the default solver is randomized; without a seed the
# printed matrix below is not guaranteed to be identical from run to run.
svd = TruncatedSVD(n_components=2, random_state=42)

# Apply TruncatedSVD on the Bag-of-Words matrix
features = svd.fit_transform(bow_matrix)

# Print shape after dimensionality reduction
print(f"Shape of the features matrix after dimensionality reduction: {features.shape}")
print(f"Reduced features matrix: \n{features}")

Shape of the features matrix before dimensionality reduction: (3, 10)

Shape of the features matrix after dimensionality reduction: (3, 2)
Reduced features matrix: 
[[ 1.11616594  1.49849714]
 [ 1.94634624  0.37713888]
 [ 2.07711431 -1.15863355]]


In [5]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD

# Fetch the NLTK resources this cell relies on so it also runs on a fresh environment:
# the stop-word lists read by `stopwords.words` and the Punkt tokenizer models used by
# `word_tokenize`. Newer NLTK releases look the models up under 'punkt_tab', older ones
# under 'punkt', so both are requested.
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)

# Load stop words from NLTK (as a set for fast membership tests) and initialize a stemmer
stop_words = set(stopwords.words("english"))
stemmer = PorterStemmer()

# Define function for text cleaning and stemming
def clean_text(text):
    """Return `text` reduced to a space-separated string of stemmed tokens.

    Steps, in order: lower-case the input, replace every non-word character
    with a space, tokenize with `word_tokenize`, drop tokens present in the
    module-level `stop_words` set, and stem what remains with the
    module-level `stemmer`.
    """
    # Lower-casing first so the stop-word lookup below sees lower-case tokens.
    normalized = re.sub(r'\W', ' ', text.lower())

    stems = []
    for token in word_tokenize(normalized):
        if token in stop_words:
            continue  # stop words carry no signal for the bag-of-words model
        stems.append(stemmer.stem(token))

    return " ".join(stems)

original_sentences = ['It is a lovely day, isn\'t it?', 
                      'The sun is shining brightly!', 
                      'I love the taste of lemonade on a sunny day.']
# Preprocess the sentences (lower-case, strip punctuation, drop stop words, stem)
preprocessed_sentences = [clean_text(sentence) for sentence in original_sentences]

# Initialize a CountVectorizer; no stop_words argument is passed because
# clean_text has already removed them
vectorizer = CountVectorizer()

# Fit transform the preprocessed sentences
X = vectorizer.fit_transform(preprocessed_sentences)

# random_state is fixed because the default TruncatedSVD solver is randomized; without
# a seed the printed matrix below is not guaranteed to be identical from run to run.
svd = TruncatedSVD(n_components=2, random_state=42)

# Apply TruncatedSVD on the Bag-of-Words matrix
X_reduced = svd.fit_transform(X)

print(f"Original BoW shape: {X.shape}")
print(f"Shape after TruncatedSVD: {X_reduced.shape}")
print(f"Reduced features matrix: \n{X_reduced}")

Original BoW shape: (3, 8)
Shape after TruncatedSVD: (3, 2)
Reduced features matrix: 
[[ 1.09544512e+00 -7.09511536e-18]
 [-6.70521600e-16  1.73205081e+00]
 [ 2.19089023e+00  7.63062702e-16]]


In [6]:
# Import the necessary libraries
import nltk
from nltk.corpus import movie_reviews
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

# Load the IMDB Movie Reviews dataset (quiet=True suppresses the download log)
nltk.download('movie_reviews', quiet=True)

# Load the first 150 reviews from the dataset.
# The list is named for what it holds (150 ids), so it no longer overwrites the
# 100-id list of the earlier cell under a misleading name.
first_150_reviewids = movie_reviews.fileids()[:150]
reviews = [movie_reviews.raw(fileid) for fileid in first_150_reviewids]

# Transform the movie reviews into a TF-IDF matrix
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(reviews)
print(f"Shape of the features matrix before dimensionality reduction: {tfidf_matrix.shape}\n")

# Initialize and apply TruncatedSVD to the TF-IDF matrix.
# random_state is pinned because the default solver is randomized; a fixed seed keeps
# the reduced features reproducible across kernel restarts.
svd = TruncatedSVD(n_components=100, random_state=42)
features = svd.fit_transform(tfidf_matrix)

# Print the shape of the features matrix after applying TruncatedSVD
# (the shape before reduction is printed above, right after vectorization)
print(f"Shape of the features matrix after dimensionality reduction: {features.shape}")

Shape of the features matrix before dimensionality reduction: (150, 11138)

Shape of the features matrix after dimensionality reduction: (150, 100)
