In [1]:
import pandas as pd
import numpy as np
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
import warnings

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')

In [3]:
pip install pandas numpy scikit-learn nltk

Defaulting to user installation because normal site-packages is not writeableNote: you may need to restart the kernel to use updated packages.



In [4]:
# Download necessary NLTK resources
# These downloads are required for tokenization, stopwords, and lemmatization
print("Step 0: Downloading NLTK resources (This runs automatically)...")
try:
    nltk.download('punkt', quiet=True)
    nltk.download('stopwords', quiet=True)
    nltk.download('wordnet', quiet=True)
    print("NLTK downloads complete.")
except Exception as e:
    print(f"Error during NLTK download: {e}. Please ensure you have an internet connection.")

Step 0: Downloading NLTK resources (This runs automatically)...
NLTK downloads complete.


In [11]:
# --- Data Loading ---
print("\nStep 1: Loading Dataset ('data.csv')")
def load_data_with_encoding(file_path):
    """Attempts to load a CSV file using multiple common encodings to handle UnicodeDecodeError."""
    encodings_to_try = ['utf-8', 'latin-1', 'cp1252']
    for encoding in encodings_to_try:
        try:
            # Attempt to load the user's specified CSV file with the current encoding
            df = pd.read_csv(file_path, encoding=encoding)
            print(f"Successfully loaded data from '{file_path}' using encoding: '{encoding}'.")
            return df
        except UnicodeDecodeError:
            print(f"Failed to load with encoding: '{encoding}'. Trying next...")
        except FileNotFoundError:
            # Re-raise FileNotFoundError if the file itself isn't found
            raise FileNotFoundError(f"The file '{file_path}' was not found.")
    
    # If all attempts fail, return simulated data
    print("WARNING: All common encoding attempts failed or 'data.csv' not found. Using simulated data for demonstration.")
    data = {
        'InvoiceNo': [536365, 536365, 536366, 536367, 536367, 536368],
        'StockCode': ['85123A', '71053', '22632', '22745', '22748', '22310'],
        'Description': [
            'WHITE HANGING HEART T-LIGHT HOLDER',
            'WHITE METAL LANTERN',
            'HAND WARMER RED POLKA DOT',
            "POPPY'S PATCHWORK BAG",
            "POPPY'S PATCHWORK KIT",
            'IVORY KNITTED MUG COVER'
        ],
        'Quantity': [6, 6, 6, 6, 6, 6],
        'UnitPrice': [2.55, 3.39, 1.85, 2.10, 2.10, 1.65],
        'CustomerID': [17850, 17850, 17850, 13047, 13047, 13047],
        'Country': ['United Kingdom'] * 6
    }
    return pd.DataFrame(data)
# Call the robust loading function
try:
    df = load_data_with_encoding('data.csv')
except FileNotFoundError as e:
    print(e)
    # Load simulated data if file is not found (and loading function didn't already handle it)
    df = load_data_with_encoding(None) 


# Ensure the 'Description' column is used for NLP operations
TEXT_COLUMN = 'Description'
# Drop rows where Description is missing, as we can't process them
df.dropna(subset=[TEXT_COLUMN], inplace=True)

print(f"Dataset loaded with {df.shape[0]} rows. Using column '{TEXT_COLUMN}' for NLP.")
print("-" * 80)


Step 1: Loading Dataset ('data.csv')
Failed to load with encoding: 'utf-8'. Trying next...
Successfully loaded data from 'data.csv' using encoding: 'latin-1'.
Dataset loaded with 540455 rows. Using column 'Description' for NLP.
--------------------------------------------------------------------------------


In [15]:
import nltk

In [None]:
# =========================================================================
# I. CLASS TASK: TOKENIZATION AND WORD EMBEDDINGS
# =========================================================================

In [20]:
# --- Step 2: Tokenization ---
print("\nStep 2: Tokenization (Breaking text into individual words/tokens)")
# Tokenization is the fundamental NLP process of splitting a sequence of text into smaller units 
# called tokens (usually words or punctuation). This converts continuous text into a list 
# of discrete items for machine processing.

def perform_tokenization(text):
    """Uses NLTK's word_tokenize to split text after converting to lowercase."""
    return word_tokenize(str(text).lower())

# Apply tokenization to the 'Description' column
try:
    df['tokens'] = df[TEXT_COLUMN].apply(perform_tokenization)
except LookupError as e:
    print("\n-------------------------------------------------------------")
    print("TOKENIZATION FAILED DUE TO MISSING NLTK RESOURCE.")
    print("Please manually run the following in a Python terminal or notebook cell:")
    print(">>> import nltk")
    print(">>> nltk.download('punkt')")
    print(f"\nOriginal Error: {e}")
    print("-------------------------------------------------------------")
    # If tokenization fails, we stop the notebook here to prevent further errors.
    # In a true notebook, you might handle this by assigning an empty list.
    df['tokens'] = [[]] * len(df) # Fallback to prevent immediate crash, though subsequent steps will be affected.
    
sample_text = df[TEXT_COLUMN].iloc[0] if len(df) > 0 else "Sample text missing"
sample_tokens = df['tokens'].iloc[0] if len(df) > 0 and isinstance(df['tokens'].iloc[0], list) else ["tokens", "missing"]

print(f"Original Text Sample:\n'{sample_text}'")
print(f"\nResulting Tokens:\n{sample_tokens}")
print("-" * 50)



Step 2: Tokenization (Breaking text into individual words/tokens)
Original Text Sample:
'WHITE HANGING HEART T-LIGHT HOLDER'

Resulting Tokens:
['white', 'hanging', 'heart', 't-light', 'holder']
--------------------------------------------------


In [21]:
# --- Step 3: Word Embeddings (Conceptual Demonstration) ---
print("\nStep 3: Word Embeddings (Conceptual Explanation and Basic Vectorization)")
# Word Embeddings are dense, low-dimensional vector representations of words. 
# They are designed to capture semantic and syntactic relationships, meaning words with similar 
# meanings are closer together in the vector space (e.g., 'king' and 'queen').

print("\n--- Conceptual Explanation ---")
print("True Word Embeddings (like Word2Vec, GloVe, or BERT) require large pre-trained models,")
print("which cannot be loaded here. The following demonstrates the idea of turning words into vectors")
print("using a simple frequency method, which is the precursor to modern embeddings.")

# Create a vocabulary from all tokens
all_tokens = [token for sublist in df['tokens'] for token in sublist]
vocab = list(set(all_tokens))

# Create a simple frequency vector for the first sample text
word_counts = Counter(sample_tokens)
sample_vector = [word_counts.get(word, 0) for word in vocab]

print("\n--- Basic Vectorization Illustration (Frequency Encoding) ---")
print(f"Vocabulary Size (Unique words in sample data): {len(vocab)}")
print(f"Word: '{sample_tokens[1]}' -> Frequency in sample text: {word_counts[sample_tokens[1]]}")
print(f"Vector Snippet for Sample Text (showing first 10 dimensions):\n{sample_vector[:10]}")
print("-" * 80)



Step 3: Word Embeddings (Conceptual Explanation and Basic Vectorization)

--- Conceptual Explanation ---
True Word Embeddings (like Word2Vec, GloVe, or BERT) require large pre-trained models,
which cannot be loaded here. The following demonstrates the idea of turning words into vectors
using a simple frequency method, which is the precursor to modern embeddings.

--- Basic Vectorization Illustration (Frequency Encoding) ---
Vocabulary Size (Unique words in sample data): 2449
Word: 'hanging' -> Frequency in sample text: 1
Vector Snippet for Sample Text (showing first 10 dimensions):
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
--------------------------------------------------------------------------------


In [None]:
# =========================================================================
# II. ASSIGNMENT 11: NLP PREPROCESSING AND TF-IDF
# =========================================================================

In [22]:
# Instantiate tools for preprocessing
lemmatizer = WordNetLemmatizer()
ENGLISH_STOPWORDS = set(stopwords.words('english'))

def preprocess_text_for_tfidf(text):
    """
    Applies the full preprocessing pipeline: Cleaning, Tokenization, Stopword Removal, and Lemmatization.
    """
    text = str(text).lower()
    
    # --- Step 4a: Cleaning ---
    # Removal of punctuation, numbers, and non-alphabetic characters. This ensures only words remain.
    text = re.sub(r'[^a-z\s]', '', text)

    # --- Step 4b: Tokenization and Stopword Removal ---
    # We re-tokenize after cleaning. Stopwords (common words like 'a', 'the', 'is') are removed 
    # because they don't carry much meaning for classification/regression.
    # We include a try-except here as well, to catch the 'punkt' error if it wasn't fixed.
    try:
        tokens = word_tokenize(text)
    except LookupError:
        tokens = text.split() # Fallback to simple split if NLTK model is missing

    filtered_tokens = [w for w in tokens if w not in ENGLISH_STOPWORDS]

    # --- Step 4c: Lemmatization ---
    # Lemmatization reduces a word to its base or dictionary form (e.g., 'running' -> 'run', 'feet' -> 'foot').
    # This helps in reducing the vocabulary size and ensures different forms of the same word are treated equally.
    lemmas = [lemmatizer.lemmatize(w) for w in filtered_tokens]

    return " ".join(lemmas)


# --- Step 4: Apply Full Preprocessing Pipeline ---
print("\nStep 4: Applying Full Preprocessing Pipeline (Cleaning, Stopword Removal, Lemmatization)")
df['processed_text'] = df[TEXT_COLUMN].apply(preprocess_text_for_tfidf)

print(f"Original Text Sample:\n'{sample_text}'")
print(f"\nProcessed Text Sample (ready for TF-IDF):\n'{df['processed_text'].iloc[0]}'")
print("-" * 50)



Step 4: Applying Full Preprocessing Pipeline (Cleaning, Stopword Removal, Lemmatization)
Original Text Sample:
'WHITE HANGING HEART T-LIGHT HOLDER'

Processed Text Sample (ready for TF-IDF):
'white hanging heart tlight holder'
--------------------------------------------------


In [23]:
# --- Step 5: TF-IDF Vectorization ---
print("\nStep 5: Applying TF-IDF Vectorization")
# TF-IDF (Term Frequency-Inverse Document Frequency) assigns a weight to each word based on two factors:
# 1. Term Frequency (TF): How often the word appears in the current document.
# 2. Inverse Document Frequency (IDF): How rare the word is across the entire dataset.
# Words that are frequent in a specific description but rare overall receive a high weight.
# This results in a matrix where each row is a document and each column is a feature (word), with values being the TF-IDF scores. 

# Initialize the TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=100)

# Fit the vectorizer on the processed text (learning the vocabulary and calculating IDF values)
tfidf_matrix = tfidf_vectorizer.fit_transform(df['processed_text'])

# Convert the resulting sparse matrix to a DataFrame for inspection
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

print(f"TF-IDF Matrix Shape: {tfidf_df.shape} (Rows: documents, Columns: features/words)")
print("\nFirst 5 rows of the resulting TF-IDF feature matrix:")
print(tfidf_df.head())
print("-" * 50)


Step 5: Applying TF-IDF Vectorization
TF-IDF Matrix Shape: (540455, 100) (Rows: documents, Columns: features/words)

First 5 rows of the resulting TF-IDF feature matrix:
   antique  apple  assorted  bag  bird  birthday  black  blue    bottle  bowl  \
0      0.0    0.0       0.0  0.0   0.0       0.0    0.0   0.0  0.000000   0.0   
1      0.0    0.0       0.0  0.0   0.0       0.0    0.0   0.0  0.000000   0.0   
2      0.0    0.0       0.0  0.0   0.0       0.0    0.0   0.0  0.000000   0.0   
3      0.0    0.0       0.0  0.0   0.0       0.0    0.0   0.0  0.470322   0.0   
4      0.0    0.0       0.0  0.0   0.0       0.0    0.0   0.0  0.000000   0.0   

   ...  vintage  wall     water     white  wicker  wood  wooden  woodland  \
0  ...      0.0   0.0  0.000000  0.444535     0.0   0.0     0.0       0.0   
1  ...      0.0   0.0  0.000000  0.707222     0.0   0.0     0.0       0.0   
2  ...      0.0   0.0  0.000000  0.000000     0.0   0.0     0.0       0.0   
3  ...      0.0   0.0  0.490409  0