In [32]:
# Import TfidfVectorizer from the sklearn library to convert text data into TF-IDF features
from sklearn.feature_extraction.text import TfidfVectorizer

# Toy corpus: seven short documents used to illustrate how TF-IDF weights terms.
# Words such as "is", "announcing", "new" and "tomorrow" recur across documents, while most other words occur in one.
corpus = [
    "Thor eating pizza, Loki is eating pizza, Ironman ate pizza already",
    "Apple is announcing new iphone tomorrow",
    "Tesla is announcing new model-3 tomorrow",
    "Google is announcing new pixel-6 tomorrow",
    "Microsoft is announcing new surface tomorrow",
    "Amazon is announcing new eco-dot tomorrow",
    "I am eating biryani and you are eating pilav"
]

# Vectorizer with default settings; it keeps the fitted vocabulary and IDF weights after fitting
v = TfidfVectorizer()

# Learn the vocabulary/IDF weights from the corpus and encode the corpus in one step
# (transform_output has one row per document and one column per vocabulary term)
transform_output = v.fit_transform(corpus)

# Show the learned mapping from each term to its column index in the TF-IDF matrix
print(v.vocabulary_)

{'thor': 25, 'eating': 10, 'pizza': 22, 'loki': 16, 'is': 15, 'ironman': 14, 'ate': 7, 'already': 0, 'apple': 5, 'announcing': 4, 'new': 19, 'iphone': 13, 'tomorrow': 26, 'tesla': 24, 'model': 18, 'google': 12, 'pixel': 21, 'microsoft': 17, 'surface': 23, 'amazon': 2, 'eco': 11, 'dot': 9, 'am': 1, 'biryani': 8, 'and': 3, 'you': 27, 'are': 6, 'pilav': 20}


In [33]:
# Vocabulary terms as an array, ordered by their column index in the TF-IDF matrix
all_feature_name = v.get_feature_names_out()

# Leave the array as the cell's last expression so the notebook displays it
all_feature_name

array(['already', 'am', 'amazon', 'and', 'announcing', 'apple', 'are',
       'ate', 'biryani', 'dot', 'eating', 'eco', 'google', 'iphone',
       'ironman', 'is', 'loki', 'microsoft', 'model', 'new', 'pilav',
       'pixel', 'pizza', 'surface', 'tesla', 'thor', 'tomorrow', 'you'],
      dtype=object)

In [34]:
# Terms in column order, fetched again so this cell does not rely on the previous one
all_feature_name = v.get_feature_names_out()

# idf_ is laid out in the same column order as the feature names, so the two can be walked in lockstep
for term, idf_score in zip(all_feature_name, v.idf_):
    # One line per term: the term followed by its Inverse Document Frequency (IDF) score
    print(f"{term} {idf_score}")

already 2.386294361119891
am 2.386294361119891
amazon 2.386294361119891
and 2.386294361119891
announcing 1.2876820724517808
apple 2.386294361119891
are 2.386294361119891
ate 2.386294361119891
biryani 2.386294361119891
dot 2.386294361119891
eating 1.9808292530117262
eco 2.386294361119891
google 2.386294361119891
iphone 2.386294361119891
ironman 2.386294361119891
is 1.1335313926245225
loki 2.386294361119891
microsoft 2.386294361119891
model 2.386294361119891
new 1.2876820724517808
pilav 2.386294361119891
pixel 2.386294361119891
pizza 2.386294361119891
surface 2.386294361119891
tesla 2.386294361119891
thor 2.386294361119891
tomorrow 1.2876820724517808
you 2.386294361119891


In [35]:
# Show the first two raw documents; the next cell displays the TF-IDF rows for these same documents
corpus[:2]

['Thor eating pizza, Loki is eating pizza, Ironman ate pizza already',
 'Apple is announcing new iphone tomorrow']

In [36]:
# Convert the TF-IDF matrix to a dense array and show the rows for the first two documents
# (one column per vocabulary term; a 0 means the term does not occur in that document)
transform_output.toarray()[:2]

array([[0.24266547, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.24266547, 0.        , 0.        ,
        0.40286636, 0.        , 0.        , 0.        , 0.24266547,
        0.11527033, 0.24266547, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.72799642, 0.        , 0.        ,
        0.24266547, 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.30652086,
        0.5680354 , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.5680354 , 0.        ,
        0.26982671, 0.        , 0.        , 0.        , 0.30652086,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.30652086, 0.        ]])

In [38]:
# Import the pandas library for handling data in tabular format
import pandas as pd

# Load the e-commerce dataset into a DataFrame; the path is relative, so the CSV is looked up in the working directory
df = pd.read_csv("Ecommerce_data.csv")

# Print the shape (number of rows, number of columns) of the DataFrame
print(df.shape)

# Preview the first rows as the cell's displayed output
df.head()

(24000, 2)


Unnamed: 0,Text,label
0,Urban Ladder Eisner Low Back Study-Office Comp...,Household
1,"Contrast living Wooden Decorative Box,Painted ...",Household
2,IO Crest SY-PCI40010 PCI RAID Host Controller ...,Electronics
3,ISAKAA Baby Socks from Just Born to 8 Years- P...,Clothing & Accessories
4,Indira Designer Women's Art Mysore Silk Saree ...,Clothing & Accessories


In [39]:
# Number of rows per category in the 'label' column, to check class balance
df["label"].value_counts()

label
Household                 6000
Electronics               6000
Clothing & Accessories    6000
Books                     6000
Name: count, dtype: int64

In [40]:
# Encode each category name as an integer class id in a new 'label_num' column
df["label_num"] = df["label"].map(
    {
        "Household": 0,
        "Electronics": 1,
        "Clothing & Accessories": 2,
        "Books": 3,
    }
)

# Preview the DataFrame with the added 'label_num' column
df.head()

Unnamed: 0,Text,label,label_num
0,Urban Ladder Eisner Low Back Study-Office Comp...,Household,0
1,"Contrast living Wooden Decorative Box,Painted ...",Household,0
2,IO Crest SY-PCI40010 PCI RAID Host Controller ...,Electronics,1
3,ISAKAA Baby Socks from Just Born to 8 Years- P...,Clothing & Accessories,2
4,Indira Designer Women's Art Mysore Silk Saree ...,Clothing & Accessories,2


In [44]:
# Import train_test_split for splitting the data into training and testing sets
from sklearn.model_selection import train_test_split

# Stratified hold-out split of the raw text and its numeric labels
X_train, X_test, y_train, y_test = train_test_split(
    df["Text"],                # input features (raw text)
    df["label_num"],           # target labels (integer class ids)
    test_size=0.2,             # hold out 20% of the rows for testing
    random_state=2022,         # fixed seed so the split is the same on every run
    stratify=df["label_num"],  # keep the label distribution equal in both splits
)

# Report how many samples ended up in each split
print(f"Shape of X_train: {X_train.shape}")
print(f"Shape of X_test: {X_test.shape}")

# Per-class sample counts in the training split
y_train.value_counts()

Shape of X_train: (19200,)
Shape of X_test: (4800,)


label_num
0    4800
2    4800
3    4800
1    4800
Name: count, dtype: int64

In [45]:
# Per-class sample counts in the testing split, to confirm the stratified split kept the classes balanced
y_test.value_counts()

label_num
0    1200
2    1200
3    1200
1    1200
Name: count, dtype: int64

In [46]:
# Import KNeighborsClassifier for classification using the k-nearest neighbors algorithm
from sklearn.neighbors import KNeighborsClassifier

# Import Pipeline to streamline the creation of machine learning workflows
from sklearn.pipeline import Pipeline

# Import classification_report to evaluate the performance of the model
from sklearn.metrics import classification_report

# Two-step pipeline: TF-IDF features feeding a k-nearest-neighbours classifier (both with default settings)
clf = Pipeline(
    steps=[
        ("vectorizer_tfidf", TfidfVectorizer()),
        ("KNN", KNeighborsClassifier()),
    ]
)

# Train on the training split, then predict labels for the held-out testing split
y_pred = clf.fit(X_train, y_train).predict(X_test)

# Per-class precision / recall / F1 on the testing split
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.96      0.95      1200
           1       0.96      0.97      0.97      1200
           2       0.97      0.98      0.98      1200
           3       0.98      0.95      0.96      1200

    accuracy                           0.97      4800
   macro avg       0.97      0.97      0.97      4800
weighted avg       0.97      0.97      0.97      4800



In [49]:
# Peek at the first 5 testing texts; the index values shown are the rows' original index labels from df
X_test[:5]

20706    Lal Haveli Designer Handmade Patchwork Decorat...
19008    tirupur fashion biz Girls and Kids Solid Cotto...
14810    Modern Linguistics: An Introduction About The ...
2451     AmazonBasics Apple Certified 30-Pin to USB Cab...
6296     The Marine Corps Martial Arts Program: The Com...
Name: Text, dtype: object

In [50]:
# True class ids for the same first 5 testing samples, for side-by-side comparison with the predictions
y_test[:5]

20706    0
19008    2
14810    3
2451     1
6296     3
Name: label_num, dtype: int64

In [51]:
# Predicted class ids for the same first 5 testing samples.
# y_pred was last assigned by the KNN pipeline cell above; later cells overwrite it with other models' predictions.
y_pred[:5]

array([0, 2, 3, 1, 3])

In [52]:
# Import MultinomialNB for Naive Bayes classification
from sklearn.naive_bayes import MultinomialNB

# Two-step pipeline: TF-IDF features feeding a Multinomial Naive Bayes classifier (both with default settings)
clf = Pipeline(
    steps=[
        ("vectorizer_tfidf", TfidfVectorizer()),
        ("Multi NB", MultinomialNB()),
    ]
)

# Train on the training split, then predict labels for the held-out testing split
y_pred = clf.fit(X_train, y_train).predict(X_test)

# Per-class precision / recall / F1 on the testing split
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.93      0.96      0.95      1200
           1       0.96      0.96      0.96      1200
           2       0.97      0.98      0.98      1200
           3       0.98      0.93      0.95      1200

    accuracy                           0.96      4800
   macro avg       0.96      0.96      0.96      4800
weighted avg       0.96      0.96      0.96      4800



In [53]:
# Import RandomForestClassifier for classification using the Random Forest algorithm
from sklearn.ensemble import RandomForestClassifier

# Create a pipeline with TF-IDF vectorization and Random Forest classification.
# The forest is seeded (same seed as the train/test split) because it samples rows and features
# randomly while building trees; without a seed the report below changes from run to run.
clf = Pipeline([
    ("vectorizer_tfidf", TfidfVectorizer()),                       # Step 1: Convert text data to TF-IDF features
    ("Random Forest", RandomForestClassifier(random_state=2022)),  # Step 2: Apply Random Forest classification
])

# Fit the pipeline on the training data
clf.fit(X_train, y_train)

# Predict the labels for the testing data
y_pred = clf.predict(X_test)

# Print the classification report to evaluate the performance of the model
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.96      0.96      0.96      1200
           1       0.97      0.97      0.97      1200
           2       0.98      0.98      0.98      1200
           3       0.98      0.97      0.97      1200

    accuracy                           0.97      4800
   macro avg       0.97      0.97      0.97      4800
weighted avg       0.97      0.97      0.97      4800



In [23]:
from sklearn.naive_bayes import MultinomialNB

# NOTE(review): this cell repeats the TF-IDF + Multinomial Naive Bayes experiment from the earlier
# Multinomial NB cell and carries an out-of-sequence execution count; it looks like a leftover —
# confirm and consider removing it.
clf = Pipeline(
    steps=[
        ("vectorizer_tfidf", TfidfVectorizer()),
        ("Multi NB", MultinomialNB()),
    ]
)

# Train on the training split, then predict labels for the held-out testing split
y_pred = clf.fit(X_train, y_train).predict(X_test)

# Per-class precision / recall / F1 on the testing split
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.96      0.95      1200
           1       0.96      0.96      0.96      1200
           2       0.97      0.98      0.98      1200
           3       0.98      0.93      0.95      1200

    accuracy                           0.96      4800
   macro avg       0.96      0.96      0.96      4800
weighted avg       0.96      0.96      0.96      4800



In [54]:
# Import the spaCy library for natural language processing
import spacy

# Load the small English language model in spaCy and create an NLP object
nlp = spacy.load("en_core_web_sm")

# Define a function to preprocess text by removing stop words and punctuation, and lemmatizing the tokens
def preprocess(text):
    """Return `text` reduced to the lemmas of its content tokens.

    The text is run through the module-level spaCy pipeline `nlp`. Tokens that
    spaCy flags as stop words or punctuation are dropped; the lemma of every
    remaining token is kept, in the original order.

    Args:
        text: The raw string to clean.

    Returns:
        The kept lemmas joined by single spaces (an empty string when no
        token survives the filter).
    """
    kept_lemmas = (
        token.lemma_
        for token in nlp(text)
        if not (token.is_stop or token.is_punct)
    )
    return " ".join(kept_lemmas)

In [58]:
# Run preprocess() on every row of the 'Text' column and store the cleaned string in a new 'preprocessed_txt' column.
# NOTE(review): this invokes the spaCy pipeline once per row, so it is likely the slowest cell here — consider timing or caching it.
df["preprocessed_txt"] = df["Text"].apply(preprocess)

In [63]:
# Preview the DataFrame, which now includes the 'preprocessed_txt' column next to the original 'Text'
df.head()

Unnamed: 0,Text,label,label_num,preprocessed_txt
0,Urban Ladder Eisner Low Back Study-Office Comp...,Household,0,Urban Ladder Eisner low Study Office Computer ...
1,"Contrast living Wooden Decorative Box,Painted ...",Household,0,contrast live Wooden Decorative Box Painted Bo...
2,IO Crest SY-PCI40010 PCI RAID Host Controller ...,Electronics,1,IO Crest SY PCI40010 PCI raid Host Controller ...
3,ISAKAA Baby Socks from Just Born to 8 Years- P...,Clothing & Accessories,2,ISAKAA Baby Socks bear 8 Years- Pack 4 6 8 12 ...
4,Indira Designer Women's Art Mysore Silk Saree ...,Clothing & Accessories,2,Indira Designer Women Art Mysore Silk Saree Bl...


In [64]:
# Original text of the row with index label 0, for comparison with its preprocessed form in the next cell
df.Text[0]

'Urban Ladder Eisner Low Back Study-Office Computer Chair(Black) A study in simple. The Eisner study chair has a firm foam cushion, which makes long hours at your desk comfortable. The flexible meshed back is designed for air-circulation and support when you lean back. The curved arms provide ergonomic forearm support. Adjust the height using the gas lift to find that comfortable position and the nylon castors make it easy to move around your space. Chrome legs refer to the images for dimension details any assembly required will be done by the UL team at the time of delivery indoor use only.'

In [65]:
# Preprocessed text of the same row (index label 0): stop words and punctuation removed, remaining tokens lemmatized
df.preprocessed_txt[0]

'Urban Ladder Eisner low Study Office Computer Chair(Black study simple Eisner study chair firm foam cushion make long hour desk comfortable flexible mesh design air circulation support lean curved arm provide ergonomic forearm support adjust height gas lift find comfortable position nylon castor easy space chrome leg refer image dimension detail assembly require UL team time delivery indoor use'

In [66]:
# Split the data into training and testing sets using the preprocessed text and numerical labels.
# The seed and stratification match the earlier raw-text split so the two experiments are comparable.
# Note: this reassigns X_train / X_test / y_train / y_test, replacing the raw-text split.
X_train, X_test, y_train, y_test = train_test_split(
    df.preprocessed_txt,  # Input features (preprocessed text)
    df.label_num,         # Target labels (numerical labels)
    test_size=0.2,        # 20% of the data will be used for testing
    random_state=2022,    # Set random seed for reproducibility
    stratify=df.label_num # Ensure the split maintains the distribution of labels
)

# Create a pipeline with TF-IDF vectorization and Random Forest classification using the preprocessed text.
# The forest is seeded because it samples rows and features randomly while building trees;
# without a seed the report below changes from run to run.
clf = Pipeline([
    ("vectorizer_tfidf", TfidfVectorizer()),                       # Step 1: Convert preprocessed text to TF-IDF features
    ("Random Forest", RandomForestClassifier(random_state=2022)),  # Step 2: Apply Random Forest classification
])

# Fit the pipeline on the training data
clf.fit(X_train, y_train)

# Predict the labels for the testing data
y_pred = clf.predict(X_test)

# Print the classification report to evaluate the performance of the model
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.97      0.97      0.97      1200
           1       0.97      0.98      0.98      1200
           2       0.98      0.99      0.99      1200
           3       0.98      0.98      0.98      1200

    accuracy                           0.98      4800
   macro avg       0.98      0.98      0.98      4800
weighted avg       0.98      0.98      0.98      4800

