In [8]:
import opendatasets as od
import pandas as pd
import os
import re
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from gensim.models import Word2Vec
import json

# Fetch the NLTK resources used further down: the stop-word lists read by
# stopwords.words('english') and the WordNet data used by WordNetLemmatizer.
nltk.download('stopwords')
nltk.download('wordnet')
# NOTE(review): omw-1.4 is presumably needed by the WordNet lemmatizer on some
# NLTK versions — confirm before removing.
nltk.download('omw-1.4')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [2]:
def load_dataset(
    url="https://www.kaggle.com/datasets/saurabhshahane/ecommerce-text-classification",
    csv_path="ecommerce-text-classification/ecommerceDataset.csv",
    header="infer",
):
    """Download (if needed) and load the e-commerce text classification data.

    Parameters
    ----------
    url : str
        Kaggle dataset URL handed to ``opendatasets.download``.
    csv_path : str
        Location of the CSV file once the dataset has been downloaded.
    header : int, list of int, None or "infer"
        Forwarded to ``pandas.read_csv``. The default matches pandas' own
        default, i.e. the first row is treated as column names.
        NOTE(review): if the CSV has no header row, the first record is
        consumed as column names and lost — confirm against the file and pass
        ``header=None`` if so.

    Returns
    -------
    pandas.DataFrame
        The first two columns of the CSV, renamed to ``category`` and
        ``description``, with rows containing a missing value in either column
        removed and both columns converted to ``str``. The original row index
        is kept (it is not reset after dropping rows).
    """
    # Only hit Kaggle (which may prompt for credentials) when the file is not
    # already on disk, so re-running the notebook stays cheap and offline-safe.
    if not os.path.exists(csv_path):
        od.download(url)

    raw = pd.read_csv(csv_path, header=header)

    # Work on an explicit copy: renaming and filtering a bare iloc slice is the
    # pattern that triggers pandas' SettingWithCopyWarning.
    frame = raw.iloc[:, :2].copy()
    frame.columns = ["category", "description"]
    frame = frame.dropna()
    return frame.astype({"category": str, "description": str})


def preprocess_text(df):
    """Add a normalised ``clean_description`` column to ``df``.

    Each value of ``df['description']`` is lower-cased, stripped of every
    character that is not ``a``-``z`` or whitespace (the characters are deleted,
    not replaced by a space, so ``wi-fi`` becomes ``wifi``), split on
    whitespace, filtered against NLTK's English stop-word list, lemmatised with
    ``WordNetLemmatizer`` and re-joined with single spaces.

    The column is written onto the frame that was passed in, and that same
    frame object is returned.
    """
    english_stopwords = set(stopwords.words('english'))
    wordnet_lemmatizer = WordNetLemmatizer()

    def clean_text(text):
        # Lower-case first so the a-z character class also keeps former capitals.
        letters_only = re.sub(r'[^a-z\s]', '', text.lower())
        kept_tokens = (
            token for token in letters_only.split()
            if token not in english_stopwords
        )
        return ' '.join(wordnet_lemmatizer.lemmatize(token) for token in kept_tokens)

    df['clean_description'] = df['description'].apply(clean_text)
    return df
import numpy as np

def compute_tfidf(corpus):
    """Build a dense TF-IDF matrix from whitespace-tokenised documents.

    Parameters
    ----------
    corpus : iterable of str
        One string per document; each is tokenised with ``str.split()``.

    Returns
    -------
    tuple (numpy.ndarray, list of str)
        ``tfidf`` with shape ``(n_documents, len(vocab))`` and ``vocab``, the
        alphabetically sorted unique tokens; column ``i`` of the matrix
        belongs to ``vocab[i]``.

    Notes
    -----
    Term frequency is ``count / document_length`` and inverse document
    frequency is ``log(N / (document_frequency + 1))``. With that denominator a
    token found in every document gets a negative weight, and a token found in
    all but one document gets a weight of zero.
    """
    tokenized = [text.split() for text in corpus]
    vocab = sorted({token for tokens in tokenized for token in tokens})
    column_of = {token: col for col, token in enumerate(vocab)}
    n_docs = len(tokenized)

    term_freq = np.zeros((n_docs, len(vocab)))
    doc_freq = np.zeros(len(vocab))

    for row, tokens in enumerate(tokenized):
        if not tokens:
            # Empty document: nothing to count, its row stays all zeros.
            continue

        counts = {}
        for token in tokens:
            counts[token] = counts.get(token, 0) + 1

        # Keys of `counts` are unique, so each column appears once per document
        # and the fancy-indexed updates below touch every entry exactly once.
        cols = [column_of[token] for token in counts]
        occurrences = np.fromiter(counts.values(), dtype=float, count=len(counts))
        term_freq[row, cols] = occurrences / len(tokens)
        doc_freq[cols] += 1

    inverse_doc_freq = np.log(n_docs / (doc_freq + 1))
    return term_freq * inverse_doc_freq, vocab

def train_word2vec(corpus, vector_size=75, window=5, min_count=1, workers=4):
    """Train a gensim ``Word2Vec`` model on whitespace-tokenised documents.

    Parameters
    ----------
    corpus : iterable of str
        One string per document; each is tokenised with ``str.split()``.
    vector_size, window, min_count, workers
        Passed straight through to ``Word2Vec`` under the same keyword names.

    Returns
    -------
    gensim.models.Word2Vec
        The model object returned by the ``Word2Vec`` constructor.
    """
    tokenized_docs = [text.split() for text in corpus]
    return Word2Vec(
        sentences=tokenized_docs,
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        workers=workers,
    )



In [3]:
# Fetch the Kaggle e-commerce dataset (may prompt for Kaggle credentials) and
# load it as a two-column frame: category, description.
df = load_dataset()

Please provide your Kaggle credentials to download this dataset. Learn more: http://bit.ly/kaggle-creds
Your Kaggle username: alirezasaffar
Your Kaggle Key: ··········
Dataset URL: https://www.kaggle.com/datasets/saurabhshahane/ecommerce-text-classification
Downloading ecommerce-text-classification.zip to ./ecommerce-text-classification


100%|██████████| 7.86M/7.86M [00:00<00:00, 1.02GB/s]







In [4]:
# Adds a `clean_description` column. preprocess_text writes into the frame it is
# given and returns that same object, so `new_df` and `df` are the same DataFrame.
new_df = preprocess_text(df)

In [5]:
# Persist the cleaned frame (all columns, without the index) for later reuse.
new_df.to_csv("cleaned_dataset.csv", index=False)

In [6]:
# Both feature builders expect an iterable of cleaned strings. Iterating a
# DataFrame yields its column labels, so passing `new_df` itself would build the
# features from the three column names instead of the product descriptions.
clean_corpus = new_df['clean_description']

# NOTE(review): compute_tfidf returns a dense (n_documents x vocabulary) float
# array; on the full corpus this can be very large — confirm it fits in memory.
tfidf, vocab = compute_tfidf(clean_corpus)
w2v_model = train_word2vec(clean_corpus, vector_size=75)

# `model` used to be a second, separate training run; keep the name as an alias
# so anything that still refers to it keeps working without training twice.
model = w2v_model

In [9]:
# Save the trained Word2Vec model to disk (reload it with Word2Vec.load).
w2v_model.save("word2vec.model")


# Save the TF-IDF matrix and its vocabulary side by side: vocab[i] is the term
# that column i of the matrix refers to, so the two files belong together.
np.save('tfidf.npy', tfidf)
with open('vocab.json', 'w') as f:
    json.dump(vocab, f)

In [None]:
# How to restore the saved artifacts:
#from gensim.models import Word2Vec
# w2v_model = Word2Vec.load("word2vec.model")

# tfidf_loaded = np.load('tfidf.npy')
# with open('vocab.json', 'r') as f:
#     vocab_loaded = json.load(f)