In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
import string
from bs4 import BeautifulSoup
import re

In [2]:
df = pd.read_csv('../Data/ecommerce.csv', names = ['label','desc'])

In [3]:
df

Unnamed: 0,label,desc
0,Household,Paper Plane Design Framed Wall Hanging Motivat...
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...
3,Household,"SAF Flower Print Framed Painting (Synthetic, 1..."
4,Household,Incredible Gifts India Wooden Happy Birthday U...
...,...,...
50420,Electronics,Strontium MicroSD Class 10 8GB Memory Card (Bl...
50421,Electronics,CrossBeats Wave Waterproof Bluetooth Wireless ...
50422,Electronics,Karbonn Titanium Wind W4 (White) Karbonn Titan...
50423,Electronics,"Samsung Guru FM Plus (SM-B110E/D, Black) Colou..."


In [4]:
df['label'].value_counts()

Household                 19313
Books                     11820
Electronics               10621
Clothing & Accessories     8671
Name: label, dtype: int64

In [5]:
df.isna().sum()

label    0
desc     1
dtype: int64

In [6]:
df['desc'].iloc[11]

'Paper Plane Design Starry Night Vangoh Wall Art Canvas Painting. Large Size Rolled Canvas Art Print (36" X 48") We use only the most modern and efficient printing technology on our canvases, with only the best and original inks and precision Epson< Roland and HP printers. This innovative HD printing technique results in durable and spectacular looking prints of the highest quality that last a lifetime. We print solely with top-notch inks, to achieve brilliant and true colours. Due to their high level of UV Resistance, our Canvas prints retain their beautiful colours for many years. Our canvases contain high levels of white, to ensure that the colours of your original image are reproduced exactly, with brilliant tones. Add colour and style to your living space with this digitally printed canvas painting. Some gifts are for pleasure and some for eternal bliss.So bring home this elegant Canvas print that is lushed with Rich colors that makes it nothing but sheer elegance to be gifted to 

In [7]:
# Choose a specific row to find duplicates (e.g., row 7)
row_to_check = df.iloc[50424]

# Find rows that are duplicates of the selected row
duplicate_rows = df[(df['label'] == row_to_check['label']) & (df['desc'] == row_to_check['desc'])]

# Display the rows that are duplicates of the selected row
if not duplicate_rows.empty:
    print(f"Rows that are duplicates of row {row_to_check.name}:")
    print(duplicate_rows)
else:
    print(f"No duplicates found for row {row_to_check.name}")

Rows that are duplicates of row 50424:
             label                              desc
50400  Electronics  Micromax Canvas Win W121 (White)
50424  Electronics  Micromax Canvas Win W121 (White)


In [8]:
df = df.drop_duplicates()

In [9]:
df

Unnamed: 0,label,desc
0,Household,Paper Plane Design Framed Wall Hanging Motivat...
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...
3,Household,"SAF Flower Print Framed Painting (Synthetic, 1..."
4,Household,Incredible Gifts India Wooden Happy Birthday U...
...,...,...
50402,Electronics,Micromax Bharat 5 Plus Zero impact on visual d...
50403,Electronics,Microsoft Lumia 550 8GB 4G Black Microsoft lum...
50407,Electronics,"Microsoft Lumia 535 (Black, 8GB) Colour:Black ..."
50408,Electronics,Karbonn Titanium Wind W4 (White) Karbonn Titan...


In [10]:
df.duplicated().sum()

0

In [11]:
df.reset_index(inplace = True)

In [12]:
df = df.drop('index', axis = 1)

In [13]:
df['label'].value_counts()

Household                 10564
Books                      6256
Clothing & Accessories     5675
Electronics                5308
Name: label, dtype: int64

In [14]:
df['desc'].iloc[5]

'Pitaara Box Romantic Venice Canvas Painting 6mm Thick Mdf Frame 21.1 X 14Inch Enclosure Material:MDF Mount Frame                                                                                \xa0|\xa0                           Size:21.1inch x 14inch (53.6cms x 35.6cms)   SIZE: 21.1 inch x 14.0 inch (53.6 cms x 35.6 cms)  Enhance the beauty of your room walls with this breathtaking digital printed artwork. Our high-end printing technology captures every detail of the image in our prints on enhanced matte painting canvas, ensuring rich and lively colours. This wall art panel is mounted on MDF and ready to hang on walls. Beautiful interior home d©cor artwork gifts for Living, Dining Room, Outdoor, Gallery, Hotels, Restaurants, Office, Reception, Kitchen Area, Balcony and Bathroom.  Pitaara Box offers an exclusive collection of thousands of artworks, digital paintings, canvas prints, wall posters, and other wall decor products for your home, office, and surroundings. We provide a never-e

In [15]:
df['desc'] = df['desc'].str.replace(r'\xa0\|\xa0', '',regex = True)


In [16]:
df.isna().sum()

label    0
desc     1
dtype: int64

In [17]:
df = df.dropna()

In [18]:
def insert_space_after_full_stop(text):
    return text.replace('.', '. ')
df['desc'] = df['desc'].apply(insert_space_after_full_stop)

In [19]:
#Removing HTML tags from the whole dataset
def remove_html_tags(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

df['desc'] = df['desc'].apply(remove_html_tags)

  This is separate from the ipykernel package so we can avoid doing imports until


In [20]:
def remove_urls(text):
    # Define a regular expression pattern to match URLs
    url_pattern = r'https?://\S+|www\.\S+'
    return re.sub(url_pattern, '', text)

df['desc']= df['desc'].apply(remove_urls)

In [21]:
df

Unnamed: 0,label,desc
0,Household,Paper Plane Design Framed Wall Hanging Motivat...
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...
3,Household,"SAF Flower Print Framed Painting (Synthetic, 1..."
4,Household,Incredible Gifts India Wooden Happy Birthday U...
...,...,...
27798,Electronics,Micromax Bharat 5 Plus Zero impact on visual d...
27799,Electronics,Microsoft Lumia 550 8GB 4G Black Microsoft lum...
27800,Electronics,"Microsoft Lumia 535 (Black, 8GB) Colour:Black ..."
27801,Electronics,Karbonn Titanium Wind W4 (White) Karbonn Titan...


In [22]:
def lemmatization(text):
    lm= WordNetLemmatizer()
    text = ' '.join([lm.lemmatize(word, pos='v') for word in text.split()])
    return text
df['desc']= df['desc'].apply(lemmatization)
df.head()

Unnamed: 0,label,desc
0,Household,Paper Plane Design Framed Wall Hanging Motivat...
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...
3,Household,"SAF Flower Print Framed Painting (Synthetic, 1..."
4,Household,Incredible Gifts India Wooden Happy Birthday U...


In [23]:
#Applying major transformation to text such as lowering text, removing stopwords, stemming, removing
#punctuations

ps = PorterStemmer();
def text_transform(text):
    text= text.lower()
    text = nltk.word_tokenize(text)
    y=[]
    for i in text:
        if i.isalnum():
            y.append(i)
    
    text = y[:]
    y.clear()
    for i in text:
        if i not in stopwords.words('english') and i not in string.punctuation:
            y.append(i)
    text = y[:]
    y.clear()
    for i in text:
        y.append(ps.stem(i))
        
    
    return " ".join(y)

In [42]:
import time
start = time.time()
text_transform(df['desc'].iloc[0])
end = time.time()
print("Estimated time for total dataset :",((end-start)*len(df))/60, "Mins")

Estimated time for total dataset : 9.934484362602234 Mins


In [44]:
start = time.time()
df['desc'] = df['desc'].apply(text_transform)
end = time.time()
print("Time taken :",((end-start)/60, "Mins"))

Time taken : (3.9005133390426634, 'Mins')


In [45]:
df['desc'].iloc[5]

'pitaara box romant venic canva paint 6mm thick mdf frame 21 1 x 14inch enclosur materi mdf mount frame 1inch x 14inch 53 6cm x 35 6cm size 21 1 inch x 14 0 inch 53 6 cm x 35 6 cm enhanc beauti room wall breathtak digit print artwork print technolog captur everi detail imag print enhanc matt paint canva ensur rich live colour wall art panel mount mdf readi hang wall beauti interior home artwork gift live dine room outdoor galleri hotel restaur offic recept kitchen area balconi bathroom pitaara box offer exclus collect thousand artwork digit paint canva print wall poster wall decor product home offic surround provid rang creativ spectacular art product use gift everi occas everi season tag wall paint canva print modern art abstract design wallart artwork home bedroom dine live draw room digit print bathroom common area kitchen offic decor stretch stretch frame frame beauti classi royal special uniqu eleg stylish creativ afford best photo gift fabric balconi interior exterior outdoor gal

In [46]:
df.to_csv('../Data/CleanedEcommerce.csv', index = False)