In [1]:
import pandas as pd
import re
import os
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
import unicodedata

import warnings
warnings.filterwarnings('ignore')

In [2]:
# import kaggle

In [3]:

# api.dataset_download_files('lokeshparab/amazon-products-dataset', path="Cycling.csv")
# !kaggle datasets download -d lokeshparab/amazon-products-dataset -f cycling.csv

In [4]:
# from kaggle.api.kaggle_api_extended import KaggleApi
# api = KaggleApi()
# api.authenticate()


In [5]:
# api.datasets_download_file('lokeshparab', 'amazon-products-dataset/', 
#                            file_name='cycling.csv')

In [6]:
def acquire_amazon():
    '''
    This will acqurie the Cycling category Amazon product data from either local cached
    file or from kaggle.com
    
    returns: uncleaned dataframe
    '''
    # set filename
    filename = 'amazon_cycling.csv'
    # check if local cached version of the file exists
    if os.path.exists(filename):
        # display status message
        print(f'Opening local {filename} file')
        # open local file data
        df = pd.read_csv(filename)
    # if there is no local file
    else:
        # display status message
        print(f'Local file {filename} not found')
        print('downloading data')
        # set url path to the dataset
        path = '''https://storage.googleapis.com/kagglesdsdata/datasets/3020336/5239462/Cycling.csv?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20230601%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20230601T202559Z&X-Goog-Expires=259200&X-Goog-SignedHeaders=host&X-Goog-Signature=609985cbe62280925ce349da43238b08dbf97feeee9055c7cdb98617052093e28ff6a4060b615d16f53fd2246ae70e0baeb950a2770e327998a184339d881b88bc0cfbfb2bde36af0544af4ada38bbff28f84957b8f48cc9a1bad4e15bba19190b9b992d475d4e80f8568dfd8d95b6b9c89bb60e8f75eaf79068e4ad36bab6c9bd69971f0c6d5b101e72684b407b88490c1471ff4a94540668165830c302eb3128389382028d84b6b438901e81f51a61c67e9dd6da74c0d4f2028582533573c808ab1218a5924a2d071bad89171fbaf634ce225b68775a0f193ea8c3230e19dce835467a56ac894db017586defae68bc8c175d3655edfcd73997e635c77932a2'''
        # read the data from the url path
        df = pd.read_csv(path)
        # cache the data to local csv file
        df.to_csv(filename, index=False)
    # return the dataframe
    return df

In [7]:
def basic_clean(original_string):
    '''
    This will take in a string, make it all lowercase, normalize the characters to ascii
    and remove characters that are not letters, numbers or spaces
    '''
    original_string = original_string.replace('|',' ').replace('/', ' ')
    # normalize the characters to ascii standart
    normalized = unicodedata.normalize('NFKD', original_string).\
        encode('ascii', 'ignore').decode('utf-8')
    # lowercase all the words in the data
    lowered = normalized.lower()
    # remove things that arent letters, numbers and spaces
    basic_cleaned = re.sub(r'[^a-zA-Z0-9\s]', '', lowered)
    #return the cleaned string
    return basic_cleaned

In [8]:
def tokenize(basic_cleaned):
    '''
    This will break up words into smaller, discrete (tokenized) units
    '''
    # grab our tokenizer from nltk
    tokenizer = nltk.tokenize.ToktokTokenizer()
    # tokenize the data
    tokenized = tokenizer.tokenize(basic_cleaned, return_str=True)
    # return the tokenized data
    return tokenized

In [9]:
def lemmatize(tokenized):
    '''
    This will cut a string of words down into their root words (Lemmatizing)
    '''
    # create lemmatizer object
    lemmatizer = nltk.stem.WordNetLemmatizer()
    # lemmatize every word in the string
    lemmatized = ' '.join([lemmatizer.lemmatize(word) for word in tokenized.split()])
    # return the lemmatized string
    return lemmatized

In [10]:
def remove_stopwords(string, extra_words=None, exclude_words=None):
    '''
    This will remove words that hold little meaning to a machine learning system
    such as: 'the' 'am', 'is', 'are',
    '''
    # get a list of the stopwords
    stopwords = nltk.corpus.stopwords.words('english')
    # add extra words to the stopwords list
    if extra_words:
        [stopwords.append(word) for word in extra_words]
    # remove the exclude words from the stopwords list if word is in the stopwords list
    if exclude_words:
        [stopwords.remove(word) for word in exclude_words 
                     if (word in stopwords)]
    # get the list of words that are not in the stopwords
    stops_removed = ' '.join([word for word in string.split() 
                              if word not in stopwords])
    # return the words not in the stopwords list
    return stops_removed

In [11]:
def clean_names(df, extra_words=None, exclude_words=None):
    '''
    This will clean/normalize, tokenize and lemmatize the product names from amazon in 
    preparation of using nlp on it.
    '''
    # create an empty list to store the names
    names = []
    # cycle through all the product names
    for name in df.name:
        # clean the product name
        basic_cleaned = basic_clean(name)
        # tokenize the words in the name
        tokenized = tokenize(basic_cleaned)
        # remove the stopwords from the product name
        cleaned = remove_stopwords(tokenized, 
                                   extra_words=extra_words, 
                                   exclude_words=exclude_words)
        # get the lemmatized words from the product name
        lemmatized = lemmatize(cleaned)
        # add the lemmatized name to the list of names
        names.append(lemmatized)
        
    # add the lemmatized version of the name to the original df
    df['name_preped'] = names
    # return the df
    return df

In [12]:
def prepare_amazon(df, extra_words=None, exclude_words=None):
    '''
    This will clean/normalize/tokenize/lemmatize the name column of the amazon products,
    it will also replace some nonsense values in the ratings columns with 0s and 
    remove Indian rupee symbol. It will then convert the correct dtypes,
    and convert the prices from Indian rupee into US dollars. Then it will
    create new columns for the discount_amount and rating_prod
    '''
    # clean/tokenize/lemmatize the product names so we can use regex on them later
    cleaned = clean_names(df, extra_words=None, exclude_words=None)
    
    # change nonsense rating values into 0s and convert dtypes
    cleaned.ratings = cleaned.ratings.str.strip().\
        str.replace(' ', '', regex=False).\
        str.replace(',','', regex=False).\
        str.replace('Get', '0', regex=False).\
        fillna('0').astype(float)
    cleaned.no_of_ratings = cleaned.no_of_ratings.str.strip().\
        str.replace(' ', '', regex=False).\
        str.replace(',','', regex=False).\
        str.replace('FREEDeliverybyAmazon', '0', regex=False).\
        str.replace('Only1leftinstock.', '0', regex=False).\
        str.replace('Only2leftinstock.', '0', regex=False).\
        fillna('0').astype(int)
    
    # remove rows without actual prices since they are probably not available
    cleaned = cleaned[cleaned.actual_price.isna() == False]
    # clean the prices and convert to float dtype
    cleaned.discount_price = cleaned.discount_price.str.strip().\
        str.replace('₹','', regex=False).\
        str.replace(',','', regex=False).astype(float)
    cleaned.actual_price = cleaned.actual_price.str.strip().\
        str.replace('₹','', regex=False).\
        str.replace(',','', regex=False).astype(float)
    # if there is no discount_price we will assume the actual_price
    cleaned.discount_price = cleaned.discount_price.fillna(cleaned.actual_price)
    
    # convert the prices from Indian rupees into US dollars (1 rupee = 0.012 dollars)
    cleaned.actual_price = round(cleaned.actual_price * 0.012, 2)
    cleaned.discount_price = round(cleaned.discount_price * 0.012, 2)
    
    # lets create a new column with the amount of discount
    cleaned['discount_amount'] = cleaned.actual_price - cleaned.discount_price
    # create a new column that is the product of avg ratings and no_of_ratings
    # so that we can get a rankings of all the products
    cleaned['prod_rating'] = cleaned.ratings * cleaned.no_of_ratings
    
    # rename columns to avoid confustion
    cleaned = cleaned.rename(columns={'main_category':'amazon_main_cat',
                                      'sub_category': 'amazon_sub_cat'})
    # reset the index after dropping null rows
    cleaned = cleaned.reset_index().drop(columns='index')
    
    # return the cleaned dataframe
    return cleaned

In [13]:
def wrangle_products():
    '''
    This will perform the acquisitioin and preparing of the amazon product info
    '''
    # acquire amazon product data
    df = acquire_amazon()
    # prepare the data
    cleaned = prepare_amazon(df)
    # return the prepared data
    return cleaned

In [14]:
def get_cat_list(cat_list, extra_words=None, exclude_words=None):
    cats = []
    # cycle through all the product cats
    for cat in cat_list:
        # clean the product cat
        basic_cleaned = basic_clean(cat)
        # tokenize the words in the cat
        tokenized = tokenize(basic_cleaned)
        # remove the stopwords from the product cat
        cleaned = remove_stopwords(tokenized,
                                   extra_words=extra_words,
                                   exclude_words=exclude_words)
        # get the lemmatized words from the product cat
        lemmatized = lemmatize(cleaned)
        # add the lemmatized cat to the list of cats
        cats.append(lemmatized)
    # return a list of categories that are cleaned and lemmatized 
    # whiich we can match to our product names
    return cats

In [15]:
df = acquire_amazon()

Opening local amazon_cycling.csv file


In [16]:
cleaned = clean_names(df)
cleaned.head()

Unnamed: 0,name,main_category,sub_category,image,link,ratings,no_of_ratings,discount_price,actual_price,name_preped
0,QUXIS Portable High Pressure Foot Air Pump Hea...,sports & fitness,Cycling,https://m.media-amazon.com/images/W/IMAGERENDE...,https://www.amazon.in/QUXIS-Portable-Activated...,4.0,7635,₹499,"₹1,299",quxis portable high pressure foot air pump hea...
1,Boldfit Gym Shaker for Protein Shake Leakproof...,sports & fitness,Cycling,https://m.media-amazon.com/images/W/IMAGERENDE...,https://www.amazon.in/Boldfit-Typhoon-Shaker-L...,4.1,8052,,₹229,boldfit gym shaker protein shake leakproof sha...
2,Techista 2-in-1 Rechargeable - Cycle Light (3 ...,sports & fitness,Cycling,https://m.media-amazon.com/images/W/IMAGERENDE...,https://www.amazon.in/Techista-2-1-Rechargeabl...,4.1,2214,₹339,₹999,techista 2in1 rechargeable cycle light 3 mode ...
3,"Nivia Plain Encounter Stockings (L, Black) - P...",sports & fitness,Cycling,https://m.media-amazon.com/images/W/IMAGERENDE...,https://www.amazon.in/Nivia-728LBW-Encounter-B...,4.1,2974,₹145,₹149,nivia plain encounter stocking l black polyest...
4,Eazo Steel Multipurpose Air Pump with needle (...,sports & fitness,Cycling,https://m.media-amazon.com/images/W/IMAGERENDE...,https://www.amazon.in/EAZO-Steel-Multipurpose-...,4.1,3678,₹369,₹450,eazo steel multipurpose air pump needle use ca...


In [17]:
cleaned.name_preped.replace('|',' ')[13]

'leader scout mtb 26t mountain bicycle bike without gear single speed men sea green ideal 10 year frame siz'

In [18]:
cleaned.name[13]

'Leader Scout MTB 26T Mountain Bicycle/Bike Without Gear Single Speed for Men - Sea Green, Ideal for 10 + Years, Frame Siz...'

In [19]:
df = wrangle_products()

Opening local amazon_cycling.csv file


In [20]:
df.shape

(1117, 12)

In [21]:
df.head()

Unnamed: 0,name,amazon_main_cat,amazon_sub_cat,image,link,ratings,no_of_ratings,discount_price,actual_price,name_preped,discount_amount,prod_rating
0,QUXIS Portable High Pressure Foot Air Pump Hea...,sports & fitness,Cycling,https://m.media-amazon.com/images/W/IMAGERENDE...,https://www.amazon.in/QUXIS-Portable-Activated...,4.0,7635,5.99,15.59,quxis portable high pressure foot air pump hea...,9.6,30540.0
1,Boldfit Gym Shaker for Protein Shake Leakproof...,sports & fitness,Cycling,https://m.media-amazon.com/images/W/IMAGERENDE...,https://www.amazon.in/Boldfit-Typhoon-Shaker-L...,4.1,8052,2.75,2.75,boldfit gym shaker protein shake leakproof sha...,0.0,33013.2
2,Techista 2-in-1 Rechargeable - Cycle Light (3 ...,sports & fitness,Cycling,https://m.media-amazon.com/images/W/IMAGERENDE...,https://www.amazon.in/Techista-2-1-Rechargeabl...,4.1,2214,4.07,11.99,techista 2in1 rechargeable cycle light 3 mode ...,7.92,9077.4
3,"Nivia Plain Encounter Stockings (L, Black) - P...",sports & fitness,Cycling,https://m.media-amazon.com/images/W/IMAGERENDE...,https://www.amazon.in/Nivia-728LBW-Encounter-B...,4.1,2974,1.74,1.79,nivia plain encounter stocking l black polyest...,0.05,12193.4
4,Eazo Steel Multipurpose Air Pump with needle (...,sports & fitness,Cycling,https://m.media-amazon.com/images/W/IMAGERENDE...,https://www.amazon.in/EAZO-Steel-Multipurpose-...,4.1,3678,4.43,5.4,eazo steel multipurpose air pump needle use ca...,0.97,15079.8


In [22]:
cleaned.shape

(1152, 10)

In [23]:
cleaned.isna().sum()

name                0
main_category       0
sub_category        0
image               0
link                0
ratings           273
no_of_ratings     273
discount_price    178
actual_price       35
name_preped         0
dtype: int64

### If there are no ratings, then ratings and no_of_ratings will both be nan. So lets fill those nulls with 0.
### Ift there is no actual_price it could either be an data entry error or a product that is no longer available. If there is no actual_price, there will also be nulls in discount_price and discount_amount. I think we should look at these as not available, and we will not want to recommend products that are not available. So lets drop the nulls without prices.

In [24]:
cleaned = cleaned[cleaned.actual_price.isna() == False]

In [25]:
cleaned.head(2)

Unnamed: 0,name,main_category,sub_category,image,link,ratings,no_of_ratings,discount_price,actual_price,name_preped
0,QUXIS Portable High Pressure Foot Air Pump Hea...,sports & fitness,Cycling,https://m.media-amazon.com/images/W/IMAGERENDE...,https://www.amazon.in/QUXIS-Portable-Activated...,4.0,7635,₹499,"₹1,299",quxis portable high pressure foot air pump hea...
1,Boldfit Gym Shaker for Protein Shake Leakproof...,sports & fitness,Cycling,https://m.media-amazon.com/images/W/IMAGERENDE...,https://www.amazon.in/Boldfit-Typhoon-Shaker-L...,4.1,8052,,₹229,boldfit gym shaker protein shake leakproof sha...


In [26]:
cleaned.shape

(1117, 10)

In [27]:
cleaned.discount_price = cleaned.discount_price.\
    str.replace('₹','', regex=False).\
    str.replace(',','', regex=False).astype(float)

In [28]:
cleaned.actual_price = cleaned.actual_price.\
    str.replace('₹','', regex=False).\
    str.replace(',','', regex=False).astype(float)

In [29]:
cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1117 entries, 0 to 1151
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   name            1117 non-null   object 
 1   main_category   1117 non-null   object 
 2   sub_category    1117 non-null   object 
 3   image           1117 non-null   object 
 4   link            1117 non-null   object 
 5   ratings         856 non-null    object 
 6   no_of_ratings   856 non-null    object 
 7   discount_price  974 non-null    float64
 8   actual_price    1117 non-null   float64
 9   name_preped     1117 non-null   object 
dtypes: float64(2), object(8)
memory usage: 96.0+ KB


In [30]:
cleaned.discount_price = cleaned.discount_price.fillna(cleaned.actual_price)

In [31]:
cleaned['discount_amount'] = cleaned.actual_price - cleaned.discount_price

In [32]:
cleaned.discount_price.value_counts()

499.00      43
299.00      35
399.00      34
599.00      29
199.00      28
            ..
560.82       1
4829.00      1
254.00       1
7605.00      1
14250.00     1
Name: discount_price, Length: 519, dtype: int64

In [33]:
cleaned.head(3)

Unnamed: 0,name,main_category,sub_category,image,link,ratings,no_of_ratings,discount_price,actual_price,name_preped,discount_amount
0,QUXIS Portable High Pressure Foot Air Pump Hea...,sports & fitness,Cycling,https://m.media-amazon.com/images/W/IMAGERENDE...,https://www.amazon.in/QUXIS-Portable-Activated...,4.0,7635,499.0,1299.0,quxis portable high pressure foot air pump hea...,800.0
1,Boldfit Gym Shaker for Protein Shake Leakproof...,sports & fitness,Cycling,https://m.media-amazon.com/images/W/IMAGERENDE...,https://www.amazon.in/Boldfit-Typhoon-Shaker-L...,4.1,8052,229.0,229.0,boldfit gym shaker protein shake leakproof sha...,0.0
2,Techista 2-in-1 Rechargeable - Cycle Light (3 ...,sports & fitness,Cycling,https://m.media-amazon.com/images/W/IMAGERENDE...,https://www.amazon.in/Techista-2-1-Rechargeabl...,4.1,2214,339.0,999.0,techista 2in1 rechargeable cycle light 3 mode ...,660.0


In [34]:
cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1117 entries, 0 to 1151
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   name             1117 non-null   object 
 1   main_category    1117 non-null   object 
 2   sub_category     1117 non-null   object 
 3   image            1117 non-null   object 
 4   link             1117 non-null   object 
 5   ratings          856 non-null    object 
 6   no_of_ratings    856 non-null    object 
 7   discount_price   1117 non-null   float64
 8   actual_price     1117 non-null   float64
 9   name_preped      1117 non-null   object 
 10  discount_amount  1117 non-null   float64
dtypes: float64(3), object(8)
memory usage: 104.7+ KB


In [35]:
cleaned.no_of_ratings = cleaned.no_of_ratings.str.strip().str.replace(',','',).\
    str.replace(' ','')

In [36]:
cleaned.head(2)

Unnamed: 0,name,main_category,sub_category,image,link,ratings,no_of_ratings,discount_price,actual_price,name_preped,discount_amount
0,QUXIS Portable High Pressure Foot Air Pump Hea...,sports & fitness,Cycling,https://m.media-amazon.com/images/W/IMAGERENDE...,https://www.amazon.in/QUXIS-Portable-Activated...,4.0,7635,499.0,1299.0,quxis portable high pressure foot air pump hea...,800.0
1,Boldfit Gym Shaker for Protein Shake Leakproof...,sports & fitness,Cycling,https://m.media-amazon.com/images/W/IMAGERENDE...,https://www.amazon.in/Boldfit-Typhoon-Shaker-L...,4.1,8052,229.0,229.0,boldfit gym shaker protein shake leakproof sha...,0.0


In [37]:
# cleaned.no_of_ratings = cleaned.no_of_ratings.astype(float)

In [38]:
cleaned.no_of_ratings.unique()

array(['7635', '8052', '2214', '2974', '3678', '1444', '8334', '66',
       '566', '6580', '5117', '1601', '2759', '3457', '46', '632',
       '12842', '6596', '2507', '708', '1022', '1062', '768', '487',
       '268', '81', '447', '169', '424', '416', '485', '1', '23', '1145',
       '300', '2589', '252', '1270', '78', '144', '13', '65', '287',
       '4207', '203', '15', '121', '247', '870', '10244', '128', '1623',
       '316', '6083', '536', '26', '25', '2839', '180', '48', '29',
       '18123', '135', '1639', '1873', '157', '98', '240', '291', '1848',
       '2757', '1373', '16', '3', '241', '202', '75', '453', '2472', '24',
       '36', '951', '2', '4', '156', '125', '8', '51', '6', '383', '74',
       nan, '20', '50', '5', '119', '278', '18', '10', '14', '184', '133',
       '430', '159', '33', '9', '31', '12', '30', '64', '1111', '68',
       '104', '17', '267', '131', '54', '53', '39', '11', '255', '55',
       '130', '34', '59', '41', '40', '346', '149',
       'FREEDeliveryb

### FFS why is there 'FREEDeliverybyAmazon' in the no_of_ratings column. Let's check out these nonsense rows.

In [39]:
cleaned[cleaned.no_of_ratings == 'FREEDeliverybyAmazon']

Unnamed: 0,name,main_category,sub_category,image,link,ratings,no_of_ratings,discount_price,actual_price,name_preped,discount_amount
214,BODYCARE Unisex Kids Thermal Bottoms Pack Of 1,sports & fitness,Cycling,https://m.media-amazon.com/images/I/41RT9wzHoY...,https://www.amazon.in/BODYCARE-Unisex-Thermal-...,Get,FREEDeliverybyAmazon,625.0,625.0,bodycare unisex kid thermal bottom pack 1,0.0
245,Puma Footie Unisex Socks Pack of 3,sports & fitness,Cycling,https://m.media-amazon.com/images/W/IMAGERENDE...,https://www.amazon.in/PUMA-Footie-Unisex-Socks...,Get,FREEDeliverybyAmazon,599.0,599.0,puma footie unisex sock pack 3,0.0
261,Charged Active-001 Camo Jacquard Round Neck Sp...,sports & fitness,Cycling,https://m.media-amazon.com/images/W/IMAGERENDE...,https://www.amazon.in/Active-001-Jacquard-Petr...,Get,FREEDeliverybyAmazon,1196.0,1348.0,charged active001 camo jacquard round neck spo...,152.0
272,Van Heusen&comma;VH Woman Women's Relaxed Fit ...,sports & fitness,Cycling,https://m.media-amazon.com/images/I/519WKMByxw...,https://www.amazon.in/Van-Heusen-comma-Woman-I...,Get,FREEDeliverybyAmazon,1059.0,1059.0,van heusencommavh woman woman relaxed fit tshirt,0.0
412,"Adidas Unisex Mufc H So Socks (H13893_L, White...",sports & fitness,Cycling,https://m.media-amazon.com/images/W/IMAGERENDE...,https://www.amazon.in/Adidas-Unisex-Socks-H138...,Get,FREEDeliverybyAmazon,919.0,1499.0,adidas unisex mufc h sock h13893l white l,580.0
466,Charged Active-001 Camo Jacquard Round Neck Sp...,sports & fitness,Cycling,https://m.media-amazon.com/images/W/IMAGERENDE...,https://www.amazon.in/Active-001-Jacquard-Poly...,Get,FREEDeliverybyAmazon,1198.0,1198.0,charged active001 camo jacquard round neck spo...,0.0
482,Charged Active-001 Camo Jacquard Round Neck Sp...,sports & fitness,Cycling,https://m.media-amazon.com/images/I/81blA0UDQy...,https://www.amazon.in/Active-001-Polyester-Dar...,Get,FREEDeliverybyAmazon,1248.0,1248.0,charged active001 camo jacquard round neck spo...,0.0
483,Charged Active-001 Camo Jacquard Polyester Rou...,sports & fitness,Cycling,https://m.media-amazon.com/images/I/91nMeW8UoD...,https://www.amazon.in/Charged-Active-001-Jacqu...,Get,FREEDeliverybyAmazon,1198.0,1198.0,charged active001 camo jacquard polyester roun...,0.0
489,Charged Active-001 Camo Jacquard Round Neck Sp...,sports & fitness,Cycling,https://m.media-amazon.com/images/I/81Al5JUJ4s...,https://www.amazon.in/Charged-Active-001-Jacqu...,Get,FREEDeliverybyAmazon,1196.0,1348.0,charged active001 camo jacquard round neck spo...,152.0
510,ONN Men's Premium Thermal Bottom,sports & fitness,Cycling,https://m.media-amazon.com/images/W/IMAGERENDE...,https://www.amazon.in/ONN-Premium-Thermal-Bott...,Get,FREEDeliverybyAmazon,690.0,690.0,onn men premium thermal bottom,0.0


In [40]:
cleaned[cleaned.no_of_ratings == 'Only1leftinstock.']

Unnamed: 0,name,main_category,sub_category,image,link,ratings,no_of_ratings,discount_price,actual_price,name_preped,discount_amount
246,IndiaLot® Bicycle Tyre 20x 2.125/1.95/1.90/1.7...,sports & fitness,Cycling,https://m.media-amazon.com/images/W/IMAGERENDE...,https://www.amazon.in/IndiaLot%C2%AE-Bicycle-T...,Get,Only1leftinstock.,599.0,675.0,indialot bicycle tyre 20x 2125 195 190 175 mul...,76.0
424,DIKUDI CREATION Doraemon Rider Infant to Toddl...,sports & fitness,Cycling,https://m.media-amazon.com/images/W/IMAGERENDE...,https://www.amazon.in/DIKUDI-CREATION-Doraemon...,Get,Only1leftinstock.,1099.0,1599.0,dikudi creation doraemon rider infant toddler ...,500.0
1017,Dhara Kid Bicycle 3-5 Years Size 14T Unisex Cy...,sports & fitness,Cycling,https://m.media-amazon.com/images/I/51xytb8F7S...,https://www.amazon.in/Dhara-Bicycle-Unisex-Tub...,Get,Only1leftinstock.,3300.0,4999.0,dhara kid bicycle 35 year size 14t unisex cycl...,1699.0
1027,Layfuz 32T/34T/36T/38T Bike Chainring 104mm B ...,sports & fitness,Cycling,https://m.media-amazon.com/images/I/71Om35SiQ0...,https://www.amazon.in/Layfuz-Chainring-104mm-4...,Get,Only1leftinstock.,1039.9,2179.57,layfuz 32t 34t 36t 38t bike chainring 104mm b ...,1139.67


#### All these listings have 'Get' in the ratings category, seems like a data entry error

In [41]:
cleaned[cleaned.ratings == 'Get']

Unnamed: 0,name,main_category,sub_category,image,link,ratings,no_of_ratings,discount_price,actual_price,name_preped,discount_amount
214,BODYCARE Unisex Kids Thermal Bottoms Pack Of 1,sports & fitness,Cycling,https://m.media-amazon.com/images/I/41RT9wzHoY...,https://www.amazon.in/BODYCARE-Unisex-Thermal-...,Get,FREEDeliverybyAmazon,625.0,625.0,bodycare unisex kid thermal bottom pack 1,0.0
245,Puma Footie Unisex Socks Pack of 3,sports & fitness,Cycling,https://m.media-amazon.com/images/W/IMAGERENDE...,https://www.amazon.in/PUMA-Footie-Unisex-Socks...,Get,FREEDeliverybyAmazon,599.0,599.0,puma footie unisex sock pack 3,0.0
246,IndiaLot® Bicycle Tyre 20x 2.125/1.95/1.90/1.7...,sports & fitness,Cycling,https://m.media-amazon.com/images/W/IMAGERENDE...,https://www.amazon.in/IndiaLot%C2%AE-Bicycle-T...,Get,Only1leftinstock.,599.0,675.0,indialot bicycle tyre 20x 2125 195 190 175 mul...,76.0
261,Charged Active-001 Camo Jacquard Round Neck Sp...,sports & fitness,Cycling,https://m.media-amazon.com/images/W/IMAGERENDE...,https://www.amazon.in/Active-001-Jacquard-Petr...,Get,FREEDeliverybyAmazon,1196.0,1348.0,charged active001 camo jacquard round neck spo...,152.0
272,Van Heusen&comma;VH Woman Women's Relaxed Fit ...,sports & fitness,Cycling,https://m.media-amazon.com/images/I/519WKMByxw...,https://www.amazon.in/Van-Heusen-comma-Woman-I...,Get,FREEDeliverybyAmazon,1059.0,1059.0,van heusencommavh woman woman relaxed fit tshirt,0.0
412,"Adidas Unisex Mufc H So Socks (H13893_L, White...",sports & fitness,Cycling,https://m.media-amazon.com/images/W/IMAGERENDE...,https://www.amazon.in/Adidas-Unisex-Socks-H138...,Get,FREEDeliverybyAmazon,919.0,1499.0,adidas unisex mufc h sock h13893l white l,580.0
424,DIKUDI CREATION Doraemon Rider Infant to Toddl...,sports & fitness,Cycling,https://m.media-amazon.com/images/W/IMAGERENDE...,https://www.amazon.in/DIKUDI-CREATION-Doraemon...,Get,Only1leftinstock.,1099.0,1599.0,dikudi creation doraemon rider infant toddler ...,500.0
466,Charged Active-001 Camo Jacquard Round Neck Sp...,sports & fitness,Cycling,https://m.media-amazon.com/images/W/IMAGERENDE...,https://www.amazon.in/Active-001-Jacquard-Poly...,Get,FREEDeliverybyAmazon,1198.0,1198.0,charged active001 camo jacquard round neck spo...,0.0
482,Charged Active-001 Camo Jacquard Round Neck Sp...,sports & fitness,Cycling,https://m.media-amazon.com/images/I/81blA0UDQy...,https://www.amazon.in/Active-001-Polyester-Dar...,Get,FREEDeliverybyAmazon,1248.0,1248.0,charged active001 camo jacquard round neck spo...,0.0
483,Charged Active-001 Camo Jacquard Polyester Rou...,sports & fitness,Cycling,https://m.media-amazon.com/images/I/91nMeW8UoD...,https://www.amazon.in/Charged-Active-001-Jacqu...,Get,FREEDeliverybyAmazon,1198.0,1198.0,charged active001 camo jacquard polyester roun...,0.0


### The recommender system will be probably be using (ratings * no_of_ratings) to find which are the most popular products. If we fill these nonsense ratings with 0 then the products will be at the bottom of our recommendation list, which should be fine. So lets change the nonsence to 0s.

In [42]:
cleaned.ratings.unique()

array(['4.0', '4.1', '3.8', '3.9', '3.7', '3.6', '4.4', '4.3', '5.0',
       '4.2', '3.5', '2.7', '3.4', '3.3', '4.6', '3.2', '4.8', '3.0',
       '4.5', nan, '2.6', '1.0', '3.1', '2.8', '2.2', '2.9', 'Get', '4.7',
       '1.9', '2.0', '2.4', '2.5', '1.7', '2.1', '1.8', '2.3'],
      dtype=object)

In [43]:
cleaned.ratings = cleaned.ratings.str.replace('Get', '0', regex=False).\
    fillna('0').astype(float)

In [44]:
cleaned.no_of_ratings = cleaned.no_of_ratings.\
    str.replace('FREEDeliverybyAmazon', '0', regex=False).\
    str.replace('Only1leftinstock.', '0', regex=False).\
    str.replace('Only2leftinstock.', '0', regex=False).\
    fillna('0').astype(int)

In [45]:
cleaned.no_of_ratings.unique()

array([ 7635,  8052,  2214,  2974,  3678,  1444,  8334,    66,   566,
        6580,  5117,  1601,  2759,  3457,    46,   632, 12842,  6596,
        2507,   708,  1022,  1062,   768,   487,   268,    81,   447,
         169,   424,   416,   485,     1,    23,  1145,   300,  2589,
         252,  1270,    78,   144,    13,    65,   287,  4207,   203,
          15,   121,   247,   870, 10244,   128,  1623,   316,  6083,
         536,    26,    25,  2839,   180,    48,    29, 18123,   135,
        1639,  1873,   157,    98,   240,   291,  1848,  2757,  1373,
          16,     3,   241,   202,    75,   453,  2472,    24,    36,
         951,     2,     4,   156,   125,     8,    51,     6,   383,
          74,     0,    20,    50,     5,   119,   278,    18,    10,
          14,   184,   133,   430,   159,    33,     9,    31,    12,
          30,    64,  1111,    68,   104,    17,   267,   131,    54,
          53,    39,    11,   255,    55,   130,    34,    59,    41,
          40,   346,

In [46]:
cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1117 entries, 0 to 1151
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   name             1117 non-null   object 
 1   main_category    1117 non-null   object 
 2   sub_category     1117 non-null   object 
 3   image            1117 non-null   object 
 4   link             1117 non-null   object 
 5   ratings          1117 non-null   float64
 6   no_of_ratings    1117 non-null   int64  
 7   discount_price   1117 non-null   float64
 8   actual_price     1117 non-null   float64
 9   name_preped      1117 non-null   object 
 10  discount_amount  1117 non-null   float64
dtypes: float64(4), int64(1), object(6)
memory usage: 104.7+ KB


### The prices in this dataset are listed in ₹, Indian Rupees
1 rupee is approximately 0.012 US dollars

Lets convert to US dollars since we are in USA and our other dataset uses dollars

In [47]:
cleaned.actual_price = round(cleaned.actual_price * 0.012, 2)
cleaned.discount_price = round(cleaned.discount_price * 0.012, 2)

In [48]:
cleaned.head()

Unnamed: 0,name,main_category,sub_category,image,link,ratings,no_of_ratings,discount_price,actual_price,name_preped,discount_amount
0,QUXIS Portable High Pressure Foot Air Pump Hea...,sports & fitness,Cycling,https://m.media-amazon.com/images/W/IMAGERENDE...,https://www.amazon.in/QUXIS-Portable-Activated...,4.0,7635,5.99,15.59,quxis portable high pressure foot air pump hea...,800.0
1,Boldfit Gym Shaker for Protein Shake Leakproof...,sports & fitness,Cycling,https://m.media-amazon.com/images/W/IMAGERENDE...,https://www.amazon.in/Boldfit-Typhoon-Shaker-L...,4.1,8052,2.75,2.75,boldfit gym shaker protein shake leakproof sha...,0.0
2,Techista 2-in-1 Rechargeable - Cycle Light (3 ...,sports & fitness,Cycling,https://m.media-amazon.com/images/W/IMAGERENDE...,https://www.amazon.in/Techista-2-1-Rechargeabl...,4.1,2214,4.07,11.99,techista 2in1 rechargeable cycle light 3 mode ...,660.0
3,"Nivia Plain Encounter Stockings (L, Black) - P...",sports & fitness,Cycling,https://m.media-amazon.com/images/W/IMAGERENDE...,https://www.amazon.in/Nivia-728LBW-Encounter-B...,4.1,2974,1.74,1.79,nivia plain encounter stocking l black polyest...,4.0
4,Eazo Steel Multipurpose Air Pump with needle (...,sports & fitness,Cycling,https://m.media-amazon.com/images/W/IMAGERENDE...,https://www.amazon.in/EAZO-Steel-Multipurpose-...,4.1,3678,4.43,5.4,eazo steel multipurpose air pump needle use ca...,81.0


## lets create a new column that is a product of the ratings and no_of_ratings to get an overalll rating_prod that we can use as a ranking of the products

In [49]:
cleaned['rating_prod'] = cleaned.ratings * cleaned.no_of_ratings

In [50]:
cleaned.head()

Unnamed: 0,name,main_category,sub_category,image,link,ratings,no_of_ratings,discount_price,actual_price,name_preped,discount_amount,rating_prod
0,QUXIS Portable High Pressure Foot Air Pump Hea...,sports & fitness,Cycling,https://m.media-amazon.com/images/W/IMAGERENDE...,https://www.amazon.in/QUXIS-Portable-Activated...,4.0,7635,5.99,15.59,quxis portable high pressure foot air pump hea...,800.0,30540.0
1,Boldfit Gym Shaker for Protein Shake Leakproof...,sports & fitness,Cycling,https://m.media-amazon.com/images/W/IMAGERENDE...,https://www.amazon.in/Boldfit-Typhoon-Shaker-L...,4.1,8052,2.75,2.75,boldfit gym shaker protein shake leakproof sha...,0.0,33013.2
2,Techista 2-in-1 Rechargeable - Cycle Light (3 ...,sports & fitness,Cycling,https://m.media-amazon.com/images/W/IMAGERENDE...,https://www.amazon.in/Techista-2-1-Rechargeabl...,4.1,2214,4.07,11.99,techista 2in1 rechargeable cycle light 3 mode ...,660.0,9077.4
3,"Nivia Plain Encounter Stockings (L, Black) - P...",sports & fitness,Cycling,https://m.media-amazon.com/images/W/IMAGERENDE...,https://www.amazon.in/Nivia-728LBW-Encounter-B...,4.1,2974,1.74,1.79,nivia plain encounter stocking l black polyest...,4.0,12193.4
4,Eazo Steel Multipurpose Air Pump with needle (...,sports & fitness,Cycling,https://m.media-amazon.com/images/W/IMAGERENDE...,https://www.amazon.in/EAZO-Steel-Multipurpose-...,4.1,3678,4.43,5.4,eazo steel multipurpose air pump needle use ca...,81.0,15079.8


In [51]:
# time to test the wrangle_products.py file

In [52]:
import wrangle_products

In [53]:
df2 = wrangle_products.wrangle_products()
df2.head()

Opening local amazon_cycling.csv file


Unnamed: 0,name,amazon_main_cat,amazon_sub_cat,image,link,ratings,no_of_ratings,discount_price,actual_price,name_preped,discount_amount,prod_rating
0,QUXIS Portable High Pressure Foot Air Pump Hea...,sports & fitness,Cycling,https://m.media-amazon.com/images/W/IMAGERENDE...,https://www.amazon.in/QUXIS-Portable-Activated...,4.0,7635,5.99,15.59,quxis portable high pressure foot air pump hea...,9.6,30540.0
1,Boldfit Gym Shaker for Protein Shake Leakproof...,sports & fitness,Cycling,https://m.media-amazon.com/images/W/IMAGERENDE...,https://www.amazon.in/Boldfit-Typhoon-Shaker-L...,4.1,8052,2.75,2.75,boldfit gym shaker protein shake leakproof sha...,0.0,33013.2
2,Techista 2-in-1 Rechargeable - Cycle Light (3 ...,sports & fitness,Cycling,https://m.media-amazon.com/images/W/IMAGERENDE...,https://www.amazon.in/Techista-2-1-Rechargeabl...,4.1,2214,4.07,11.99,techista 2in1 rechargeable cycle light 3 mode ...,7.92,9077.4
3,"Nivia Plain Encounter Stockings (L, Black) - P...",sports & fitness,Cycling,https://m.media-amazon.com/images/W/IMAGERENDE...,https://www.amazon.in/Nivia-728LBW-Encounter-B...,4.1,2974,1.74,1.79,nivia plain encounter stocking l black polyest...,0.05,12193.4
4,Eazo Steel Multipurpose Air Pump with needle (...,sports & fitness,Cycling,https://m.media-amazon.com/images/W/IMAGERENDE...,https://www.amazon.in/EAZO-Steel-Multipurpose-...,4.1,3678,4.43,5.4,eazo steel multipurpose air pump needle use ca...,0.97,15079.8


In [54]:
sub_cat_list = ['Tires and Tubes', 'Gloves', 'Helmets', 'Bike Stands',
       'Mountain Bikes', 'Hydration Packs', 'Jerseys', 'Fenders',
       'Cleaners', 'Socks', 'Caps', 'Touring Bikes', 'Bottles and Cages',
       'Vests', 'Road Bikes', 'Bike Racks', 'Shorts']

In [55]:
cat_list = get_cat_list(sub_cat_list, extra_words=['bike', 'bikes'])

In [56]:
cat_list

['tire tube',
 'glove',
 'helmet',
 'stand',
 'mountain',
 'hydration pack',
 'jersey',
 'fender',
 'cleaner',
 'sock',
 'cap',
 'touring',
 'bottle cage',
 'vest',
 'road',
 'rack',
 'short']

In [57]:
tires_and_tubes_regex = r'.*tire.*' 

In [58]:
re.search(tires_and_tubes_regex, cleaned.name_preped[6])

In [59]:
horn_regex = r'.*horn.*'

In [60]:
re.search(horn_regex, cleaned.name_preped[6])
# match

<re.Match object; span=(0, 85), match='lista rechargeable bike horn light 140 db super b>

In [61]:
re.search(horn_regex, cleaned.name_preped[7])
# no match