# Clean scraped Amazon data with the 'CleaningAmazonData' package

In [1]:
# Importing Libraries
import pandas as pd
import numpy as np 
import os

from CleaningAmazonData import CleanDescriptionFile, CleanReviewFile


In [2]:
# Load the raw description and review tables into dataframes.
# The data directory can be overridden by setting the SCRAPY_DATA_DIR
# environment variable; when it is unset, the original local path is used,
# so existing behaviour is unchanged.
path = os.environ.get(
    'SCRAPY_DATA_DIR',
    r'C:\Users\Lajar\OneDrive\CrowdDoing\Research\Revised_data\Scrapy_Data',
)

desc_df = pd.read_csv(os.path.join(path, '1_st_johns_wort_description.csv'))
review_df = pd.read_csv(os.path.join(path, '1_st_johns_wort_review.csv'))

# Show (rows, columns) of both raw tables as the cell output
desc_df.shape, review_df.shape

((2106, 22), (3155, 17))

## Cleaning Description File

In [3]:
# Count missing values per column in the raw description table
desc_df.isna().sum()

ASIN                        0
Domain                      0
ExcludeTerm                 0
ExclusionInProduct          0
IngredientInProduct         0
Keyword                     0
KeywordDept                 0
MatchTerm                   0
ProductDescription          0
ProductDirections        1794
ProductFlavor               0
ProductIngredientList    1129
ProductLegal              459
ProductLink                 0
ProductName                 0
ProductRatings              0
ProductSafety            1861
ProductStar                 0
ProductSummary            343
RetrievedTime               0
ReviewsLink                 0
UPC                         0
dtype: int64

In [4]:
# Build the description cleaner (check_ASIN flag set to True)
cdf = CleanDescriptionFile(check_ASIN=True)

# Look for invalid ASIN entries, if any, in the description table
invalid_ASIN = cdf.check_ASIN_validity(desc_df)

2003    ref=sr_1_1158?dchild=1&keywords=st+johns+wort&...
dtype: object


Note: Inspect the `invalid_ASIN` Series shown above. Correct the affected rows where possible; otherwise remove them.

In [5]:
# Run the cleaner over the raw description table, then preview the first rows
cleaned_desc_df = cdf.transform(desc_df)
cleaned_desc_df.head(5)

Unnamed: 0,ASIN,Domain,ExcludeTerm,ExclusionInProduct,IngredientInProduct,Keyword,KeywordDept,MatchTerm,ProductDescription,ProductDirections,...,ProductName,ProductRatings,ProductSafety,ProductStar,ProductSummary,RetrievedTime,ReviewsLink,UPC,RetrivedTime,Category
0,B06WWFMCL4,amazon.com,"skin & hair oil, no_exclusion_string",0,1,st johns wort,1,"st johns wort, johns wort, john's wort",15 million Americans are affected by some form...,"As a dietary supplement, take two (2) capsules...",...,"Anti Anxiety Supplement 900mg With Gaba, L-The...",1585,Not Available,4.4,Not Available,2020-06-15,/Supplement-L-Theanine-Ashwagandha-Magnesium-C...,No_UPC,2020-06-15,supplement
1,B00GB85JR4,amazon.com,"skin & hair oil, no_exclusion_string",0,0,st johns wort,1,"st johns wort, johns wort, john's wort",Vitamin D can be found in cells throughout the...,It's normal for gel capsules to get soft and s...,...,"NatureWise Vitamin D3 5,000 IU (1 Year Supply)...",17059,Vitamin D toxicity can result from regular exc...,4.7,MOST ACTIVE FORM: NatureWise Vitamin D3 in cer...,2020-06-15,/NatureWise-Vitamin-Function-Cold-Pressed-Glut...,858081006042,2020-06-15,oil
2,B06XC9CZWN,amazon.com,"skin & hair oil, no_exclusion_string",0,0,st johns wort,1,"st johns wort, johns wort, john's wort",Our goal is to help you be the best version of...,Not Available,...,Havasu Nutrition Ashwagandha Capsules Formulat...,1760,These statements have not been evaluated by th...,4.4,PREMIUM STRESS FIGHTER - Our natural Ashwagand...,2020-06-15,/Havasu-Nutrition-Premium-Ashwagandha-1000mg/p...,045924591333,2020-06-15,tablet
3,B00TO7J3KS,amazon.com,"skin & hair oil, no_exclusion_string",0,0,st johns wort,1,"st johns wort, johns wort, john's wort",5-HTP is a precursor to the neurotransmitter s...,Not Available,...,NatureWise 5-HTP 100mg | Natural Mood & Sleep ...,1500,This product is labelled to United States stan...,4.5,Designed to give an added boost to your weight...,2020-06-15,/NatureWise-Appetite-Enhanced-Vegetarian-Packa...,858081006097,2020-06-15,other
4,B06ZYHJYD5,amazon.com,"skin & hair oil, no_exclusion_string",0,0,st johns wort,1,"st johns wort, johns wort, john's wort",delivers 2100 mg per serving of organic ashwag...,Not Available,...,"Organic Ashwagandha 2,100 mg - 100 Vegan Capsu...",3794,Not Available,4.5,Not Available,2020-06-15,/Organic-Ashwagandha-1300mg-Capsules-Enhancer/...,804540020746,2020-06-15,tablet


Note: The resulting dataframe is the cleaned dataframe, with an added feature column 'Category'.

In [6]:
# Dimensions (rows, columns) of the cleaned description table
cleaned_desc_df.shape

(1928, 24)

In [7]:
# Count missing values per column in the cleaned description table
cleaned_desc_df.isna().sum()

ASIN                     0
Domain                   0
ExcludeTerm              0
ExclusionInProduct       0
IngredientInProduct      0
Keyword                  0
KeywordDept              0
MatchTerm                0
ProductDescription       0
ProductDirections        0
ProductFlavor            0
ProductIngredientList    0
ProductLegal             0
ProductLink              0
ProductName              0
ProductRatings           0
ProductSafety            0
ProductStar              0
ProductSummary           0
RetrievedTime            0
ReviewsLink              0
UPC                      0
RetrivedTime             0
Category                 0
dtype: int64

## Cleaning Review File

In [8]:
# Build the review cleaner (check_ASIN flag set to True)
crf = CleanReviewFile(check_ASIN=True)

# Look for invalid ASIN entries, if any, in the review table
# (this reuses the name invalid_ASIN, replacing the description-file result)
invalid_ASIN = crf.check_ASIN_validity(review_df)

Series([], dtype: object)


In [9]:
# Transform raw review_df into cleaned_review_df with the additional feature 'ProcessedText'
cleaned_review_df = crf.transform(review_df)

In [10]:
# Preview the 'ProcessedText' column of the cleaned reviews
cleaned_review_df['ProcessedText']

0       go start review tell recover addict alcoholic ...
1       wanna cry even tell u much help use benzoyl pe...
2       take daily year positive impact overall mood j...
3       wait week post make sure though saw improvemen...
4       product life saver since break prescription cy...
                              ...                        
3150    exactly need brand guarantee potency take edge...
3151                               perfect help attention
3152    like burt be products pricey mainly healthy in...
3153        take three months still feel effect relax say
3154    best st johns wort oil beat second best long d...
Name: ProcessedText, Length: 3155, dtype: object