## Second Data Pipeline
In this notebook, the keyword extractor from the main pipeline is reused on a different dataset.

In [1]:
import pandas as pd
import numpy as np

### Read Product Data

In [2]:
# The labelled Amazon product dataset is not shipped with the repository:
# point `file_path` at your own copy of the data.
# NOTE(review): the file carries a .tsv extension but is parsed with read_csv's
# default separator — confirm the delimiter of your copy.
file_path = "amazon_aspect_labeled_products.tsv"
raw_products = pd.read_csv(file_path)
# Discard the leading column, then keep only the three fields this notebook uses.
labelled_data = raw_products.drop(columns=raw_products.columns[0])[["asin", "description", "title"]]
labelled_data

Unnamed: 0,asin,description,title
0,B000B5H26O,['Yorkshire Tea never compromise on the qualit...,"Taylors of Harrogate Yorkshire Gold Tea, 160 T..."
1,B000F3WS7K,['We choose teas from our three favorite origi...,"Taylors of Harrogate Yorkshire Gold, 40 Teabag..."
2,B000HRS7OM,"['Made with a blend of teas from Assam, Africa...","Taylors of Harrogate Yorkshire Red, 160 Teabags"
3,B001E5DXY0,['We choose teas from our three favorite origi...,Taylors of Harrogate Yorkshire Gold Loose Leaf...
4,B004G90QE6,"['Made with a blend of teas from Assam, Africa...","Taylors of Harrogate Yorkshire Red, 240 Teabags"
...,...,...,...
9043,B01CMYK8W6,['What happens over the course of 18 months? W...,Tillamook Extra Sharp White Cheddar Cheese Age...
9044,B01DJXXR4U,"['Four simple, natural ingredients including o...",Tillamook Special Reserve Extra Sharp Cheddar ...
9045,B01FIV3SG8,['Tillamook Medium Cheddar Thick Sliced Cheese...,Tillamook Medium Cheddar Thick Sliced Cheese 1...
9046,B01FJFO2S6,['Tillamook Country Smoker Big Dad beef sticks...,Tillamook Country Smoker Big Dad Beef Stick 20...


### Data Pre-Processing

In [3]:
import import_ipynb
from ProjectModules import ColumnDropper, RowDropper, StringCleaner
from sklearn.pipeline import Pipeline

importing Jupyter notebook from ProjectModules.ipynb


In [4]:
# Report the size and the column names of the loaded product frame.
n_records, n_columns = labelled_data.shape
print(f"Number of columns: {n_columns} | Number of records= {n_records}")
print(f"Columns in labelled amazon dataset: {labelled_data.columns.tolist()}")

Number of columns: 3 | Number of records= 9048
Columns in labelled amazon dataset: ['asin', 'description', 'title']


In [5]:
# Pre-process a copy of the description column ("clean_description") so the raw
# text stays available next to it.
# StringCleaner and RowDropper come from ProjectModules; their exact rules are
# not visible in this notebook.
preprocessing_steps = [
    ("clean_string", StringCleaner(columns=['clean_description'])),
    ("drop_rows", RowDropper(columns=['clean_description'])),
]
labelled_data_preprocess_pipeline = Pipeline(preprocessing_steps)
labelled_data = labelled_data_preprocess_pipeline.fit_transform(
    labelled_data.assign(clean_description=labelled_data["description"])
)

# Summarise the frame after pre-processing and preview the first two records.
n_records, n_columns = labelled_data.shape
print(f"Number of columns: {n_columns} | Number of records= {n_records}")
print(f"Columns in amazon dataset: {labelled_data.columns.tolist()}")
labelled_data.head(2)

Number of columns: 4 | Number of records= 7963
Columns in amazon dataset: ['asin', 'description', 'title', 'clean_description']


Unnamed: 0,asin,description,title,clean_description
0,B000F3WS7K,['We choose teas from our three favorite origi...,"Taylors of Harrogate Yorkshire Gold, 40 Teabag...",we choose teas from our three favorite origins...
1,B000HRS7OM,"['Made with a blend of teas from Assam, Africa...","Taylors of Harrogate Yorkshire Red, 160 Teabags",made with a blend of teas from assam africa an...


### Keyword Extraction

In [6]:
import pickle

In [7]:
# Load the existing keyword extractor and use it to compute TF-IDF scores for
# the cleaned product descriptions.
# NOTE(security): pickle.load can execute arbitrary code from the file — only
# load an artefact you produced yourself or otherwise trust.
with open("keyword_extractor", "rb") as extractor_file:  # closes the handle after loading
    keyword_extractor = pickle.load(extractor_file)

# The return value is intentionally not kept: `product_tfidf` is taken from
# eliminate_redundant_bigram() on the next line.
keyword_extractor.compute_tfidf(labelled_data["clean_description"])
product_tfidf = keyword_extractor.eliminate_redundant_bigram()

### Keyword Mapper

In [8]:
# Load the existing keyword mapper and point it at the TF-IDF data held by the
# keyword extractor before mapping product keywords to ontology entries.
# NOTE(security): pickle.load can execute arbitrary code from the file — only
# load an artefact you produced yourself or otherwise trust.
with open('keyword_mapper', 'rb') as mapper_file:  # closes the handle after loading
    keyword_mapper = pickle.load(mapper_file)

keyword_mapper.product_tfidf = keyword_extractor.product_tfidf
# NOTE(review): the mapper attribute is `onto_tfidf` while the extractor exposes
# `ontology_tf` — confirm the differing names are intended.
keyword_mapper.onto_tfidf = keyword_extractor.ontology_tf

# Bare last expression: the mapping returned by map_keywords() is the cell output.
keyword_mapper.map_keywords()

Unnamed: 0,product_idx,onto_idx,imp_score
0,0,180,1.0
1,7,84,1.0
2,7,83,1.0
3,7,82,1.0
4,7,81,1.0
...,...,...,...
15337,7944,240,1.0
15338,7944,239,1.0
15339,7944,238,1.0
15340,7944,237,1.0


### Integrate and Aggregate Data

In [9]:
# Combine the keyword mapping with the mapper's own ontology table; the returned
# frame (mapping columns plus tag / preference / association columns) is shown
# as the cell output.
keyword_mapper.integrate_ontology(keyword_mapper.ontology_data)

Unnamed: 0,product_idx,onto_idx,imp_score,tag,preference,preference category,association,conditional,association_imp_score
0,0,180,1.0,high in antioxidant,Products rich in antioxidants,Health,1,False,1.0
1,7,84,1.0,contain milk,Vegan products,Health,-1,True,-1.0
2,7,83,1.0,contain milk,Products from companies that support animal ri...,Social,-1,False,-1.0
3,7,82,1.0,contain milk,Lactose-free products,Health,-1,True,-1.0
4,7,81,1.0,contain milk,Allergen-free products,Health,-1,True,-1.0
...,...,...,...,...,...,...,...,...,...
15337,7944,240,1.0,no milk,Vegan products,Health,1,True,1.0
15338,7944,239,1.0,no milk,Products from companies that support animal ri...,Social,1,False,1.0
15339,7944,238,1.0,no milk,Lactose-free products,Health,1,True,1.0
15340,7944,237,1.0,no milk,Allergen-free products,Health,1,True,1.0


In [10]:
# Aggregate the non-conditional associations per product and preference category.
final_mapping = keyword_mapper.aggregate_mapping(
    by="preference category",
    conditional=False,
)
# Flatten the two-level column header: keep only the second level (the category name).
final_mapping.columns = final_mapping.columns.get_level_values(1).tolist()
final_mapping

Unnamed: 0_level_0,Environment,Health,Quality,Social
product_idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,,1.000000,,
7,,,,-1.000000
14,,1.000000,,
21,,0.185802,,-0.395994
23,0.800763,0.800763,,0.334182
...,...,...,...,...
7918,,0.794614,0.303557,
7927,,0.563403,,
7928,,,,-1.000000
7944,,,,1.000000


In [11]:
# Preview products that carry a score in more than one preference category.
final_mapping.loc[final_mapping.notna().sum(axis=1) > 1].head(15)

Unnamed: 0_level_0,Environment,Health,Quality,Social
product_idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
21,,0.185802,,-0.395994
23,0.800763,0.800763,,0.334182
24,0.204943,0.340762,0.122738,
26,0.480286,0.447936,,0.267137
27,0.441434,0.441434,,0.441434
28,0.441434,0.441434,,0.441434
30,0.170049,0.211354,0.087571,0.295493
34,0.170049,0.211354,0.087571,0.295493
35,0.441434,0.441434,,0.441434
37,0.539677,0.503326,,0.503326


In [12]:
# Aggregate the conditional associations per product and individual preference.
supplementary_info = keyword_mapper.aggregate_mapping(
    by="preference",
    conditional=True,
)
# Flatten the two-level column header: keep only the second level (the preference name).
supplementary_info.columns = supplementary_info.columns.get_level_values(1).tolist()

supplementary_info.head(5)

Unnamed: 0_level_0,Allergen-free products,Gluten-free products,Lactose-free products,Vegan products,Vegetarian products
product_idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
7,-1.0,,-1.0,-1.0,
18,,1.0,,,
21,-0.295215,,,-0.395994,-0.293779
23,-0.598981,0.800763,0.800763,-0.699872,0.800763
26,-0.456056,0.400109,0.60969,-0.450642,0.447936


In [13]:
# Attach a comma-separated list of the extracted keywords to every product.
vocabulary = pd.DataFrame({"keywords": keyword_extractor.get_vocab()})

# Each non-zero TF-IDF cell is one (product row, vocabulary column) pair.
keywords = product_tfidf.tocsr().nonzero()
product_keywords = pd.DataFrame({"product_idx": keywords[0], "vocab_idx": keywords[1]})

# Resolve vocabulary positions to keyword strings and join them per product.
keywords_per_product = (
    product_keywords
    .merge(vocabulary, left_on="vocab_idx", right_index=True, how="left")
    .groupby(by="product_idx")
    .agg({"keywords": ", ".join})
)

# Drop a "keywords" column left over from an earlier run of this cell; without
# this, re-running would yield keywords_x / keywords_y instead of one column.
# NOTE(review): the index-on-index merge assumes labelled_data's index equals
# the TF-IDF row position (0..n-1) — confirm the pre-processing resets the index.
labelled_data = (
    labelled_data
    .drop(columns="keywords", errors="ignore")
    .merge(keywords_per_product, left_index=True, right_index=True, how="left")
)
labelled_data

Unnamed: 0,asin,description,title,clean_description,keywords
0,B000F3WS7K,['We choose teas from our three favorite origi...,"Taylors of Harrogate Yorkshire Gold, 40 Teabag...",we choose teas from our three favorite origins...,high antioxidant
1,B000HRS7OM,"['Made with a blend of teas from Assam, Africa...","Taylors of Harrogate Yorkshire Red, 160 Teabags",made with a blend of teas from assam africa an...,
2,B001E5DXY0,['We choose teas from our three favorite origi...,Taylors of Harrogate Yorkshire Gold Loose Leaf...,we choose teas from our three favorite origins...,
3,B004G90QE6,"['Made with a blend of teas from Assam, Africa...","Taylors of Harrogate Yorkshire Red, 240 Teabags",made with a blend of teas from assam africa an...,
4,B004M31ZR8,"[""480 Yorkshire Tea 1 Cup Tea Bags. Why do peo...",Taylors of Harrogate Yorkshire Tea 480 Count,yorkshire tea cup tea bags why do people love ...,
...,...,...,...,...,...
7958,B01CMYK8W6,['What happens over the course of 18 months? W...,Tillamook Extra Sharp White Cheddar Cheese Age...,what happens over the course of months well le...,
7959,B01DJXXR4U,"['Four simple, natural ingredients including o...",Tillamook Special Reserve Extra Sharp Cheddar ...,four simple natural ingredients including our ...,high quality
7960,B01FIV3SG8,['Tillamook Medium Cheddar Thick Sliced Cheese...,Tillamook Medium Cheddar Thick Sliced Cheese 1...,tillamook medium cheddar thick sliced cheese o...,
7961,B01FJFO2S6,['Tillamook Country Smoker Big Dad beef sticks...,Tillamook Country Smoker Big Dad Beef Stick 20...,tillamook country smoker big dad beef sticks a...,


In [14]:
# Join the per-category scores and the per-preference (conditional) scores onto
# the product table; left joins keep products that have no mapped keywords.
integrated_data = (
    labelled_data
    .merge(final_mapping, left_index=True, right_index=True, how="left")
    .merge(supplementary_info, left_index=True, right_index=True, how="left")
)
# Fixed seed so the preview shows the same rows on every Restart & Run All.
integrated_data.sample(5, random_state=42)

Unnamed: 0,asin,description,title,clean_description,keywords,Environment,Health,Quality,Social,Allergen-free products,Gluten-free products,Lactose-free products,Vegan products,Vegetarian products
7726,B015VKS8CA,"['', 'Take Your Favorite Fish to Another Level...",ORGANIC Gourmet No. 19 Salmon Blend &ndash; Ju...,take your favorite fish to another level the p...,,,,,,,,,,
2871,B01BNILYSE,['Now you can enjoy our signature Breyers Vani...,"Breyers Ice Cream, Natural Vanilla Snack Cups,...",now you can enjoy our signature breyers vanill...,"natural flavor, non gmo, rainforest alliance, ...",0.264085,0.243941,,0.590057,,,,,
4027,B00514FGJU,"[""Cascadian Farm Organic Sweet and Salty Chewy...",Cascadian Farm Organic Sweet and Salty Chewy G...,cascadian farm organic sweet and salty chewy g...,"contain soy, natural flavor, organic food",0.760571,0.426981,,,-0.173569,,,,
5753,B00E1YZUH4,['Nutmeg is the kernel of a fruit resembling a...,"Badia Baking Nutmeg Whole, 16 Ounce",nutmeg is the kernel of a fruit resembling an ...,,,,,,,,,,
2989,B0083FX61Y,"['All Natural, No MSG Added, Liquid Miso', 'Be...","Hikari Minute Miso, 10-Ounce",all natural no msg added liquid miso be carefu...,,,,,,,,,,


In [15]:
# Inspect one product end to end: title, cleaned description, the keywords
# found for it and the ontology rows those keywords were mapped to.
sample_index = 7544
sample_product = labelled_data.iloc[sample_index]
print(f"Sample Index: {sample_index}")
print(f"Product details for the considered sample: \nTitle: {sample_product['title']}")
print(f"Description: {sample_product['clean_description']}")

# Vocabulary positions stored for this product's TF-IDF row.
sample_vocab_positions = product_tfidf[sample_index].indices
print(f"Keyword Identified: {keyword_extractor.get_vocab()[sample_vocab_positions]}")

# Ontology rows referenced by the mapping entries of this product.
is_sample_product = keyword_mapper.mapping["product_idx"] == sample_index
sample_onto_positions = keyword_mapper.mapping.loc[is_sample_product, "onto_idx"]
keyword_mapper.ontology_data.iloc[sample_onto_positions]

Sample Index: 7544
Product details for the considered sample: 
Title: Gourmet Chocolate No Bake Cookies 24 Hand-Made, Hand-Wrapped Gift Boxed, Made to Order, Fresh, FREE Expedited Shipping, Perfect for Holidays, Gifts, Showers, Special Occasions, No Occasion (24 Non Gift Boxed Cookies)
Description: i have been making these chocolate no bake cookies for years after raving over them for years our family and friends finally convinced us to make them available to the public so after much consideration we decided to go for it we will make your cookies fresh and carefully hand wrap each one each cookie is hand dipped into our molds which gives them the unique shape we use only quality ingredients including real butter raw sugar real pure vanilla for those with food allergies please be aware that there are both dairy and peanut product in our cookies there are no preservatives or funky additives in our cookies they are made fresh to enjoy now they are rich and smooth and oh so chocolaty it tr

Unnamed: 0_level_0,tag,preference,preference category,association,conditional
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
26,allergen peanut product,High-protein products,Health,1,False
25,allergen peanut product,Allergen-free products,Health,-1,True
245,no preservative,Products without preservatives,Health,1,False
243,no milk beside additive,Vegan products,Health,1,True
242,no milk beside additive,Products from companies that support animal ri...,Social,1,False
241,no milk beside additive,Lactose-free products,Health,1,True
236,no meat beside additive,Vegetarian products,Health,1,True
235,no meat beside additive,Vegan products,Health,1,True
234,no meat beside additive,Products from companies that support animal ri...,Social,1,False
232,no gluten beside additive,Gluten-free products,Health,1,True


In [16]:
# Second worked example: same inspection as for the previous sample, for a
# different product.
sample_index = 479
sample_product = labelled_data.iloc[sample_index]
print(f"Sample Index: {sample_index}")
print(f"Product details for the considered sample: \nTitle: {sample_product['title']}")
print(f"Description: {sample_product['clean_description']}")

# Vocabulary positions stored for this product's TF-IDF row.
sample_vocab_positions = product_tfidf[sample_index].indices
print(f"Keyword Identified: {keyword_extractor.get_vocab()[sample_vocab_positions]}")

# Ontology rows referenced by the mapping entries of this product.
is_sample_product = keyword_mapper.mapping["product_idx"] == sample_index
sample_onto_positions = keyword_mapper.mapping.loc[is_sample_product, "onto_idx"]
keyword_mapper.ontology_data.iloc[sample_onto_positions]

Sample Index: 479
Product details for the considered sample: 
Title: Simply Organic Red Pepper Crushed Certified Organic, 0.42-Ounce (Pack of 6)
Description: simply organic red pepper crushed comes in ounce containers a must for mexican and african cooking chili flakes will liven up any dish use sparingly in marinara and pizza sauces and on grilled vegetables and meats chilies can be irritating to the eyes and skin so handle with caution all information is provided on a per serving basis for the unprepared product frontier handles milk and egg products and does not test finished goods for the presence of allergens unless otherwise specified as certified organic crushed red pepper statements regarding dietary supplements have not been evaluated by the fda and are not intended to diagnose treat cure or prevent any disease or health condition
Keyword Identified: ['certified organic' 'egg product' 'milk product' 'not test']


Unnamed: 0_level_0,tag,preference,preference category,association,conditional
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
192,isnt proven not to test on animal,Products from companies that support animal ri...,Social,-1,False
17,allergen milk of mammal and dairy product incl...,Allergen-free products,Health,-1,True
6,allergen egg of poultry product,Allergen-free products,Health,-1,True
49,certified organic,Sustainably farmed products,Environment,1,False
48,certified organic,Organic Product,Health,1,False
47,certified organic,Products evaluated with auditing processes tha...,Social,1,False


In [18]:
# Report the (rows, columns) shape of the integrated product table.
print(f"Final Shape of the Data = {integrated_data.shape}")

Final Shape of the Data = (7963, 14)
