# Waste Class Label Hierarchy

This notebook contains the process of determining the label hierarchy for the Recycle This image classification system.

---

---

### Imports & Config

In [1]:
# === Initial Imports === #
import pandas as pd
import numpy as np
import seaborn
import janitor

import os
from collections import Counter

In [2]:
# === Raise pandas' display limits so wide/long frames are shown in full === #
for display_option in ("display.max_rows", "display.max_columns"):
    pd.set_option(display_option, 200)

---

## Trends Data

In [4]:
# === Read the category/trends export into a dataframe and preview it === #
trends_csv_path = "lists_export/recycle-pytrends.csv"
df_trends = pd.read_csv(trends_csv_path)
df_trends.head()

Unnamed: 0,date,recycle #1 Plastic,recycle #2 Plastic,recycle #4 Plastic,recycle #5 Plastic,recycle #6 Plastic,recycle #7 Plastic,recycle Acids,recycle Aerosol Cans - Full,recycle Alkaline Batteries,recycle Aluminum Foil,recycle Antifreeze,recycle Asphalt,recycle Auto Parts,recycle Branches,recycle Brick,recycle Brush,recycle CD Cases,recycle CDs,recycle Cables,recycle Candy Wrappers,recycle Car Batteries,recycle Carpet,recycle Cassette Tapes,recycle Cell Phones,recycle Chip Bags,recycle Christmas Trees,recycle Cigarettes,recycle Clothing,recycle Concrete,recycle Construction Materials,recycle Cooking Oil,recycle Cookware,recycle Corks,recycle Corrugated Cardboard,recycle Crayons,recycle DVDs,recycle Desktop Computers,recycle Dirt,recycle Doors,recycle Envelopes,recycle Eyeglasses,recycle Fabric,recycle Fire Extinguishers,recycle Floppy Disks,recycle Fluorescent Tubes,recycle Gasoline,recycle Gift Bags,recycle Hair,recycle Halogen Bulbs,recycle Hard Drives,recycle Hardware,recycle Heaters,recycle Inkjet Cartridges,recycle Laptop Computers,recycle Latex Paint,recycle Leaves,recycle Lithium Batteries,recycle Lumber,recycle Magazines,recycle Manure,recycle Mattresses,recycle Medications,recycle Microwaves,recycle Mixed Paper,recycle Motor Oil,recycle Nail Polish,recycle Newspaper,recycle Office Furniture,recycle Office Paper,recycle Oil Filters,recycle Pallets,recycle Paper Cups,recycle Paperback Books,recycle Paperboard,recycle Phone Books,recycle Pizza Boxes,recycle Plastic Cards,recycle Plastic Furniture,recycle Printers,recycle Propane Tanks,recycle Refrigerators,recycle Sand,recycle Shoes,recycle Small Appliances,recycle Smoke Detectors,recycle Steel Cans,recycle Stone,recycle Tablets,recycle Tennis Balls,recycle Tile,recycle Tires,recycle Toner Cartridges,recycle Tools,recycle Toothbrushes,recycle Toothpaste Tubes,recycle Toys,recycle Transmission Fluid,recycle Trophies,recycle Vehicles,recycle Video Tapes,recycle Water Filters,recycle Windows,recycle 
Wood,recycle Yard Waste
0,2015-01-25,0,34,0,52,58,39,0,30,30,20,45,25,50,0,27,18,18,39,24,0,21,55,12,48,5,4,4,23,31,23,15,0,0,15,0,32,21,0,35,20,20,27,17,0,23,37,0,56,19,31,62,0,31,40,25,25,21,31,91,0,17,26,32,0,28,0,43,0,0,40,18,36,41,0,13,20,0,48,34,14,22,32,24,9,9,19,19,0,0,18,42,58,25,19,0,33,17,17,0,0,1,77,28,24
1,2015-02-01,0,0,69,35,78,39,39,41,41,20,36,25,30,18,0,18,12,43,60,24,35,90,18,52,4,4,7,47,39,0,62,0,31,0,58,19,0,0,21,20,13,33,17,0,35,0,0,73,19,19,28,38,0,0,25,25,32,32,34,17,34,0,32,0,17,16,65,30,40,27,30,36,42,42,13,20,0,48,43,14,0,0,60,9,9,19,57,47,9,12,37,67,13,0,0,25,34,17,35,69,1,68,68,0
2,2015-02-08,0,0,0,69,0,0,0,20,47,0,27,42,40,0,25,18,0,39,0,0,21,49,12,48,0,0,0,32,27,23,31,0,16,0,19,19,0,21,0,40,40,27,17,17,17,0,0,76,19,19,31,38,32,41,0,50,21,21,40,0,17,0,0,0,26,16,38,0,54,27,36,18,83,0,0,0,48,0,39,29,22,43,21,14,0,0,19,23,0,18,33,46,12,12,0,39,17,0,0,34,1,62,58,49
3,2015-02-15,35,0,0,0,80,80,0,0,31,20,37,77,41,0,37,19,18,52,48,0,42,35,12,61,4,4,4,33,47,0,40,0,16,0,39,26,21,21,22,20,14,20,18,18,35,0,0,29,19,25,32,57,0,0,34,0,29,21,47,0,18,26,66,0,17,25,49,31,41,0,18,18,42,0,13,27,49,49,29,15,0,33,55,18,18,19,26,24,0,12,37,89,21,0,0,62,0,25,35,35,1,71,50,0
4,2015-02-22,0,0,0,0,38,0,0,20,40,60,27,25,40,0,28,18,12,67,24,24,28,62,18,64,3,0,0,44,31,0,23,0,15,0,38,29,0,0,42,40,27,26,0,0,29,0,0,67,0,19,50,37,0,40,0,25,21,21,51,0,23,26,64,0,17,16,53,30,53,27,36,0,0,41,26,20,0,48,48,58,22,43,45,9,18,0,19,23,0,0,21,66,12,0,0,50,17,0,35,34,1,66,56,25


In [5]:
# === Get list of categories === #
# Every trends column is named "recycle <label>". Name the query word once
# instead of slicing by a hard-coded character count.
query_prefix = "recycle "

label_list = df_trends.columns.to_list()  # Get Python list of column names
# Drop the "date" column by name rather than by position
label_list = [x for x in label_list if x != "date"]
# Remove the "recycle" query word; names without the prefix are left untouched
label_list = [
    x[len(query_prefix):] if x.startswith(query_prefix) else x
    for x in label_list
]

In [6]:
# Display the cleaned label names
label_list

['#1 Plastic',
 '#2 Plastic',
 '#4 Plastic',
 '#5 Plastic',
 '#6 Plastic',
 '#7 Plastic',
 'Acids',
 'Aerosol Cans - Full',
 'Alkaline Batteries',
 'Aluminum Foil',
 'Antifreeze',
 'Asphalt',
 'Auto Parts',
 'Branches',
 'Brick',
 'Brush',
 'CD Cases',
 'CDs',
 'Cables',
 'Candy Wrappers',
 'Car Batteries',
 'Carpet',
 'Cassette Tapes',
 'Cell Phones',
 'Chip Bags',
 'Christmas Trees',
 'Cigarettes',
 'Clothing',
 'Concrete',
 'Construction Materials',
 'Cooking Oil',
 'Cookware',
 'Corks',
 'Corrugated Cardboard',
 'Crayons',
 'DVDs',
 'Desktop Computers',
 'Dirt',
 'Doors',
 'Envelopes',
 'Eyeglasses',
 'Fabric',
 'Fire Extinguishers',
 'Floppy Disks',
 'Fluorescent Tubes',
 'Gasoline',
 'Gift Bags',
 'Hair',
 'Halogen Bulbs',
 'Hard Drives',
 'Hardware',
 'Heaters',
 'Inkjet Cartridges',
 'Laptop Computers',
 'Latex Paint',
 'Leaves',
 'Lithium Batteries',
 'Lumber',
 'Magazines',
 'Manure',
 'Mattresses',
 'Medications',
 'Microwaves',
 'Mixed Paper',
 'Motor Oil',
 'Nail Polish',


In [7]:
# === Wrap the labels in a Series ready for export === #
label_series = pd.Series(data=label_list)

In [32]:
# === Export labels to a text file === #
# One label per line. Written directly instead of through Series.to_csv so the
# file contains only the labels: recent pandas versions add a header row for
# the Series by default, and this avoids passing a newline as a CSV delimiter.
with open("recycle-pytrends.txt", "w", encoding="utf-8") as label_file:
    label_file.writelines(f"{label}\n" for label in label_series)

---

## Tallying the Votes

In addition to the list above, which includes only the items that have Google Trends data, several team members manually pruned the full list down to the items they thought would be useful. The code below tallies all of the votes to find the items most commonly chosen across the manual and trends lists.

> Note: we did not end up using this method to choose the pruned list. See below.

In [9]:
# === Counts the number of "votes" for each label === #
# (Counter and os come from the imports cell at the top of the notebook.)
items_count = Counter()  # Votes per item across all lists
items_lists = {}  # Dict to hold the items from each list, keyed by file name

# Directory containing manual lists
lists_manual_dir = "lists_manual"

# Load each file, adding each item within to the counter.
# Sorted so the tally (and the order of tied items) does not depend on the
# arbitrary order in which os.listdir returns entries.
for f in sorted(os.listdir(lists_manual_dir)):
    list_path = os.path.join(lists_manual_dir, f)
    if f.startswith(".") or not os.path.isfile(list_path):
        continue  # Skip hidden files and sub-directories

    with open(list_path, "r") as cf:
        # Strip whitespace and drop blank lines so they are not counted as items
        items = [line.strip() for line in cf if line.strip()]

    items_lists[f] = items
    items_count.update(items)  # One vote per occurrence

In [10]:
# === Look at the top 100 labels === #
# most_common(100) returns up to 100 (item, count) pairs, highest count first
items_count.most_common(100)

[('Aerosol Cans - Full', 6),
 ('Alkaline Batteries', 6),
 ('Aluminum Foil', 6),
 ('CDs', 6),
 ('Cables', 6),
 ('Car Batteries', 6),
 ('Cell Phones', 6),
 ('Crayons', 6),
 ('Inkjet Cartridges', 6),
 ('Lithium Batteries', 6),
 ('Medications', 6),
 ('Mixed Paper', 6),
 ('Propane Tanks', 6),
 ('Toothbrushes', 6),
 ('Aluminum Beverage Cans', 5),
 ('Ammunition', 5),
 ('Cookware', 5),
 ('Envelopes', 5),
 ('Eyeglasses', 5),
 ('Fabric', 5),
 ('Fire Extinguishers', 5),
 ('Floppy Disks', 5),
 ('Halogen Bulbs', 5),
 ('Incandescent Lightbulbs', 5),
 ('Mattresses', 5),
 ('Paper Cups', 5),
 ('Pizza Boxes', 5),
 ('Shoes', 5),
 ('Tires', 5),
 ('Toothpaste Tubes', 5),
 ('Desktop Computers', 5),
 ('Laptop Computers', 5),
 ('Newspaper', 5),
 ('Printers', 5),
 ('Aluminum Food Cans', 4),
 ('Brown Glass Containers', 4),
 ('Button Cell Batteries', 4),
 ('Candy Wrappers', 4),
 ('Carpet', 4),
 ('Cassette Tapes', 4),
 ('Chip Bags', 4),
 ('Cigarettes', 4),
 ('Cooking Oil', 4),
 ('Corks', 4),
 ('Household Furnitur

In [11]:
# === Turn the vote Counter into a Series (index = item, value = votes) === #
items_count_dict = {label: votes for label, votes in items_count.items()}
items_series = pd.Series(items_count_dict)

In [12]:
# === Save the vote counts to a text file === #
# NOTE(review): with sep="\n" each item name and its count are separated by a
# newline rather than a comma - confirm that layout is intended.
count_result_path = "count_result.txt"
items_series.to_csv(count_result_path, sep="\n")

---

## List of All Earth911 Items

The file `items_list_groups.txt` contains a list of all the items and their parent "family".

In [13]:
# === Read the item/category list (a comma-separated text file) === #
items_list_path = "lists_export/items_list_groups.txt"
df_all = pd.read_csv(items_list_path)

In [14]:
# Scope it out: preview the first rows of the full items list
df_all.head()

Unnamed: 0,category,item
0,Automotive,Antifreeze
1,Automotive,Auto Bodies
2,Automotive,Auto Parts
3,Automotive,Bike Tires
4,Automotive,Brake Fluid


In [16]:
# === Items List Tallies === #
# Add one 0/1 column per list file to a copy of df_all:
# 1 where the item appears in that list, 0 otherwise.

# Copy dataframe to make cell re-runnable
df2 = df_all.copy()

# Sorted so the column order does not depend on os.listdir's arbitrary order
for f in sorted(os.listdir(lists_manual_dir)):
    list_path = os.path.join(lists_manual_dir, f)
    if f.startswith(".") or not os.path.isfile(list_path):
        continue  # Skip hidden files and sub-directories

    # Column header is the file name minus its first 9 and last 4 characters.
    # NOTE(review): presumably a fixed 9-character prefix and a 4-character
    # extension such as ".txt" - confirm against the files in lists_manual.
    person_name = f[9:-4]

    with open(list_path, "r") as cf:
        listed_items = {line.strip() for line in cf}

    # One vectorized membership test per list replaces a full-frame
    # update for every single line of every file.
    df2[person_name] = df2["item"].isin(listed_items).astype("int64")

In [17]:
# === Put the identifying columns first, then one column per list === #
the_new_order = ["item", "category"]  # Like Star Wars
the_new_order += ["pytrends", "colin", "tobias", "timothy", "trevor", "vera"]

# reorder_columns is not a core pandas method; presumably registered by the
# `janitor` import at the top of the notebook.
df2 = df2.reorder_columns(the_new_order)

In [55]:
# === Persist the tallies (without the row index) as csv for posterity === #
tallies_csv_path = "item_groups_tallies.csv"
df2.to_csv(tallies_csv_path, index=False)

In [18]:
# === Look at the whole dataframe === #
# Raise the row limit so the 361 rows requested below are not truncated
pd.set_option("display.max_rows", 400)
df2.head(361)

Unnamed: 0,item,category,pytrends,colin,tobias,timothy,trevor,vera
0,Antifreeze,Automotive,1,1,0,0,1,1
1,Auto Bodies,Automotive,0,1,0,0,0,0
2,Auto Parts,Automotive,1,1,0,0,1,1
3,Bike Tires,Automotive,0,1,0,1,0,0
4,Brake Fluid,Automotive,0,1,0,0,1,0
5,Car Batteries,Automotive,1,1,1,1,1,1
6,Car Fluids,Automotive,0,1,0,0,1,0
7,Engine Degreasers,Automotive,0,0,0,0,0,0
8,Gas/Oil Mixture,Automotive,0,1,0,0,0,0
9,Gasoline,Automotive,1,1,0,0,0,0


In [19]:
# === New column from sum of human votes === #
# Row-wise total of the five manual list columns (pytrends is not included)
manual_voters = ["colin", "tobias", "timothy", "trevor", "vera"]
df2["manual_sum"] = df2[manual_voters].sum(axis=1)

In [20]:
# Preview df2 with the new manual_sum column
df2.head()

Unnamed: 0,item,category,pytrends,colin,tobias,timothy,trevor,vera,manual_sum
0,Antifreeze,Automotive,1,1,0,0,1,1,3
1,Auto Bodies,Automotive,0,1,0,0,0,0,1
2,Auto Parts,Automotive,1,1,0,0,1,1,3
3,Bike Tires,Automotive,0,1,0,1,0,0,2
4,Brake Fluid,Automotive,0,1,0,0,1,0,2


In [21]:
# === Filter dataframe to get pruned list === #
# Keep an item when it has trends data (pytrends > 0) OR at least 4 manual votes
has_trends = df2["pytrends"] > 0
has_manual_support = df2["manual_sum"] >= 4
df3 = df2[has_trends | has_manual_support]

In [22]:
# The result is 144 items - much more manageable than 294
# (.shape is a (rows, columns) tuple)
df3.shape

(144, 9)

In [24]:
# Preview the first 64 rows of the pruned list
df3.head(64)

Unnamed: 0,item,category,pytrends,colin,tobias,timothy,trevor,vera,manual_sum
0,Antifreeze,Automotive,1,1,0,0,1,1,3
2,Auto Parts,Automotive,1,1,0,0,1,1,3
5,Car Batteries,Automotive,1,1,1,1,1,1,5
9,Gasoline,Automotive,1,1,0,0,0,0,1
13,Motor Oil,Automotive,1,1,0,0,1,1,3
15,Oil Filters,Automotive,1,0,1,0,0,1,2
19,Tires,Automotive,1,1,1,0,1,1,4
20,Transmission Fluid,Automotive,1,0,0,0,0,0,0
22,Vehicles,Automotive,1,1,0,0,0,0,1
23,Alkaline Batteries,Batteries,1,1,1,1,1,1,5


In [25]:
# === Build one "category,item" string per pruned row === #
category_prefix = df3["category"] + ","
df4 = category_prefix + df3["item"]
df4.head()

0        Automotive,Antifreeze
2        Automotive,Auto Parts
5     Automotive,Car Batteries
9          Automotive,Gasoline
13        Automotive,Motor Oil
dtype: object

In [82]:
# Once again saving as text file, as it's easier for others to view.
# Write the two columns straight from df3 rather than the pre-joined strings:
# every joined string contains a comma, so the CSV writer would wrap each line
# in quotes, and recent pandas versions would add a "0" header row for the
# unnamed Series. This writes a `category,item` header and one row per item.
df3[["category", "item"]].to_csv("items_list_v2.txt", index=False)