In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Raw waste dataset, read from a path relative to the notebook's working directory.
RAW_WASTE_CSV = '2003_2017_waste.csv'
df = pd.read_csv(RAW_WASTE_CSV)

In [None]:
# Preview the first rows to see the column layout and typical values.
df.head()

Unnamed: 0,waste_type,waste_disposed_of_tonne,total_waste_recycled_tonne,total_waste_generated_tonne,recycling_rate,year
0,Food,679900,111100.0,791000,0.14,2016
1,Paper/Cardboard,576000,607100.0,1183100,0.51,2016
2,Plastics,762700,59500.0,822200,0.07,2016
3,C&D,9700,1585700.0,1595400,0.99,2016
4,Horticultural waste,111500,209000.0,320500,0.65,2016


In [None]:
# (rows, columns) of the raw frame, used as the baseline before any rows are dropped.
df.shape

(225, 6)

In [None]:
# Column dtypes and non-null counts of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 225 entries, 0 to 224
Data columns (total 6 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   waste_type                   225 non-null    object 
 1   waste_disposed_of_tonne      225 non-null    int64  
 2   total_waste_recycled_tonne   225 non-null    float64
 3   total_waste_generated_tonne  225 non-null    int64  
 4   recycling_rate               225 non-null    float64
 5   year                         225 non-null    int64  
dtypes: float64(2), int64(3), object(1)
memory usage: 10.7+ KB


In [None]:
# Duplicated data: count rows that are exact copies of an earlier row.
df.duplicated().sum()

np.int64(0)

In [None]:
# Missing data: number of null values in each column.
df.isna().sum()

Unnamed: 0,0
waste_type,0
waste_disposed_of_tonne,0
total_waste_recycled_tonne,0
total_waste_generated_tonne,0
recycling_rate,0
year,0


In [None]:
# Show the distinct values of every object-dtype column so inconsistent
# spellings of the same label are easy to spot.
object_columns = [c for c in df.columns if df[c].dtype == 'object']
for col in object_columns:
  print(f'{col}: {df[col].unique()}')

waste_type: ['Food' 'Paper/Cardboard' 'Plastics' 'C&D' 'Horticultural waste' 'Wood'
 'Ferrous metal' 'Non-ferrous metal' 'Used slag' 'Ash & Sludge' 'Glass'
 'Textile/Leather' 'Scrap tyres' 'Others (stones, ceramics & rubber etc.)'
 'Total' 'Others (stones, ceramics & rubber etc)' 'Food waste'
 'Construction Debris' 'Wood/Timber' 'Horticultural Waste' 'Ferrous Metal'
 'Non-ferrous Metals' 'Used Slag' 'Sludge' 'Scrap Tyres' 'Ferrous Metals'
 'Others' 'Construction debris' 'Non-ferrous metals' 'Ash and sludge'
 'Plastic' 'Others (stones, ceramic, rubber, etc.)']


In [None]:
# --- Mapping dictionary to standardize waste types ---
# Each raw label seen in `waste_type` maps to one canonical category name.
waste_type_mapping = {
    # Food
    'Food': 'Food',
    'Food waste': 'Food',

    # Paper & Cardboard
    'Paper/Cardboard': 'Paper/Cardboard',

    # Plastics
    'Plastics': 'Plastics',
    'Plastic': 'Plastics',

    # Construction & Demolition
    'C&D': 'Construction & Demolition',
    'Construction Debris': 'Construction & Demolition',
    'Construction debris': 'Construction & Demolition',

    # Horticultural Waste
    'Horticultural waste': 'Horticultural Waste',
    'Horticultural Waste': 'Horticultural Waste',

    # Wood
    'Wood': 'Wood',
    'Wood/Timber': 'Wood',

    # Ferrous Metal
    'Ferrous metal': 'Ferrous Metal',
    'Ferrous Metal': 'Ferrous Metal',
    'Ferrous Metals': 'Ferrous Metal',

    # Non-ferrous Metal
    'Non-ferrous metal': 'Non-ferrous Metal',
    'Non-ferrous Metals': 'Non-ferrous Metal',
    'Non-ferrous metals': 'Non-ferrous Metal',

    # Used Slag
    'Used slag': 'Used Slag',
    'Used Slag': 'Used Slag',

    # Ash & Sludge
    # NOTE(review): plain 'Sludge' is folded into 'Ash & Sludge' -- confirm the
    # source data reports these as the same category.
    'Ash & Sludge': 'Ash & Sludge',
    'Ash and sludge': 'Ash & Sludge',
    'Sludge': 'Ash & Sludge',

    # Glass
    'Glass': 'Glass',

    # Textile / Leather
    'Textile/Leather': 'Textile/Leather',

    # Scrap Tyres
    'Scrap tyres': 'Scrap Tyres',
    'Scrap Tyres': 'Scrap Tyres',

    # Others
    'Others (stones, ceramics & rubber etc.)': 'Others',
    'Others (stones, ceramics & rubber etc)': 'Others',
    'Others (stones, ceramic, rubber, etc.)': 'Others',
    'Others': 'Others',

    # Total: kept under its own label so those rows can be filtered out below
    'Total': 'Total'
}

# --- Apply category cleaning ---
# Series.map leaves NaN for any label missing from the dictionary.
df['waste_type_clean'] = df['waste_type'].map(waste_type_mapping)

# --- Check if any values were not mapped ---
# Fail loudly instead of letting NaN categories flow into later cells.
unmapped = df.loc[df['waste_type_clean'].isna(), 'waste_type'].unique()
print("Unmapped values:", unmapped)
assert len(unmapped) == 0, (
    f"Add these labels to waste_type_mapping: {list(unmapped)}"
)

# --- Drop rows where waste type is 'Total' and renumber the index ---
df = df[df['waste_type_clean'] != 'Total'].reset_index(drop=True)

# --- Preview cleaned data ---
df.head()

Unmapped values: []
            waste_type  waste_disposed_of_tonne  total_waste_recycled_tonne  \
0                 Food                   679900                    111100.0   
1      Paper/Cardboard                   576000                    607100.0   
2             Plastics                   762700                     59500.0   
3                  C&D                     9700                   1585700.0   
4  Horticultural waste                   111500                    209000.0   

   total_waste_generated_tonne  recycling_rate  year  \
0                       791000            0.14  2016   
1                      1183100            0.51  2016   
2                       822200            0.07  2016   
3                      1595400            0.99  2016   
4                       320500            0.65  2016   

            waste_type_clean  
0                       Food  
1            Paper/Cardboard  
2                   Plastics  
3  Construction & Demolition  
4        Hort

In [None]:
# Re-inspect after cleaning: check the row count now that the 'Total' rows are
# removed, and that waste_type_clean has no nulls.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 210 entries, 0 to 209
Data columns (total 7 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   waste_type                   210 non-null    object 
 1   waste_disposed_of_tonne      210 non-null    int64  
 2   total_waste_recycled_tonne   210 non-null    float64
 3   total_waste_generated_tonne  210 non-null    int64  
 4   recycling_rate               210 non-null    float64
 5   year                         210 non-null    int64  
 6   waste_type_clean             210 non-null    object 
dtypes: float64(2), int64(3), object(2)
memory usage: 11.6+ KB


In [None]:
# Drop the original waste_type column now that waste_type_clean replaces it.
# Rebind the result instead of using inplace=True: it is the recommended pandas
# style and does not mutate the frame object in place.
df = df.drop(columns='waste_type')

In [None]:
# Re-list the distinct values of the object-dtype columns to confirm each
# category now has a single spelling.
object_columns = [c for c in df.columns if df[c].dtype == 'object']
for col in object_columns:
  print(f'{col}: {df[col].unique()}')

waste_type_clean: ['Food' 'Paper/Cardboard' 'Plastics' 'Construction & Demolition'
 'Horticultural Waste' 'Wood' 'Ferrous Metal' 'Non-ferrous Metal'
 'Used Slag' 'Ash & Sludge' 'Glass' 'Textile/Leather' 'Scrap Tyres'
 'Others']


In [None]:
# Disposal ratio: fraction of each row's generated tonnage that was disposed of.
disposed_tonnes = df['waste_disposed_of_tonne']
generated_tonnes = df['total_waste_generated_tonne']
df['disposal_ratio'] = disposed_tonnes / generated_tonnes

In [None]:
# Recycling efficiency: fraction of each row's generated tonnage that was recycled.
# NOTE(review): this looks like the unrounded counterpart of recycling_rate
# (e.g. 111100 / 791000 is about 0.14) -- confirm both columns are needed.
recycled_tonnes = df['total_waste_recycled_tonne']
total_tonnes = df['total_waste_generated_tonne']
df['recycling_efficiency'] = recycled_tonnes / total_tonnes

In [None]:
# Total number of missing cells in the frame, including the two ratio columns just added.
df.isna().sum().sum()

np.int64(0)

In [None]:
# Parse the integer year with the '%Y' format so the column becomes datetime64.
year_as_timestamp = pd.to_datetime(df['year'], format='%Y')
df['year'] = year_as_timestamp

In [None]:
# Final schema check before export: dtypes and non-null counts of every column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 210 entries, 0 to 209
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   waste_disposed_of_tonne      210 non-null    int64         
 1   total_waste_recycled_tonne   210 non-null    float64       
 2   total_waste_generated_tonne  210 non-null    int64         
 3   recycling_rate               210 non-null    float64       
 4   year                         210 non-null    datetime64[ns]
 5   waste_type_clean             210 non-null    object        
 6   disposal_ratio               210 non-null    float64       
 7   recycling_efficiency         210 non-null    float64       
dtypes: datetime64[ns](1), float64(4), int64(2), object(1)
memory usage: 13.3+ KB


In [None]:
# Write the cleaned frame to disk; index=False leaves the row index out of the file.
CLEANED_WASTE_CSV = 'cleaned_waste_data.csv'
df.to_csv(CLEANED_WASTE_CSV, index=False)