### Scraped Data Cleaning

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the raw scraped table; at this point the columns still carry the labels 0-8.
df = pd.read_csv("out.csv")

In [2]:
# Preview the first rows of the raw table.
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,,,,,,,,,
1,Agricultural services,Pilot,WET,BOD5,694.4,---,---,84.0,"University College Dublin, 2011"
2,Agricultural services,Pilot,WET,Chemical oxygen demand,1297.5,---,---,84.0,"University College Dublin, 2011"
3,Agricultural services,Pilot,WET,"Nitrogen, total",221.9,---,---,78.0,"University College Dublin, 2011"
4,Agricultural services,Pilot,WET,Phosphorus,32.8,---,---,97.0,"University College Dublin, 2011"


*Each time Selenium copied a new table, it also wrote a row in which every cell is empty (NaN).*
*We therefore drop these all-empty rows with the following command:*

In [3]:
# Drop only the rows in which *every* cell is NaN (the scraper's table separators).
# Rebinding the returned frame avoids `inplace=True`, which pandas discourages.
# The original index labels are kept, so gaps remain where rows were removed.
df = df.dropna(how="all")

In [6]:
# Count the remaining missing values per column.
df.isna().sum()

0    0
1    0
2    0
3    0
4    0
5    0
6    0
7    0
8    0
dtype: int64

In [7]:
# Inspect row count and dtypes; every column is still dtype object at this stage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1707 entries, 1 to 1741
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       1707 non-null   object
 1   1       1707 non-null   object
 2   2       1707 non-null   object
 3   3       1707 non-null   object
 4   4       1707 non-null   object
 5   5       1707 non-null   object
 6   6       1707 non-null   object
 7   7       1707 non-null   object
 8   8       1707 non-null   object
dtypes: object(9)
memory usage: 133.4+ KB


In [8]:
# Descriptive headers for the nine scraped columns, listed in positional order.
columns = [
    "Industry",
    "Scale",
    "Treatment Train",
    "Parameter",
    "Influent",
    "Effluent",
    "Units",
    "%Removal",
    "Article",
]
df.columns = columns

In [9]:
# Confirm that the descriptive column names were applied.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1707 entries, 1 to 1741
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Industry         1707 non-null   object
 1   Scale            1707 non-null   object
 2   Treatment Train  1707 non-null   object
 3   Parameter        1707 non-null   object
 4   Influent         1707 non-null   object
 5   Effluent         1707 non-null   object
 6   Units            1707 non-null   object
 7   %Removal         1707 non-null   object
 8   Article          1707 non-null   object
dtypes: object(9)
memory usage: 133.4+ KB


In [15]:
# The rename does not touch dtypes: all nine columns are still object.
df.dtypes

Industry           object
Scale              object
Treatment Train    object
Parameter          object
Influent           object
Effluent           object
Units              object
%Removal           object
Article            object
dtype: object

In [46]:
import re

In [53]:
# Scratch check of the regex idea on a "less than" reading:
# capture group 1 should contain only the digits that follow the '<'.
value = "< 466"
print(re.match(r'<\s*(\d+(\.\d+)?)', value.strip())[1])

466


In [57]:
_TRAILING_NUMBER = re.compile(
    r"""
    (?<![\d.])                    # never start in the middle of a longer number
    (
        (?:(?<![\w.])-)?          # minus sign, but not a hyphen glued to a word/number (10-20)
        (?:\d+(?:\.\d+)?|\.\d+)   # 12 | 12.5 | .5
        (?:[eE][+-]?\d+)?         # optional exponent, e.g. 1.2E+06
    )
    \s*$                          # only trailing whitespace may follow the number
    """,
    re.VERBOSE,
)


def extract_number(value):
    """Return the number that ends a scraped cell, or the cell itself if there is none.

    The cell is converted with ``str()`` and searched for a number anchored at the
    end of the text, so a prefix such as ``'< '`` is ignored (``'< 466'`` -> 466.0).

    Compared with a plain ``\\d+(\\.\\d+)?`` search this keeps information that would
    otherwise be silently lost:

    * a leading minus sign (``'-12.5'`` -> -12.5, not 12.5);
    * a leading decimal point (``'.5'`` -> 0.5, not 5.0);
    * scientific notation (``'1.2E+06'`` -> 1200000.0, not 6.0), which also matters
      for float inputs whose ``str()`` form uses an exponent (``1e-05``).

    A hyphen directly after a digit or letter is treated as a separator, so a range
    such as ``'10-20'`` still yields its upper bound, 20.0.

    NOTE: a comma is not understood as part of a number; ``'1,234'`` yields 234.0.

    Parameters
    ----------
    value : object
        Cell content; anything accepted by ``str()``.

    Returns
    -------
    float or object
        The parsed number as ``float`` when the text ends with one, otherwise
        ``value`` unchanged (e.g. ``'---'`` or NaN) so the caller can coerce it later.
    """
    found = _TRAILING_NUMBER.search(str(value))
    if found is None:
        # No trailing number: hand the cell back untouched.
        return value
    return float(found.group(1))
    

# Run extract_number over every cell of the three measurement columns.
# Cells that do not end in a number come back unchanged.
measurement_cols = ["Influent", "Effluent", "%Removal"]
new_df = df[measurement_cols].map(extract_number)

new_df

Unnamed: 0,Influent,Effluent,%Removal
1,694.4,---,84.0
2,1297.5,---,84.0
3,221.9,---,78.0
4,32.8,---,97.0
5,33.3,---,94.0
...,...,...,...
1737,7772.0,---,---
1738,70.7,---,---
1739,---,---,---
1740,415.0,---,---


In [62]:
# Write the extracted values back over the original measurement columns.
df[["Influent", "Effluent", "%Removal"]] = new_df

In [None]:
import numpy as np

In [66]:
# Assuming df is your DataFrame
df["Influent"] = pd.to_numeric(df["Influent"], errors='coerce').fillna(np.nan)
df["Effluent"] = pd.to_numeric(df["Effluent"], errors='coerce').fillna(np.nan)
df["%Removal"] = pd.to_numeric(df["%Removal"], errors='coerce').fillna(np.nan)

In [69]:
# Verify the conversion: the three measurement columns are now float64, and their
# non-null counts show how many cells held a parsable number.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1707 entries, 1 to 1741
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Industry         1707 non-null   object 
 1   Scale            1707 non-null   object 
 2   Treatment Train  1707 non-null   object 
 3   Parameter        1707 non-null   object 
 4   Influent         1472 non-null   float64
 5   Effluent         1448 non-null   float64
 6   Units            1707 non-null   object 
 7   %Removal         1063 non-null   float64
 8   Article          1707 non-null   object 
dtypes: float64(3), object(6)
memory usage: 133.4+ KB
