In [49]:
import pandas as pd
import numpy as np
import csv

In [50]:
# Load the comma-separated air-quality file, keeping only its first 15 columns.
data = pd.read_csv(
    'AirQualityUCI.csv',
    sep=",",
    usecols=range(15),
)

In [51]:
# The dataset encodes a missing reading as the sentinel value -200.
# Count, per column, how many cells carry that sentinel.
missing_values_count = data.eq(-200).sum()
missing_values_count

Date                0
Time                0
CO(GT)           1683
PT08.S1(CO)       366
NMHC(GT)         8443
C6H6(GT)          366
PT08.S2(NMHC)     366
NOx(GT)          1639
PT08.S3(NOx)      366
NO2(GT)          1642
PT08.S4(NO2)      366
PT08.S5(O3)       366
T                 366
RH                366
AH                366
dtype: int64

In [52]:
# Only float64/int64 columns can hold the -200 sentinel; the remaining
# columns are left untouched.
numeric_columns = data.select_dtypes(include=['float64', 'int64']).columns

# Swap the sentinel for NaN so pandas' missing-data tooling recognises it.
# NOTE: this overwrites the numeric columns of `data` itself, so a sentinel
# count taken after this cell will report zero.
data[numeric_columns] = data[numeric_columns].replace(to_replace=-200, value=np.nan)

# Per-row tally of missing cells (summing across the columns of each row).
missing_values_per_row = data.isna().sum(axis='columns')
missing_values_per_row

0       0
1       0
2       0
3       0
4       0
       ..
9352    1
9353    1
9354    1
9355    1
9356    1
Length: 9357, dtype: int64

In [53]:
# Everything after the first two columns is considered when deciding
# whether a row carries any usable value.
columns_to_check = data.columns[2:]

# Keep a row unless *every* one of those columns is NaN.
data_cleaned = data.dropna(subset=columns_to_check, how='all')

# Compare shapes before and after to see how many rows were removed.
original_shape, cleaned_shape = data.shape, data_cleaned.shape

(original_shape, cleaned_shape)

((9357, 15), (9326, 15))

In [56]:
# Fill the remaining gaps in the numeric columns by linear interpolation
# along the row order.
#
# `data_cleaned` was produced by `dropna`, and pandas flags such a result as
# a possible copy of `data`; assigning columns to it directly raises
# SettingWithCopyWarning. Rebinding the name to an explicit, independent copy
# first makes the assignment unambiguous and leaves `data` untouched.
data_cleaned = data_cleaned.copy()
data_cleaned[numeric_columns] = data_cleaned[numeric_columns].interpolate(method='linear')

# Check for any remaining missing values. With the default settings linear
# interpolation only fills forward, so NaNs at the very start of a column
# would still show up in this count.
remaining_missing_values = data_cleaned.isna().sum()
remaining_missing_values

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_cleaned[numeric_columns] = data_cleaned[numeric_columns].interpolate(method='linear')


Date             0
Time             0
CO(GT)           0
PT08.S1(CO)      0
NMHC(GT)         0
C6H6(GT)         0
PT08.S2(NMHC)    0
NOx(GT)          0
PT08.S3(NOx)     0
NO2(GT)          0
PT08.S4(NO2)     0
PT08.S5(O3)      0
T                0
RH               0
AH               0
dtype: int64

In [57]:
# Persist the cleaned frame to disk. `to_csv` writes the row index as the
# first (unnamed) column by default, so readers of this file should expect it.
data_cleaned.to_csv(path_or_buf='Processed_AirQualityUCI.csv')