In [36]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [37]:
# DATASET LOADING
# Absolute Windows path of the raw "Urban" CSV that the rest of the notebook
# works from.
file_path = "C:\\Users\\arock\\Downloads\\DOP\\DOP\\Work\\Cleaneddataset\\Raw\\Urban.csv"

# Read the entire file into a DataFrame using pandas' default parsing options.
df = pd.read_csv(filepath_or_buffer=file_path)

In [38]:
# DESCRIPTION
# Print a plain-text overview of `df`: its size, its column names, and a
# breakdown of the object-dtype and numeric columns.
print("Dataset Information:", "--------------------", sep="\n")

# Row and column counts
num_samples, num_features = len(df.index), len(df.columns)
print(f"Number of samples: {num_samples}")
print(f"Number of features: {num_features}")

# All column names on one comma-separated line
print("\nFeatures:", ", ".join(df.columns), sep="\n")

# Template hook for a target column: 'target_column' is a placeholder name, so
# this branch only fires if the dataset really has a column with that name.
if 'target_column' in df.columns:
    print("\nTarget Variable:", "target_column", sep="\n")

# Object-dtype columns are reported as "categorical", each with its number of
# distinct values.
categorical_features = df.select_dtypes(include=['object']).columns
if len(categorical_features) > 0:
    print("\nCategorical Variables:", ", ".join(categorical_features), sep="\n")
    print("\nCategorical Variable Descriptions:")
    for column_name, distinct_count in df[categorical_features].nunique().items():
        print(f"{column_name}: {distinct_count} unique values")

# Numeric columns are listed and then summarised with describe()
numerical_features = df.select_dtypes(include=['number']).columns
if len(numerical_features) > 0:
    print("\nNumerical Variables:", ", ".join(numerical_features), sep="\n")
    print("\nNumerical Variable Descriptions:")
    print(df.loc[:, numerical_features].describe())

Dataset Information:
--------------------
Number of samples: 116112
Number of features: 23

Features:
From Date, To Date, PM2.5 (ug/m3), PM10 (ug/m3), NO (ug/m3), NO2 (ug/m3), NOx (ppb), NH3 (ug/m3), SO2 (ug/m3), CO (mg/m3), Ozone (ug/m3), Benzene (ug/m3), Toluene (ug/m3), Eth-Benzene (ug/m3), MP-Xylene (ug/m3), O Xylene (ug/m3), Temp (degree C), RH (%), WS (m/s), WD (deg), SR (W/mt2), BP (mmHg), VWS (m/s)

Categorical Variables:
From Date, To Date

Categorical Variable Descriptions:
From Date: 116112 unique values
To Date: 116112 unique values

Numerical Variables:
PM2.5 (ug/m3), PM10 (ug/m3), NO (ug/m3), NO2 (ug/m3), NOx (ppb), NH3 (ug/m3), SO2 (ug/m3), CO (mg/m3), Ozone (ug/m3), Benzene (ug/m3), Toluene (ug/m3), Eth-Benzene (ug/m3), MP-Xylene (ug/m3), O Xylene (ug/m3), Temp (degree C), RH (%), WS (m/s), WD (deg), SR (W/mt2), BP (mmHg), VWS (m/s)

Numerical Variable Descriptions:
       PM2.5 (ug/m3)  PM10 (ug/m3)    NO (ug/m3)   NO2 (ug/m3)     NOx (ppb)  \
count   60808.000000  165

In [39]:
# DAILY AVERAGE
# Collapse the raw readings to one row per calendar day by averaging every
# numeric column within that day.

# Parse the timestamps and move them into the index, which `resample` requires.
# The guard keeps the cell re-runnable: after the first run 'From Date' is
# already the index, so looking it up as a column again would raise KeyError.
if 'From Date' in df.columns:
    df['From Date'] = pd.to_datetime(df['From Date'])
    df = df.set_index('From Date')

# numeric_only=True explicitly leaves out the object-dtype 'To Date' column.
# Without it, older pandas emits a FutureWarning on this line and
# pandas >= 2.0 raises TypeError because strings cannot be averaged.
# reset_index() turns 'From Date' back into an ordinary column.
daily_average_df = df.resample('D').mean(numeric_only=True).reset_index()

# Displaying the first few rows of the daily average data
daily_average_df.head()

  daily_average_df = df.resample('D').mean()


Unnamed: 0,From Date,PM2.5 (ug/m3),PM10 (ug/m3),NO (ug/m3),NO2 (ug/m3),NOx (ppb),NH3 (ug/m3),SO2 (ug/m3),CO (mg/m3),Ozone (ug/m3),...,Eth-Benzene (ug/m3),MP-Xylene (ug/m3),O Xylene (ug/m3),Temp (degree C),RH (%),WS (m/s),WD (deg),SR (W/mt2),BP (mmHg),VWS (m/s)
0,2010-01-01,,,24.654167,59.051667,83.707083,,5.865833,1.345,13.74625,...,,,,27.39,82.993333,1.08125,186.357083,177.01125,1007.354583,0.569167
1,2010-01-02,,,14.33,42.58125,56.910833,,4.857083,1.356667,15.637083,...,,,,27.422917,81.672083,1.221667,171.53875,148.947917,1008.689583,0.57625
2,2010-01-03,,,19.585,41.037917,60.590417,,5.42875,1.724583,23.526667,...,,,,27.277917,74.052083,1.15875,116.677917,130.445417,1008.852917,0.5725
3,2010-01-04,,,39.664583,85.539583,125.204167,,10.783333,2.34375,15.16125,...,,,,25.834583,81.450417,0.922083,198.394167,203.066667,1007.555833,0.560417
4,2010-01-05,,,38.105833,94.573333,132.67875,,5.738333,0.98125,10.564167,...,,,,25.382917,83.610833,0.932917,196.884167,163.877083,1005.595,0.556667


In [40]:
# Persist the daily averages as CSV; index=False leaves out the positional
# row index so only the data columns are written.
output_file_path = "C:\\Users\\arock\\Downloads\\DOP\\DOP\\Work\\Cleaneddataset\\DA\\urbanNEW.csv"
daily_average_df.to_csv(path_or_buf=output_file_path, index=False)

# Echo the destination path as the cell's output
output_file_path


'C:\\Users\\arock\\Downloads\\DOP\\DOP\\Work\\Cleaneddataset\\DA\\urbanNEW.csv'

In [45]:
from sklearn.impute import KNNImputer

# Load the daily-average dataset from disk into a DataFrame
file_path = "C:\\Users\\arock\\Downloads\\DOP\\DOP\\Work\\Cleaneddataset\\DA\\urbanNEW.csv"
data = pd.read_csv(file_path)

# Dropping columns with a high number of missing values
columns_to_drop = [
    'Benzene (ug/m3)', 'Toluene (ug/m3)', 'Eth-Benzene (ug/m3)',
    'MP-Xylene (ug/m3)', 'O Xylene (ug/m3)', 'VWS (m/s)'
]
data_reduced = data.drop(columns=columns_to_drop)

# Initialize the KNN Imputer: each missing value is filled from the 12 nearest rows.
# NOTE(review): the columns are imputed on their raw scales, so wide-ranging
# ones (e.g. BP, WD) presumably dominate the neighbour distance — consider
# scaling before imputing; confirm against the intended methodology.
imputer = KNNImputer(n_neighbors=12)

# Impute every column except the date. The date column is excluded by name
# rather than by position, so a reordered or extra leading column cannot slip
# the date into the imputer or leave a measurement out. Index.drop raises
# KeyError if 'From Date' is missing, which surfaces a wrong input file early.
date_column = 'From Date'
impute_columns = data_reduced.columns.drop(date_column)
data_reduced[impute_columns] = imputer.fit_transform(data_reduced[impute_columns])

# Check if the imputation was successful: per-column null counts plus a preview
data_reduced.isnull().sum(), data_reduced.head()

(From Date          0
 PM2.5 (ug/m3)      0
 PM10 (ug/m3)       0
 NO (ug/m3)         0
 NO2 (ug/m3)        0
 NOx (ppb)          0
 NH3 (ug/m3)        0
 SO2 (ug/m3)        0
 CO (mg/m3)         0
 Ozone (ug/m3)      0
 Temp (degree C)    0
 RH (%)             0
 WS (m/s)           0
 WD (deg)           0
 SR (W/mt2)         0
 BP (mmHg)          0
 dtype: int64,
     From Date  PM2.5 (ug/m3)  PM10 (ug/m3)  NO (ug/m3)  NO2 (ug/m3)  \
 0  2010-01-01      51.370402     68.469360   24.654167    59.051667   
 1  2010-01-02      49.381167     73.121487   14.330000    42.581250   
 2  2010-01-03      77.015053    122.804808   19.585000    41.037917   
 3  2010-01-04      40.527361     72.189591   39.664583    85.539583   
 4  2010-01-05      71.506838    101.431053   38.105833    94.573333   
 
     NOx (ppb)  NH3 (ug/m3)  SO2 (ug/m3)  CO (mg/m3)  Ozone (ug/m3)  \
 0   83.707083    10.940518     5.865833    1.345000      13.746250   
 1   56.910833    10.697693     4.857083    1.356667     

In [46]:
# Destination of the KNN-imputed dataset
output_file_path = "C:\\Users\\arock\\Downloads\\DOP\\DOP\\Work\\Cleaneddataset\\DA_KNN\\UrbanDA_KNNnew.csv"

# Write the imputed frame as CSV; index=False leaves out the positional row index
data_reduced.to_csv(path_or_buf=output_file_path, index=False)

# Echo the destination path as the cell's output
output_file_path

'C:\\Users\\arock\\Downloads\\DOP\\DOP\\Work\\Cleaneddataset\\DA_KNN\\UrbanDA_KNNnew.csv'