#### Key Features of the Code
##### 1. Missing Values: Handles missing numerical values by replacing them with the column mean.
##### 2. Datetime Processing: Converts datetime into year, month, day, and hour for analysis.
##### 3. Normalisation: Standardises each hyperspectral band to zero mean and unit variance (StandardScaler) for better model performance.
##### 4. Output: Saves the preprocessed dataset to a CSV file.

In [2]:
import os

import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

In [3]:
import warnings
# Install a global filter that suppresses every warning raised from this point on.
# NOTE(review): a blanket "ignore" also hides deprecation and data-quality
# warnings emitted by the libraries used below — consider narrowing it to the
# specific category that was noisy.
warnings.filterwarnings("ignore")

In [4]:
# Load the raw dataset into `data`.
# The location defaults to the original author's local copy, but can be
# overridden with the SOIL_MOISTURE_CSV environment variable so the notebook
# runs on other machines without editing this cell.
DATA_PATH = os.environ.get(
    "SOIL_MOISTURE_CSV",
    r"E:\Data Analytics\Hyperspectral Soil Moisture\soilmoisture_dataset.csv",
)
data = pd.read_csv(DATA_PATH)

In [5]:
# Show a heading followed by the first rows of the freshly loaded frame.
print('Initial Data Overview:', data.head(), sep='\n')

Initial Data Overview:
   index             datetime  soil_moisture  soil_temperature       454  \
0      0  2017-05-23 14:06:17          33.51              34.8  0.082131   
1      1  2017-05-23 14:08:17          33.49              35.2  0.079510   
2      2  2017-05-23 14:10:17          33.46              35.4  0.080599   
3      3  2017-05-23 14:12:17          33.33              35.0  0.078024   
4      4  2017-05-23 14:14:17          33.32              35.3  0.079973   

        458       462       466       470       474  ...       914       918  \
0  0.055863  0.050047  0.047925  0.047498  0.046464  ...  0.152845  0.152615   
1  0.055326  0.049116  0.047579  0.046745  0.046833  ...  0.152715  0.153102   
2  0.054065  0.049150  0.047537  0.046525  0.046032  ...  0.152362  0.152158   
3  0.054972  0.049052  0.047920  0.046883  0.046775  ...  0.153281  0.153990   
4  0.055335  0.049314  0.047373  0.047028  0.046963  ...  0.152803  0.153085   

        922       926       930       9

In [6]:
# Print a concise structural summary of the loaded frame (index range, column
# count, dtype breakdown and memory usage) before any preprocessing is applied.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 679 entries, 0 to 678
Columns: 129 entries, index to 950
dtypes: float64(127), int64(1), object(1)
memory usage: 684.4+ KB


In [7]:
# Handle missing values
# Fill every numerical column's missing entries with that column's mean.

numerical_columns = data.select_dtypes(include=np.number).columns

# Integer columns cannot contain NaN, so imputation never changes their values.
# Remember their dtypes so they can be restored after the imputer has run.
integer_dtypes = (
    data[numerical_columns].select_dtypes(include=np.integer).dtypes.to_dict()
)

imputer = SimpleImputer(strategy='mean')
data[numerical_columns] = imputer.fit_transform(data[numerical_columns])

# SimpleImputer returns a float array, which would otherwise turn integer
# columns such as `index` into 0.0, 1.0, ... in the saved output.
# Casting back is lossless because those columns held whole numbers only.
if integer_dtypes:
    data = data.astype(integer_dtypes)

In [8]:
# Break the timestamp column into separate calendar/clock features, then discard it.
# The guard makes the cell a no-op when the column is absent (e.g. on a re-run).
if 'datetime' in data.columns:
    data['datetime'] = pd.to_datetime(data['datetime'])
    # Each name is both the new column label and the `.dt` accessor attribute.
    for component in ('year', 'month', 'day', 'hour'):
        data[component] = getattr(data['datetime'].dt, component)
    # The parsed timestamp itself is not kept in the preprocessed frame.
    del data['datetime']

In [9]:
# Normalise hyperspectral data
# Band columns are identified by a purely numeric header (e.g. '454', '950').
# The isinstance check keeps non-string column labels from raising AttributeError.
hyperspectral_columns = [
    col for col in data.columns if isinstance(col, str) and col.isdigit()
]
scaler = StandardScaler()

# StandardScaler rejects a frame with zero columns, so only scale when at least
# one band column was found; otherwise report it and leave `data` untouched.
if hyperspectral_columns:
    data[hyperspectral_columns] = scaler.fit_transform(data[hyperspectral_columns])
else:
    print('No hyperspectral band columns found; skipping normalisation.')

In [10]:
# Show a heading (preceded by a blank line) and the first rows after preprocessing.
print('\nPreprocessed Data Overview:', data.head(), sep='\n')


Preprocessed Data Overview:
   index  soil_moisture  soil_temperature       454       458       462  \
0    0.0          33.51              34.8 -0.820790 -0.933076 -0.911946   
1    1.0          33.49              35.2 -0.913318 -0.962229 -0.968456   
2    2.0          33.46              35.4 -0.874888 -1.030703 -0.966399   
3    3.0          33.33              35.0 -0.965790 -0.981437 -0.972359   
4    4.0          33.32              35.3 -0.896976 -0.961759 -0.956435   

        466       470       474       478  ...       930       934       938  \
0 -0.918558 -0.895983 -0.937692 -0.916785  ... -0.844442 -0.848906 -0.847418   
1 -0.940140 -0.942926 -0.914833 -0.937676  ... -0.878898 -0.883832 -0.864402   
2 -0.942754 -0.956636 -0.964497 -0.940893  ... -0.855017 -0.851385 -0.823587   
3 -0.918834 -0.934312 -0.918411 -0.906604  ... -0.852920 -0.851014 -0.828813   
4 -0.953026 -0.925257 -0.906755 -0.911004  ... -0.843084 -0.825724 -0.800567   

        942       946       950  year  

In [11]:
# Persist the preprocessed frame as CSV in the current working directory.
# index=False omits the pandas row index from the written file.
output_path = 'preprocessed_dataset.csv'
data.to_csv(path_or_buf=output_path, index=False)
print(f"Preprocessed data saved to {output_path}")

Preprocessed data saved to preprocessed_dataset.csv
