In [225]:
%pip install --quiet pandas numpy matplotlib seaborn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

Note: you may need to restart the kernel to use updated packages.


In [226]:
# read the dataset
df = pd.read_csv('climate_action_data.csv')

# inspect the dataset
print(df.head())
print(df.info())
print(df.describe())
print(df.shape)
print(df.dtypes)



  Sensor_ID        Date Soil_Moisture(%) Soil_pH Temperature(C) Humidity(%)  \
0  SEN-1000  2025-01-01            36.22    6.98           21.3        50.4   
1  SEN-1001  2025-01-02            76.55    5.03           23.5        34.3   
2  SEN-1002  2025-01-03            61.24    6.52           21.9        54.6   
3  SEN-1003  2025-01-04            51.91    6.62           24.0        48.7   
4  SEN-1004  2025-01-05            20.92    5.98           19.2        70.6   

  Crop_Type Fertilizer_Recommended(kg/ha) Irrigation_Recommended(mm)  \
0  Tomatoes                          67.6                       26.6   
1     Wheat                         130.4                        8.3   
2  Tomatoes                          36.0                       29.4   
3     Maize                          85.8                       19.6   
4     Wheat                          75.6                       29.9   

  Drone_Image_ID  
0       IMG-2000  
1       IMG-2001  
2       IMG-2002  
3       IMG-2003

In [227]:
# identify empty cells
df.isna().sum()

Sensor_ID                        0
Date                             4
Soil_Moisture(%)                 0
Soil_pH                          0
Temperature(C)                   0
Humidity(%)                      0
Crop_Type                        4
Fertilizer_Recommended(kg/ha)    0
Irrigation_Recommended(mm)       0
Drone_Image_ID                   0
dtype: int64

In [228]:
# numeric_cols = list(df.columns)
numeric_cols = ['Soil_pH','Temperature(C)','Humidity(%)','Fertilizer_Recommended(kg/ha)','Irrigation_Recommended(mm)','Soil_Moisture(%)']
for col in numeric_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')


In [229]:
# identify empty cells
df.isna().sum()
df.head()



Unnamed: 0,Sensor_ID,Date,Soil_Moisture(%),Soil_pH,Temperature(C),Humidity(%),Crop_Type,Fertilizer_Recommended(kg/ha),Irrigation_Recommended(mm),Drone_Image_ID
0,SEN-1000,2025-01-01,36.22,6.98,21.3,50.4,Tomatoes,67.6,26.6,IMG-2000
1,SEN-1001,2025-01-02,76.55,5.03,23.5,34.3,Wheat,130.4,8.3,IMG-2001
2,SEN-1002,2025-01-03,61.24,6.52,21.9,54.6,Tomatoes,36.0,29.4,IMG-2002
3,SEN-1003,2025-01-04,51.91,6.62,24.0,48.7,Maize,85.8,19.6,IMG-2003
4,SEN-1004,2025-01-05,20.92,5.98,19.2,70.6,Wheat,75.6,29.9,IMG-2004


In [230]:
for col in numeric_cols:
    df[col].fillna(df[col].mode(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode(), inplace=True)


In [231]:
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')

print(df['Date'].isna().sum())
# Print the most frequent date in the 'Date' column
print("Most frequent date:", df['Date'].mode()[0])
# frequency of most frequent date
print("Frequency of most frequent date:", df['Date'].value_counts().iloc[0])

4
Most frequent date: 2025-01-04 00:00:00
Frequency of most frequent date: 2


In [232]:
# show duplicate count
print("Duplicate rows count:", df.duplicated().sum())


Duplicate rows count: 111


In [233]:
# remove duplicates

df.drop_duplicates(inplace=True)
# show duplicate count after removing duplicates
print("Duplicate rows count after removing duplicates:", df.duplicated().sum())

Duplicate rows count after removing duplicates: 0


In [234]:
print(df.isna().sum())
print(df.shape)

Sensor_ID                        0
Date                             4
Soil_Moisture(%)                 5
Soil_pH                          5
Temperature(C)                   5
Humidity(%)                      5
Crop_Type                        4
Fertilizer_Recommended(kg/ha)    5
Irrigation_Recommended(mm)       5
Drone_Image_ID                   0
dtype: int64
(802, 10)


In [235]:
# since the rest of the data has no duplicate dates. I will drop the null dates
df.dropna(subset=['Date'], inplace=True)
print(df.shape)
# check for duplicates again
print("Duplicate rows count after removing null dates:", df['Date'].isna().sum())


(798, 10)
Duplicate rows count after removing null dates: 0


In [237]:
print("frequency of crop types",df['Crop_Type'].value_counts())
print("Missing crop types:", df['Crop_Type'].isna().sum())




frequency of crop types Crop_Type
Wheat       172
Tomatoes    166
Maize       155
Beans       151
Lettuce     150
Name: count, dtype: int64
Missing crop types: 4


In [None]:
df.dropna(subset=['Crop_Type'], inplace=True)
print(df['Crop_Type'].value_counts())
# check for empty cells again
print(df.isna().sum())   
print(df.shape)

Crop_Type
Wheat       172
Tomatoes    166
Maize       155
Beans       151
Lettuce     150
Name: count, dtype: int64
Sensor_ID                        0
Date                             0
Soil_Moisture(%)                 5
Soil_pH                          5
Temperature(C)                   5
Humidity(%)                      5
Crop_Type                        0
Fertilizer_Recommended(kg/ha)    5
Irrigation_Recommended(mm)       5
Drone_Image_ID                   0
dtype: int64
(794, 10)
