In [None]:
import pandas as pd

# Read each raw CSV and parse its DATE_TIME strings into datetime64 values in one
# chained step. The two files use different timestamp layouts, so each one gets
# its own explicit format string.
weather_data = pd.read_csv("/content/Plant_1_Weather_Sensor_Data.csv").assign(
    DATE_TIME=lambda frame: pd.to_datetime(frame["DATE_TIME"], format="%m/%d/%Y %H:%M")
)
generation_data = pd.read_csv("/content/Plant_1_Generation_Data.csv").assign(
    DATE_TIME=lambda frame: pd.to_datetime(frame["DATE_TIME"], format="%d-%m-%Y %H:%M")
)

# Preview the first five parsed timestamps of each frame as a sanity check.
print(weather_data["DATE_TIME"].head(5))
print(generation_data["DATE_TIME"].head(5))


0   2020-05-15 00:00:00
1   2020-05-15 00:15:00
2   2020-05-15 00:30:00
3   2020-05-15 00:45:00
4   2020-05-15 01:00:00
Name: DATE_TIME, dtype: datetime64[ns]
0   2020-05-15
1   2020-05-15
2   2020-05-15
3   2020-05-15
4   2020-05-15
Name: DATE_TIME, dtype: datetime64[ns]


⬆️ Data Loading – Read and combined generation & weather datasets.

In [None]:
# Inner-join generation rows (left) with weather rows (right) on the shared
# timestamp and plant id; only key pairs present in both frames survive.
# Other columns sharing a name receive pandas' default _x / _y suffixes.
join_keys = ["DATE_TIME", "PLANT_ID"]
merged_data = generation_data.merge(weather_data, how="inner", on=join_keys)

# Preview the joined frame, then persist it without the index column.
print(merged_data.head())
merged_data.to_csv("Merged_Plant_Data.csv", index=False)


   DATE_TIME  PLANT_ID     SOURCE_KEY_x  DC_POWER  AC_POWER  DAILY_YIELD  \
0 2020-05-15   4135001  1BY6WEcLGh8j5v7       0.0       0.0          0.0   
1 2020-05-15   4135001  1IF53ai7Xc0U56Y       0.0       0.0          0.0   
2 2020-05-15   4135001  3PZuoBAID5Wc2HD       0.0       0.0          0.0   
3 2020-05-15   4135001  7JYdWkrLSPkdwr4       0.0       0.0          0.0   
4 2020-05-15   4135001  McdE0feGgRqW7Ca       0.0       0.0          0.0   

   TOTAL_YIELD     SOURCE_KEY_y  AMBIENT_TEMPERATURE  MODULE_TEMPERATURE  \
0    6259559.0  HmiyD2TTLFNqkNe            25.184316           22.857507   
1    6183645.0  HmiyD2TTLFNqkNe            25.184316           22.857507   
2    6987759.0  HmiyD2TTLFNqkNe            25.184316           22.857507   
3    7602960.0  HmiyD2TTLFNqkNe            25.184316           22.857507   
4    7158964.0  HmiyD2TTLFNqkNe            25.184316           22.857507   

   IRRADIATION  
0          0.0  
1          0.0  
2          0.0  
3          0.0  
4

In [None]:
# Number of missing (NaN/NaT) entries in each column of the merged frame.
print(merged_data.isna().sum())


DATE_TIME              0
PLANT_ID               0
SOURCE_KEY_x           0
DC_POWER               0
AC_POWER               0
DAILY_YIELD            0
TOTAL_YIELD            0
SOURCE_KEY_y           0
AMBIENT_TEMPERATURE    0
MODULE_TEMPERATURE     0
IRRADIATION            0
dtype: int64


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the two power columns.
print(merged_data.loc[:, ["DC_POWER", "AC_POWER"]].describe())


           DC_POWER      AC_POWER
count  68774.000000  68774.000000
mean    3147.177450    307.778375
std     4036.441826    394.394865
min        0.000000      0.000000
25%        0.000000      0.000000
50%      428.571429     41.450000
75%     6365.468750    623.561161
max    14471.125000   1410.950000


In [None]:
# Count exact-zero readings in each column of interest.
# Every metric keeps its own variable so that each label printed below reports
# the count of the column it names (reusing one variable for several columns
# would make every line show only the last count computed).
zero_dc = (merged_data["DC_POWER"] == 0).sum()
zero_ac = (merged_data["AC_POWER"] == 0).sum()
zero_total_yield = (merged_data["TOTAL_YIELD"] == 0).sum()
zero_irradiation = (merged_data["IRRADIATION"] == 0).sum()

print(f"Zero DC Power: {zero_dc} instances")
print(f"Zero AC Power: {zero_ac} instances")
print(f"Zero Total yeild: {zero_total_yield} instances")
print(f"Zero Irradiation: {zero_irradiation} instances")


Zero DC Power: 31951 instances
Zero AC Power: 30398 instances
Zero Total yeild: 30398 instances
Zero Irradiation: 30398 instances


⬇️ Data Cleansing – Standardized DATE_TIME, removed redundant columns, and analyzed zero values.

In [None]:
import pandas as pd

# Extract hour from DATE_TIME
merged_data["Hour"] = merged_data["DATE_TIME"].dt.hour

columns_to_check = ["DC_POWER", "AC_POWER", "TOTAL_YIELD", "IRRADIATION"]

# Boolean mask: True wherever a checked column holds an exact zero.
is_zero = merged_data[columns_to_check].eq(0)

# Rows that contain a zero in at least one of the checked columns.
zero_summary = merged_data[is_zero.any(axis=1)]

# Count zero values per hour, separately for each column.
# Summing the boolean mask counts the True cells; calling .count() on the
# filtered rows would instead count non-null rows, giving every column the same
# number regardless of which column actually held the zero.
zero_count_by_hour = is_zero.groupby(merged_data["Hour"]).sum()

print(zero_count_by_hour)


      DC_POWER  AC_POWER  TOTAL_YIELD  IRRADIATION
Hour                                              
0         2724      2724         2724         2724
1         2726      2726         2726         2726
2         2810      2810         2810         2810
3         2812      2812         2812         2812
4         2815      2815         2815         2815
5         2707      2707         2707         2707
6          138       138          138          138
9            2         2            2            2
10           2         2            2            2
11           6         6            6            6
12          19        19           19           19
13          27        27           27           27
14           6         6            6            6
15           1         1            1            1
18         903       903          903          903
19        2669      2669         2669         2669
20        2924      2924         2924         2924
21        2962      2962       

We added a new `Hour` column by extracting the hour from `DATE_TIME`.

The table above shows:
*  The hours of the day (0-23) as rows.
*  Counts of how many times each column had zero values at that hour.

Based on these results, most zero values appear at night (0:00 - 6:00, 18:00 - 23:00) and during some early-morning or late-afternoon hours when sunlight is weaker. This is expected for a solar power plant because:

*   Nighttime (0:00 - 6:00, 19:00 - 23:00) → No sunlight → No power generation, no irradiation ✅
*   Early morning (6:00 - 9:00) & late afternoon (15:00 - 18:00) → Low sunlight → Some zeros, but should be minimal
*   Daytime (10:00 - 14:00) → Few zeros → Good! Power is being generated ✅

What This Means for Data Cleaning


1.   Zeros at night (expected) → No need to remove them.
2.   Zeros during daylight (rare, but they do occur)
 *   Might be temporary weather conditions (e.g., clouds, rain).
 *   Might indicate sensor faults (unlikely, but possible).
 *   Since very few zeros appear between 9:00 - 15:00, it’s not a major issue.









In [None]:
# Remove the two merge-suffixed source-key columns and the plant id,
# rebinding merged_data to the slimmer frame.
merged_data = merged_data.drop(["SOURCE_KEY_x", "SOURCE_KEY_y", "PLANT_ID"], axis="columns")

# Preview the result, then write it to disk without the index column.
print(merged_data.head())
merged_data.to_csv("Cleaned_Plant_Data.csv", index=False)


   DATE_TIME  DC_POWER  AC_POWER  DAILY_YIELD  TOTAL_YIELD  \
0 2020-05-15       0.0       0.0          0.0    6259559.0   
1 2020-05-15       0.0       0.0          0.0    6183645.0   
2 2020-05-15       0.0       0.0          0.0    6987759.0   
3 2020-05-15       0.0       0.0          0.0    7602960.0   
4 2020-05-15       0.0       0.0          0.0    7158964.0   

   AMBIENT_TEMPERATURE  MODULE_TEMPERATURE  IRRADIATION  Hour  
0            25.184316           22.857507          0.0     0  
1            25.184316           22.857507          0.0     0  
2            25.184316           22.857507          0.0     0  
3            25.184316           22.857507          0.0     0  
4            25.184316           22.857507          0.0     0  


What the code below does:
* Adds Power_Efficiency = AC_POWER / DC_POWER
* Handles division by zero (fills NaN with 0)

This is an additional column that we derived from the AC_POWER and DC_POWER columns.

In [None]:
import pandas as pd

df = pd.read_csv("Cleaned_Plant_Data.csv")

# Compute Power Efficiency = AC_POWER / DC_POWER (handling division by zero).
# Zero DC readings are masked to NaN *before* dividing, so both 0/0 (which would
# yield NaN) and x/0 (which would yield +/-inf and is NOT touched by fillna)
# end up as NaN and are then replaced by 0.
dc_nonzero = df["DC_POWER"].where(df["DC_POWER"] != 0)
df["Power_Efficiency"] = (df["AC_POWER"] / dc_nonzero).fillna(0)

# Overwrite the cleaned file so it now includes the new column.
df.to_csv("Cleaned_Plant_Data.csv", index=False)

print("Power_Efficiency column added successfully!")


Power_Efficiency column added successfully!
