In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the historical weather workbook into a DataFrame
file_path = "/content/drive/MyDrive/Kaggel.xlsx"  # Update the path if needed
df = pd.read_excel(file_path)

# Show the available columns so the feature selection below can be checked
print("Columns in dataset:", df.columns)

# When a 'date' column exists, parse it and order the rows chronologically
if 'date' in df.columns:
    df = df.assign(date=pd.to_datetime(df['date'])).sort_values(by="date")

# Keep only the modelling columns and discard rows with missing values
features = ['temperature', 'rainfall', 'humidity']
df = df.loc[:, features].dropna()

# Inputs (X) and targets (y) are the same three columns: the model is
# trained to reproduce a reading so later cells can compare against it
X = df[features]
y = df[features]

# Chronological split: first 80% of the rows for training, the rest for testing
train_size = int(len(df) * 0.8)
X_train, y_train = X.iloc[:train_size], y.iloc[:train_size]
X_test, y_test = X.iloc[train_size:], y.iloc[train_size:]

# Report how many rows ended up in each partition
print(f"Train size: {len(X_train)}, Test size: {len(X_test)}")


Columns in dataset: Index(['temperature', 'rainfall', 'humidity'], dtype='object')
Train size: 1760, Test size: 440


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
import joblib  # To save the trained model


# Chronological 80/20 partition of the feature and target frames
train_size = int(len(df) * 0.8)
X_train, y_train = X.iloc[:train_size], y.iloc[:train_size]
X_test, y_test = X.iloc[train_size:], y.iloc[train_size:]

# Build the forest (fixed seed for repeatable results) and fit it
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Score the held-out partition with the mean absolute error
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error: {mae:.2f}")

# Persist the fitted model so later cells can reload it
joblib.dump(model, "weather_model.pkl")
print("Model saved as weather_model.pkl")

Mean Absolute Error: 2.38
Model saved as weather_model.pkl


In [None]:
import pandas as pd
import joblib
import numpy as np

# Load the trained model
# NOTE(review): joblib.load unpickles the file, and unpickling can execute
# arbitrary code -- only load weather_model.pkl from a trusted source.
model = joblib.load("weather_model.pkl")

# Step 1: Read the real-time data from the Excel workbook (the path is passed to pd.read_excel)
def read_api_data(csv_file):
    """Read the real-time readings workbook and print a preview of it.

    Despite the parameter name, the path is handed to ``pd.read_excel``,
    so it must point at an Excel workbook.

    Args:
        csv_file: Path of the workbook to read.

    Returns:
        The DataFrame produced by ``pd.read_excel``.
    """
    api_frame = pd.read_excel(csv_file)

    # Echo the leading rows so the loaded columns can be checked by eye
    print(api_frame.head())

    return api_frame

# Step 2: Predict expected values using the model
def predict_weather(temperature, rainfall, humidity):
    """Return the model's expected reading for one observation.

    Args:
        temperature: Observed temperature.
        rainfall: Observed rainfall.
        humidity: Observed humidity.

    Returns:
        Tuple ``(predicted_temp, predicted_rain, predicted_hum)`` taken from
        the first (only) row of ``model.predict``.
    """
    # The training cell fits the model on a DataFrame with these column
    # names. Passing a labelled single-row frame (rather than a bare array)
    # lets scikit-learn validate the feature names/order and avoids its
    # "X does not have valid feature names" warning.
    input_data = pd.DataFrame(
        [[temperature, rainfall, humidity]],
        columns=['temperature', 'rainfall', 'humidity'],
    )

    # One input row in, one output row out: unpack its three targets
    predicted_temp, predicted_rain, predicted_hum = model.predict(input_data)[0]

    return predicted_temp, predicted_rain, predicted_hum

# Step 3: Compare real-time data with predicted values and flag anomalies
def compare_and_flag_anomalies(df, threshold=5):
    """Flag rows whose readings differ from the model's prediction.

    Each row's temperature, rainfall and humidity are passed to
    ``predict_weather``; the row is reported when the absolute difference
    between any observed value and its prediction exceeds ``threshold``.

    Args:
        df: DataFrame with ``temperature``, ``rainfall`` and ``humidity``
            columns. Surrounding whitespace in the headers is ignored.
        threshold: Maximum tolerated absolute difference, in the same units
            as the columns (this is not a percentage). Defaults to 5.

    Returns:
        List of dicts, one per flagged row, holding the row index, the real
        and predicted values, and the per-feature absolute differences.
    """
    # Headers may carry stray whitespace (e.g. 'rainfall '); normalise them
    # on a renamed copy so the lookups below work either way and the
    # caller's frame is left untouched.
    df = df.rename(columns=lambda name: name.strip() if isinstance(name, str) else name)

    anomalies = []

    for index, row in df.iterrows():
        real_temp = row['temperature']
        real_rain = row['rainfall']
        real_hum = row['humidity']

        # Expected values according to the model
        pred_temp, pred_rain, pred_hum = predict_weather(real_temp, real_rain, real_hum)

        # Absolute deviation per feature; each name matches the feature it
        # measures so the report keys below are labelled correctly
        temp_diff = abs(real_temp - pred_temp)
        rain_diff = abs(real_rain - pred_rain)
        hum_diff = abs(real_hum - pred_hum)

        if temp_diff > threshold or rain_diff > threshold or hum_diff > threshold:
            anomalies.append({
                'index': index,
                'real_temp': real_temp,
                'pred_temp': pred_temp,
                'real_rain': real_rain,
                'pred_rain': pred_rain,
                'real_hum': real_hum,
                'pred_hum': pred_hum,
                'temp_diff': temp_diff,
                'rain_diff': rain_diff,
                'hum_diff': hum_diff
            })

    return anomalies

# Location of the Excel workbook holding the real-time API readings
csv_file = "/content/drive/MyDrive/API.xlsx"  # Update with the correct path to your workbook
real_data = read_api_data(csv_file)

# Run every reading through the model and collect the rows that disagree
anomalies = compare_and_flag_anomalies(real_data)

# Report the outcome
if not anomalies:
    print("No anomalies detected.")
else:
    print("Anomalies detected:")
    for anomaly in anomalies:
        print(anomaly)


   temperature  rainfall   humidity
0       27.308      0.187    78.416
1       26.020      0.217    82.014
2       26.849      0.246    82.014
3       27.517      0.247    77.404
4       10.504      0.121    78.301




Anomalies detected:
{'index': 0, 'real_temp': 27.308, 'pred_temp': 27.937106115099983, 'real_rain': 0.187, 'pred_rain': 23.761739147600007, 'real_hum': 78.416, 'pred_hum': 89.35285765710005, 'temp_diff': 0.629106115099983, 'rain_diff': 10.936857657100049, 'hum_diff': 23.574739147600006}
{'index': 1, 'real_temp': 26.02, 'pred_temp': 27.929859250099984, 'real_rain': 0.217, 'pred_rain': 23.711949767099995, 'real_hum': 82.014, 'pred_hum': 89.51419158810006, 'temp_diff': 1.9098592500999843, 'rain_diff': 7.500191588100066, 'hum_diff': 23.494949767099996}
{'index': 2, 'real_temp': 26.849, 'pred_temp': 27.929859250099984, 'real_rain': 0.246, 'pred_rain': 23.711949767099995, 'real_hum': 82.014, 'pred_hum': 89.51419158810006, 'temp_diff': 1.0808592500999836, 'rain_diff': 7.500191588100066, 'hum_diff': 23.465949767099996}
{'index': 3, 'real_temp': 27.517, 'pred_temp': 27.952697838699983, 'real_rain': 0.247, 'pred_rain': 23.733705879600002, 'real_hum': 77.404, 'pred_hum': 89.34483382970005, 'temp_

In [None]:
import pandas as pd
import joblib
import numpy as np

# Load the trained model
# NOTE(review): joblib.load unpickles the file, and unpickling can execute
# arbitrary code -- only load weather_model.pkl from a trusted source.
model = joblib.load("weather_model.pkl")

# Step 1: Read the real-time data from the Excel file
def read_api_data(excel_file):
    """Read the real-time readings workbook, tidy its headers and preview it.

    Args:
        excel_file: Path of the Excel workbook to read.

    Returns:
        The loaded DataFrame with surrounding whitespace stripped from every
        column name.
    """
    api_frame = pd.read_excel(excel_file)

    # Drop leading/trailing whitespace from the header labels
    api_frame.columns = api_frame.columns.str.strip()

    # Echo the leading rows so the loaded columns can be checked by eye
    print(api_frame.head())

    return api_frame

# Step 2: Predict expected values using the model
def predict_weather(temperature, rainfall, humidity):
    """Return the model's expected reading for one observation.

    Args:
        temperature: Observed temperature.
        rainfall: Observed rainfall.
        humidity: Observed humidity.

    Returns:
        Tuple ``(predicted_temp, predicted_rain, predicted_hum)`` taken from
        the first (only) row of ``model.predict``.
    """
    # The training cell fits the model on a DataFrame with these column
    # names. Passing a labelled single-row frame (rather than a bare array)
    # lets scikit-learn validate the feature names/order and avoids its
    # "X does not have valid feature names" warning.
    input_data = pd.DataFrame(
        [[temperature, rainfall, humidity]],
        columns=['temperature', 'rainfall', 'humidity'],
    )

    # One input row in, one output row out: unpack its three targets
    predicted_temp, predicted_rain, predicted_hum = model.predict(input_data)[0]

    return predicted_temp, predicted_rain, predicted_hum

# Step 3: Compare real-time data with predicted values and flag anomalies
def compare_and_flag_anomalies(df, threshold=5):
    """Flag rows whose readings deviate from the model by more than a percentage.

    Each row's temperature, rainfall and humidity are passed to
    ``predict_weather``; the deviation of each prediction is expressed as a
    percentage of the magnitude of the observed value, and the row is
    reported when any of the three percentages exceeds ``threshold``.

    Args:
        df: DataFrame with ``temperature``, ``rainfall`` and ``humidity``
            columns.
        threshold: Maximum tolerated deviation in percent. Defaults to 5.

    Returns:
        List of dicts, one per flagged row, holding the row index and, per
        feature, the real value, the predicted value and the percentage
        difference rounded to two decimals.
    """
    # Floor for the denominator so a zero reading does not divide by zero.
    # Caveat: a reading at (or extremely near) zero still yields a very
    # large percentage for any non-zero prediction.
    min_magnitude = 1e-6

    anomalies = []

    for index, row in df.iterrows():
        real_temp = row['temperature']
        real_rain = row['rainfall']
        real_hum = row['humidity']

        # Expected values according to the model
        pred_temp, pred_rain, pred_hum = predict_weather(real_temp, real_rain, real_hum)

        # Divide by the magnitude of the reading: a signed denominator would
        # turn the percentage negative for sub-zero readings, so such rows
        # could never exceed the threshold.
        temp_diff = abs(real_temp - pred_temp) / max(abs(real_temp), min_magnitude) * 100
        rain_diff = abs(real_rain - pred_rain) / max(abs(real_rain), min_magnitude) * 100
        hum_diff = abs(real_hum - pred_hum) / max(abs(real_hum), min_magnitude) * 100

        if temp_diff > threshold or rain_diff > threshold or hum_diff > threshold:
            anomalies.append({
                'index': index,
                'real_temp': real_temp, 'pred_temp': pred_temp, 'temp_diff (%)': round(temp_diff, 2),
                'real_rain': real_rain, 'pred_rain': pred_rain, 'rain_diff (%)': round(rain_diff, 2),
                'real_hum': real_hum, 'pred_hum': pred_hum, 'hum_diff (%)': round(hum_diff, 2)
            })

    return anomalies

# Location of the Excel workbook holding the real-time API readings
excel_file = "/content/drive/MyDrive/API.xlsx"  # Update with your actual file path
real_data = read_api_data(excel_file)

# Run every reading through the model, tolerating a 5% deviation
anomalies = compare_and_flag_anomalies(real_data, threshold=5)

# Report the outcome
if not anomalies:
    print("\nNo anomalies detected.")
else:
    print("\nAnomalies detected:")
    for anomaly in anomalies:
        print(anomaly)


   temperature  rainfall  humidity
0       27.308     0.187    78.416
1       26.020     0.217    82.014
2       26.849     0.246    82.014
3       27.517     0.247    77.404
4       10.504     0.121    78.301





Anomalies detected:
{'index': 0, 'real_temp': 27.308, 'pred_temp': 27.937106115099983, 'temp_diff (%)': 2.3, 'real_rain': 0.187, 'pred_rain': 23.761739147600007, 'rain_diff (%)': 12606.74, 'real_hum': 78.416, 'pred_hum': 89.35285765710005, 'hum_diff (%)': 13.95}
{'index': 1, 'real_temp': 26.02, 'pred_temp': 27.929859250099984, 'temp_diff (%)': 7.34, 'real_rain': 0.217, 'pred_rain': 23.711949767099995, 'rain_diff (%)': 10827.12, 'real_hum': 82.014, 'pred_hum': 89.51419158810006, 'hum_diff (%)': 9.15}
{'index': 2, 'real_temp': 26.849, 'pred_temp': 27.929859250099984, 'temp_diff (%)': 4.03, 'real_rain': 0.246, 'pred_rain': 23.711949767099995, 'rain_diff (%)': 9538.97, 'real_hum': 82.014, 'pred_hum': 89.51419158810006, 'hum_diff (%)': 9.15}
{'index': 3, 'real_temp': 27.517, 'pred_temp': 27.952697838699983, 'temp_diff (%)': 1.58, 'real_rain': 0.247, 'pred_rain': 23.733705879600002, 'rain_diff (%)': 9508.75, 'real_hum': 77.404, 'pred_hum': 89.34483382970005, 'hum_diff (%)': 15.43}
{'index':



In [None]:
from sklearn.metrics import r2_score, mean_absolute_error
import pandas as pd
import joblib
import numpy as np

# Load the trained model
# NOTE(review): joblib.load unpickles the file, and unpickling can execute
# arbitrary code -- only load weather_model.pkl from a trusted source.
model = joblib.load("weather_model.pkl")

# Read the real-time data
def read_api_data(csv_file):
    """Load the real-time readings workbook.

    Despite the parameter name, the path is handed to ``pd.read_excel``,
    so it must point at an Excel workbook.

    Args:
        csv_file: Path of the workbook to read.

    Returns:
        The DataFrame produced by ``pd.read_excel``.
    """
    return pd.read_excel(csv_file)

# Predict using the model
def predict_weather(temperature, rainfall, humidity):
    """Return the model's expected reading for one observation.

    Args:
        temperature: Observed temperature.
        rainfall: Observed rainfall.
        humidity: Observed humidity.

    Returns:
        The first (only) row of ``model.predict``: the predicted
        temperature, rainfall and humidity, in that order.
    """
    # The training cell fits the model on a DataFrame with these column
    # names. Passing a labelled single-row frame (rather than a bare array)
    # lets scikit-learn validate the feature names/order and avoids its
    # "X does not have valid feature names" warning.
    input_data = pd.DataFrame(
        [[temperature, rainfall, humidity]],
        columns=['temperature', 'rainfall', 'humidity'],
    )
    predicted_values = model.predict(input_data)[0]
    return predicted_values

# Compare predictions with real values
def evaluate_model(df):
    """Print R² and mean absolute error of the model on the given readings.

    Every row's temperature, rainfall and humidity are passed to
    ``predict_weather`` and the predictions are compared with those same
    observed values.

    Args:
        df: DataFrame with ``temperature``, ``rainfall`` and ``humidity``
            columns. Surrounding whitespace in the headers is ignored.

    Returns:
        None. The two metrics are printed; when ``df`` has no rows a notice
        is printed instead and no metric is computed.
    """
    # Headers may carry stray whitespace (e.g. 'rainfall '); normalise them
    # on a renamed copy instead of hard-coding the malformed key, so both
    # clean and unclean workbooks are accepted.
    df = df.rename(columns=lambda name: name.strip() if isinstance(name, str) else name)

    if len(df) == 0:
        # The metrics are undefined without samples
        print("No rows to evaluate.")
        return

    real_rows = []
    predicted_rows = []

    for _, row in df.iterrows():
        actual = [row['temperature'], row['rainfall'], row['humidity']]
        pred_temp, pred_rain, pred_hum = predict_weather(*actual)

        real_rows.append(actual)
        predicted_rows.append([pred_temp, pred_rain, pred_hum])

    # Shape (n_rows, 3) each: one column per feature
    real_values = np.array(real_rows)
    predicted_values = np.array(predicted_rows)

    # Calculate accuracy metrics
    r2 = r2_score(real_values, predicted_values)
    mae = mean_absolute_error(real_values, predicted_values)

    print(f"R² Score: {r2:.4f}")  # Closer to 1 is better
    print(f"Mean Absolute Error: {mae:.2f}")  # Lower is better

# Load real-time data
# (the variable is named csv_file, but the path points at an Excel workbook
# and read_api_data reads it with pd.read_excel)
csv_file = "/content/drive/MyDrive/API.xlsx"
real_data = read_api_data(csv_file)

# Evaluate the model
# Prints the R² score and mean absolute error for the loaded readings
evaluate_model(real_data)




R² Score: -18651.5224
Mean Absolute Error: 11.30




In [None]:
import numpy as np
import pandas as pd
from scipy.stats import entropy

# Load datasets
training_data = pd.read_excel("/content/drive/MyDrive/Kaggel.xlsx")  # Reference distribution
# NOTE(review): the other cells read the API workbook from
# /content/drive/MyDrive/API.xlsx -- confirm this relative path is intended.
api_data = pd.read_excel("API.xlsx")  # Real-time API data


def _clean_header(name):
    """Strip surrounding whitespace from a column label; leave non-strings as is."""
    return name.strip() if isinstance(name, str) else name


# Header cells can carry stray whitespace (e.g. 'rainfall '), which would make
# the feature lookups below raise KeyError, so normalise both frames' headers.
training_data = training_data.rename(columns=_clean_header)
api_data = api_data.rename(columns=_clean_header)

# Select relevant features (temperature, rainfall, humidity)
features = ["temperature", "rainfall", "humidity"]

# Function to calculate KL divergence
def kl_divergence(p_data, q_data, bins=20, feature_names=None, epsilon=1e-10):
    """Estimate the per-feature KL divergence D(P || Q) from two samples.

    Both samples are histogrammed on the same bin edges, which span the
    combined range of the two samples, so values of ``q_data`` that lie
    outside the range of ``p_data`` still count towards Q.

    Args:
        p_data: Reference sample; indexable by feature name (e.g. DataFrame).
        q_data: Sample to compare against the reference; same layout.
        bins: Bin specification forwarded to ``np.histogram_bin_edges``.
        feature_names: Features to compare. Defaults to the module-level
            ``features`` list.
        epsilon: Probability substituted for empty Q bins so the logarithm
            stays finite.

    Returns:
        Dict mapping each feature name to its KL divergence as a float.
        The value is NaN when either sample has no finite values for that
        feature.
    """
    if feature_names is None:
        feature_names = features  # fall back to the notebook-level feature list

    kl_values = {}
    for feature in feature_names:
        p_values = np.asarray(p_data[feature], dtype=float)
        q_values = np.asarray(q_data[feature], dtype=float)

        # np.histogram cannot derive a range from NaN/inf, so drop them
        p_values = p_values[np.isfinite(p_values)]
        q_values = q_values[np.isfinite(q_values)]

        if p_values.size == 0 or q_values.size == 0:
            kl_values[feature] = float("nan")
            continue

        # Shared edges over the combined range keep every observation of
        # both samples inside some bin
        bin_edges = np.histogram_bin_edges(
            np.concatenate([p_values, q_values]), bins=bins
        )
        p_counts, _ = np.histogram(p_values, bins=bin_edges)
        q_counts, _ = np.histogram(q_values, bins=bin_edges)

        # Counts -> probabilities; both sums are > 0 because every value
        # lies within the shared range
        p_prob = p_counts / p_counts.sum()
        q_prob = q_counts / q_counts.sum()

        # Replace zeros to avoid log(0) / division by zero in the ratio
        q_prob = np.where(q_prob == 0, epsilon, q_prob)

        # scipy's entropy(pk, qk) normalises its inputs and returns
        # sum(pk * log(pk / qk))
        kl_values[feature] = float(entropy(p_prob, q_prob))

    return kl_values

# Compare the API readings against the training distribution, feature by feature
kl_results = kl_divergence(training_data, api_data)

# Report one line per feature
for feature in kl_results:
    kl_value = kl_results[feature]
    print(f"KL Divergence for {feature}: {kl_value:.4f}")
