In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the training workbook
file_path = "/content/drive/MyDrive/Kaggel.xlsx"  # adjust if the file lives elsewhere
df = pd.read_excel(file_path)

# Show which columns were loaded so the feature selection below can be checked
print("Columns in dataset:", df.columns)

# When a 'date' column is present, parse it and order the rows chronologically
if 'date' in df.columns:
    df = df.assign(date=pd.to_datetime(df['date'])).sort_values(by="date")

# Keep only the modelling columns and discard rows with missing values
features = ['temperature', 'rainfall', 'humidity']
df = df[features].dropna()

# Inputs and targets are deliberately the same three columns.
# NOTE(review): a model fitted on identical X and y is trained to reproduce its
# own input - confirm this is the intended validation design.
X = df[features]
y = df[features]

# Positional 80/20 split: the first 80% of rows train, the remainder tests
train_size = int(len(df) * 0.8)
X_train, y_train = X.iloc[:train_size], y.iloc[:train_size]
X_test, y_test = X.iloc[train_size:], y.iloc[train_size:]

# Report how many rows landed on each side of the split
print(f"Train size: {len(X_train)}, Test size: {len(X_test)}")


Columns in dataset: Index(['temperature', 'rainfall', 'humidity'], dtype='object')
Train size: 1760, Test size: 440


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
import joblib  # To save the trained model


# Positional hold-out: the first 80% of rows train, the remaining rows test.
# (df, X and y come from the data-loading cell above.)
train_size = int(0.8 * len(df))
X_train, y_train = X.iloc[:train_size], y.iloc[:train_size]
X_test, y_test = X.iloc[train_size:], y.iloc[train_size:]

# Fit a 100-tree random forest; random_state is fixed so reruns are repeatable
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the held-out rows with the mean absolute error
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error: {mae:.2f}")

# Persist the fitted model so later cells can reload it
joblib.dump(model, "weather_model.pkl")
print("Model saved as weather_model.pkl")

Mean Absolute Error: 2.38
Model saved as weather_model.pkl


In [None]:
from sklearn.metrics import r2_score, mean_absolute_error
import pandas as pd
import joblib
import numpy as np

# Load the trained model
# "weather_model.pkl" is the file written with joblib.dump in the training cell above.
# NOTE(review): joblib.load unpickles the file and can therefore run arbitrary code
# stored in it - only load model files that come from a trusted source.
model = joblib.load("weather_model.pkl")

# Read the real-time data
def read_api_data(csv_file):
    """Read the workbook at ``csv_file`` with ``pd.read_excel`` and return the DataFrame.

    Despite the parameter name, the file is parsed as an Excel workbook, not as CSV.
    """
    return pd.read_excel(csv_file)

# Predict using the model
def predict_weather(temperature, rainfall, humidity):
    """Return the model's prediction for a single reading.

    The notebook-level ``model`` is queried with one row. When the model was
    fitted on a DataFrame (it exposes ``feature_names_in_``), the row is passed
    as a DataFrame carrying those same column names; otherwise a plain array is
    used. Passing a bare array to a model fitted with feature names makes
    scikit-learn emit a warning on every call and skips its column check.

    Parameters
    ----------
    temperature, rainfall, humidity : float
        The observed values, in the order the model was trained on.

    Returns
    -------
    numpy.ndarray
        The first (and only) row of ``model.predict``'s output.
    """
    row = [[temperature, rainfall, humidity]]
    fitted_names = getattr(model, "feature_names_in_", None)
    if fitted_names is not None:
        input_data = pd.DataFrame(row, columns=fitted_names)
    else:
        input_data = np.array(row)
    predicted_values = model.predict(input_data)[0]
    return predicted_values

# Compare predictions with real values
def evaluate_model(df):
    """Print R² and mean absolute error of the model on ``df``.

    The 'temperature', 'rainfall' and 'humidity' columns are used both as the
    model input and as the reference values the predictions are compared with.
    All rows are predicted in a single ``model.predict`` call instead of one
    call per row, which gives the same predictions without the per-call
    overhead. Both metrics are computed over all three columns together.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'temperature', 'rainfall' and 'humidity'.

    Raises
    ------
    ValueError
        If ``df`` has no rows (the metrics are undefined for empty input).
    KeyError
        If one of the required columns is missing.
    """
    feature_columns = ['temperature', 'rainfall', 'humidity']
    if len(df) == 0:
        raise ValueError("evaluate_model needs at least one row of data")

    # Keep the DataFrame (with column names) for the model, and a float array
    # of the same values for the metric functions.
    observed = df[feature_columns]
    real_values = observed.to_numpy(dtype=float)
    predicted_values = model.predict(observed)

    # Calculate accuracy metrics
    r2 = r2_score(real_values, predicted_values)
    mae = mean_absolute_error(real_values, predicted_values)

    print(f"R² Score: {r2:.4f}")  # Closer to 1 is better
    print(f"Mean Absolute Error: {mae:.2f}")  # Lower is better

# Load real-time data
# NOTE: the variable is named csv_file, but the path points to an .xlsx workbook
# and read_api_data opens it with pd.read_excel.
csv_file = "/content/drive/MyDrive/THRD.xlsx"
real_data = read_api_data(csv_file)

# Evaluate the model
# evaluate_model prints the R² score and the mean absolute error.
evaluate_model(real_data)



R² Score: 0.3717
Mean Absolute Error: 2.80


In [None]:
import pandas as pd
import joblib
import numpy as np

# Load the trained model
# "weather_model.pkl" is the file written with joblib.dump in the training cell above.
# NOTE(review): joblib.load unpickles the file and can therefore run arbitrary code
# stored in it - only load model files that come from a trusted source.
model = joblib.load("weather_model.pkl")

# Step 1: Read the real-time data from the Excel file
def read_api_data(excel_file):
    """Load the real-time readings from an Excel workbook.

    Surrounding whitespace is removed from every string column header so the
    columns can be looked up by their clean names. Headers that are not strings
    (for example numeric headers) are left unchanged; the ``.str.strip()``
    accessor would turn them into NaN, or raise when no header is a string.
    The first rows are printed as a quick sanity check.

    Parameters
    ----------
    excel_file : str or path-like
        Location of the workbook, passed straight to ``pd.read_excel``.

    Returns
    -------
    pandas.DataFrame
        The loaded data with cleaned column headers.
    """
    # Load the Excel file
    df = pd.read_excel(excel_file)

    # Trim string headers only, so non-string headers survive untouched
    df.columns = [
        name.strip() if isinstance(name, str) else name
        for name in df.columns
    ]

    # Display the first few rows to ensure data is correct
    print(df.head())

    return df

# Step 2: Predict expected values using the model
def predict_weather(temperature, rainfall, humidity):
    """Return the model's expected (temperature, rainfall, humidity) for one reading.

    The notebook-level ``model`` is queried with one row. When the model was
    fitted on a DataFrame (it exposes ``feature_names_in_``), the row is passed
    as a DataFrame carrying those same column names; otherwise a plain array is
    used. Passing a bare array to a model fitted with feature names makes
    scikit-learn emit a warning on every call and skips its column check.

    Parameters
    ----------
    temperature, rainfall, humidity : float
        The observed values, in the order the model was trained on.

    Returns
    -------
    tuple
        ``(predicted_temp, predicted_rain, predicted_hum)`` unpacked from the
        first row of ``model.predict``'s output.
    """
    # Prepare the input for the model
    row = [[temperature, rainfall, humidity]]
    fitted_names = getattr(model, "feature_names_in_", None)
    if fitted_names is not None:
        input_data = pd.DataFrame(row, columns=fitted_names)
    else:
        input_data = np.array(row)

    # Predict values using the trained model
    predicted_values = model.predict(input_data)

    # Extract the predicted values
    predicted_temp, predicted_rain, predicted_hum = predicted_values[0]

    return predicted_temp, predicted_rain, predicted_hum

# Step 3: Compare real-time data with predicted values and flag anomalies
def compare_and_flag_anomalies(df, threshold=5):
    """Flag rows whose readings differ from the model's prediction by more than ``threshold`` percent.

    Every row's 'temperature', 'rainfall' and 'humidity' values are fed to the
    notebook-level ``model`` in one batch call, and the absolute difference
    between reading and prediction is expressed as a percentage of the reading.
    A row is reported when any of the three percentages exceeds ``threshold``.

    The percentage is taken relative to the magnitude of the reading
    (``abs(real) + 1e-6``). With the signed reading as denominator, a negative
    reading would yield a negative percentage that can never exceed the
    threshold, so such rows were silently skipped.

    Note: a reading of exactly 0 leaves only the 1e-6 guard in the denominator,
    so any non-zero prediction produces a very large percentage and is flagged.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'temperature', 'rainfall' and 'humidity'.
    threshold : float, default 5
        Percentage difference above which a feature is considered anomalous.

    Returns
    -------
    list of dict
        One dict per flagged row with the row's index label, the real and
        predicted value of each feature, and the percentage differences
        rounded to two decimals. Empty when ``df`` has no rows.
    """
    columns = ['temperature', 'rainfall', 'humidity']
    if len(df) == 0:
        return []

    # Predict all rows at once; keeping the DataFrame preserves the column names
    observed = df[columns].astype(float)
    real_values = observed.to_numpy()
    predicted = model.predict(observed)

    # Percentage differences relative to the magnitude of the real reading
    pct_diffs = np.abs(real_values - predicted) / (np.abs(real_values) + 1e-6) * 100

    anomalies = []
    for index, real, pred, diff in zip(df.index, real_values, predicted, pct_diffs):
        # Report the row when at least one feature exceeds the threshold
        if (diff > threshold).any():
            anomalies.append({
                'index': index,
                'real_temp': real[0], 'pred_temp': pred[0], 'temp_diff (%)': round(diff[0], 2),
                'real_rain': real[1], 'pred_rain': pred[1], 'rain_diff (%)': round(diff[1], 2),
                'real_hum': real[2], 'pred_hum': pred[2], 'hum_diff (%)': round(diff[2], 2)
            })

    return anomalies

# Location of the Excel workbook with the real-time readings
excel_file = "/content/drive/MyDrive/THRD.xlsx"  # point this at your own file
real_data = read_api_data(excel_file)

# Flag every row where a feature deviates by more than 5 percent
anomalies = compare_and_flag_anomalies(real_data, threshold=5)

# Report the outcome, one flagged row per line
if not anomalies:
    print("\nNo anomalies detected.")
else:
    print("\nAnomalies detected:")
    for anomaly in anomalies:
        print(anomaly)


   temperature  rainfall  humidity
0       27.929   213.516        74
1       26.716    93.400        74
2       30.558   169.275        72
3       25.916    67.875        77
4       25.287   281.041        78





Anomalies detected:
{'index': 0, 'real_temp': 27.929, 'pred_temp': 27.349437975499978, 'temp_diff (%)': 2.08, 'real_rain': 213.516, 'pred_rain': 211.30321611500003, 'rain_diff (%)': 1.04, 'real_hum': 74.0, 'pred_hum': 83.61984733790001, 'hum_diff (%)': 13.0}
{'index': 1, 'real_temp': 26.716, 'pred_temp': 24.246391423599995, 'temp_diff (%)': 9.24, 'real_rain': 93.4, 'pred_rain': 93.66910319539997, 'rain_diff (%)': 0.29, 'real_hum': 74.0, 'pred_hum': 74.08185330669998, 'hum_diff (%)': 0.11}
{'index': 3, 'real_temp': 25.916, 'pred_temp': 25.650850374199962, 'temp_diff (%)': 1.02, 'real_rain': 67.875, 'pred_rain': 67.37948595310002, 'rain_diff (%)': 0.73, 'real_hum': 77.0, 'pred_hum': 81.10309091359993, 'hum_diff (%)': 5.33}
{'index': 4, 'real_temp': 25.287, 'pred_temp': 25.25554333900003, 'temp_diff (%)': 0.12, 'real_rain': 281.041, 'pred_rain': 281.0992803389999, 'rain_diff (%)': 0.02, 'real_hum': 78.0, 'pred_hum': 82.11809860069994, 'hum_diff (%)': 5.28}
{'index': 6, 'real_temp': 21.82



In [None]:
import numpy as np
import pandas as pd
from scipy.stats import entropy

# Load datasets
# Both workbooks are read as-is; no rows are dropped and no headers are cleaned here.
training_data = pd.read_excel( "/content/drive/MyDrive/Kaggel.xlsx" )  # Reference distribution
api_data = pd.read_excel( "/content/drive/MyDrive/THRD.xlsx")  # Real-time API data

# Select relevant features (temperature, rainfall, humidity)
# kl_divergence below reads this module-level list to decide which columns to compare.
features = ["temperature", "rainfall", "humidity"]

# Function to calculate KL divergence
def kl_divergence(p_data, q_data, bins=20, feature_names=None):
    """Estimate the KL divergence D(p || q) per feature from histograms.

    For every feature, both samples are binned on the SAME edges, computed over
    the pooled values of the two samples. Deriving the edges from ``p_data``
    alone silently discards every ``q_data`` value outside the reference range,
    and yields NaN when all of them fall outside it. Missing values are removed
    before binning because ``np.histogram`` cannot handle NaN.

    Empty bins of the comparison histogram are replaced by 1e-10 so the
    logarithm stays finite. ``scipy.stats.entropy`` normalises both histograms
    and returns the divergence in nats.

    Parameters
    ----------
    p_data : mapping of column name -> array-like (e.g. DataFrame)
        Reference sample.
    q_data : mapping of column name -> array-like (e.g. DataFrame)
        Sample compared against the reference.
    bins : int, str or sequence, default 20
        Bin specification forwarded to ``np.histogram_bin_edges``.
    feature_names : iterable of str, optional
        Columns to compare. Defaults to the notebook-level ``features`` list.

    Returns
    -------
    dict
        Maps each feature name to its KL divergence.

    Raises
    ------
    ValueError
        If a feature has no non-missing values in either sample.
    """
    if feature_names is None:
        feature_names = features  # notebook-level default

    kl_values = {}
    for feature in feature_names:
        p_values = np.asarray(p_data[feature], dtype=float)
        q_values = np.asarray(q_data[feature], dtype=float)
        p_values = p_values[~np.isnan(p_values)]
        q_values = q_values[~np.isnan(q_values)]
        if p_values.size == 0 or q_values.size == 0:
            raise ValueError(f"No non-missing values available for feature '{feature}'")

        # Shared edges spanning both samples, so no observation is dropped
        bin_edges = np.histogram_bin_edges(np.concatenate([p_values, q_values]), bins=bins)

        # Compute histograms (probability densities) on the common edges
        p_hist, _ = np.histogram(p_values, bins=bin_edges, density=True)
        q_hist, _ = np.histogram(q_values, bins=bin_edges, density=True)

        # Replace zeros to avoid log(0) errors
        q_hist = np.where(q_hist == 0, 1e-10, q_hist)

        # Compute KL divergence
        kl_values[feature] = entropy(p_hist, q_hist)

    return kl_values

# Calculate KL divergence for each feature
# training_data is the reference (p) and api_data the comparison sample (q).
kl_results = kl_divergence(training_data, api_data)

# Print results
# One line per feature, formatted to four decimals.
for feature, kl_value in kl_results.items():
    print(f"KL Divergence for {feature}: {kl_value:.4f}")


KL Divergence for temperature: 3.2584
KL Divergence for rainfall: 4.8245
KL Divergence for humidity: 13.9556


In [None]:
import numpy as np
import pandas as pd
from scipy.stats import entropy

# Load datasets
training_data = pd.read_excel( "/content/drive/MyDrive/Kaggel 1.xlsx" )  # Reference distribution
api_data = pd.read_excel( "/content/drive/MyDrive/API 1.xlsx")  # Real-time API data


def _strip_header(name):
    """Return ``name`` without surrounding whitespace; non-string headers pass through."""
    return name.strip() if isinstance(name, str) else name


# Strip spaces from column names in the API data
api_data = api_data.rename(columns=_strip_header)

# Now check the columns again to ensure they match
print("API Data Columns after stripping spaces:", api_data.columns)

# Print the column names to check for issues
# (training headers are shown exactly as read, before they are cleaned below)
print("Training Data Columns:", training_data.columns)
print("API Data Columns:", api_data.columns)


# Select relevant features (temperature, rainfall, humidity)
features = ["temperature", "rainfall", "humidity"]


# Clean the training headers the same way as the API headers. This covers the
# 'rainfall ' case handled before and any other padded header, so both frames
# expose identical column names to kl_divergence.
training_data = training_data.rename(columns=_strip_header)


# Function to calculate KL divergence
def kl_divergence(p_data, q_data, bins=20, feature_names=None):
    """Estimate the KL divergence D(p || q) per feature from histograms.

    For every feature, both samples are binned on the SAME edges, computed over
    the pooled values of the two samples. Deriving the edges from ``p_data``
    alone silently discards every ``q_data`` value outside the reference range,
    and yields NaN when all of them fall outside it. Missing values are removed
    before binning because ``np.histogram`` cannot handle NaN.

    Empty bins of the comparison histogram are replaced by 1e-10 so the
    logarithm stays finite. ``scipy.stats.entropy`` normalises both histograms
    and returns the divergence in nats.

    Parameters
    ----------
    p_data : mapping of column name -> array-like (e.g. DataFrame)
        Reference sample.
    q_data : mapping of column name -> array-like (e.g. DataFrame)
        Sample compared against the reference.
    bins : int, str or sequence, default 20
        Bin specification forwarded to ``np.histogram_bin_edges``.
    feature_names : iterable of str, optional
        Columns to compare. Defaults to the notebook-level ``features`` list.

    Returns
    -------
    dict
        Maps each feature name to its KL divergence.

    Raises
    ------
    ValueError
        If a feature has no non-missing values in either sample.
    """
    if feature_names is None:
        feature_names = features  # notebook-level default

    kl_values = {}
    for feature in feature_names:
        p_values = np.asarray(p_data[feature], dtype=float)
        q_values = np.asarray(q_data[feature], dtype=float)
        p_values = p_values[~np.isnan(p_values)]
        q_values = q_values[~np.isnan(q_values)]
        if p_values.size == 0 or q_values.size == 0:
            raise ValueError(f"No non-missing values available for feature '{feature}'")

        # Shared edges spanning both samples, so no observation is dropped
        bin_edges = np.histogram_bin_edges(np.concatenate([p_values, q_values]), bins=bins)

        # Compute histograms (probability densities) on the common edges
        p_hist, _ = np.histogram(p_values, bins=bin_edges, density=True)
        q_hist, _ = np.histogram(q_values, bins=bin_edges, density=True)

        # Replace zeros to avoid log(0) errors
        q_hist = np.where(q_hist == 0, 1e-10, q_hist)

        # Compute KL divergence
        kl_values[feature] = entropy(p_hist, q_hist)

    return kl_values

# Calculate KL divergence for each feature
# training_data is the reference (p) and api_data the comparison sample (q).
kl_results = kl_divergence(training_data, api_data)

# Print results
# One line per feature, formatted to four decimals.
for feature, kl_value in kl_results.items():
    print(f"KL Divergence for {feature}: {kl_value:.4f}")


API Data Columns after stripping spaces: Index(['temperature', 'rainfall', 'humidity'], dtype='object')
Training Data Columns: Index(['temperature', 'rainfall', 'humidity'], dtype='object')
API Data Columns: Index(['temperature', 'rainfall', 'humidity'], dtype='object')
KL Divergence for temperature: 5.9728
KL Divergence for rainfall: nan
KL Divergence for humidity: 8.5639


  return n/db/n.sum(), bin_edges
