# In [1]:
import pandas as pd
import numpy as np


# Columns that receive mean imputation followed by IQR outlier filtering
columns_to_clean = ['Sunshine hours', 'Fertilizer']  # adjust to match the CSV header

# Read the raw production data into a DataFrame
file_path = 'AllDistrict_Production.csv'
data = pd.read_csv(file_path)

# Function to remove outliers using the IQR method
def remove_outliers(df, column, factor=1.5):
    """Return the rows of *df* whose *column* value lies within the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, both
    inclusive, where Q1 and Q3 are the 25th and 75th percentiles of
    ``df[column]`` and ``IQR = Q3 - Q1``.

    Args:
        df: DataFrame to filter. It is not modified.
        column: Name of the column whose values are tested.
        factor: Multiplier applied to the IQR to position the fences.
            Defaults to 1.5; a larger value keeps more rows.

    Returns:
        A new DataFrame containing only the rows inside the fences, with
        their original index labels. Rows where *column* is NaN are dropped
        too, because NaN never satisfies the range test.
    """
    q1, q3 = df[column].quantile([0.25, 0.75])
    spread = q3 - q1
    lower_bound = q1 - factor * spread
    upper_bound = q3 + factor * spread
    # Series.between is inclusive on both ends by default, matching >= / <=.
    in_range = df[column].between(lower_bound, upper_bound)
    return df[in_range]

# Function to fill missing values in a column with the mean value
def fill_missing_values(df, column):
    """Replace missing values in ``df[column]`` with the column's mean.

    The DataFrame is modified in place; nothing is returned.

    Args:
        df: DataFrame whose column is updated.
        column: Name of the column to fill. The mean is computed from the
            column's non-missing values.
    """
    mean_value = df[column].mean()
    # Assign the filled Series back to the frame. Calling
    # fillna(inplace=True) on df[column] is chained assignment: it is
    # deprecated in pandas 2.x and leaves df unchanged under Copy-on-Write.
    df[column] = df[column].fillna(mean_value)

# Clean every requested column, reporting any the dataset does not contain
for column in columns_to_clean:
    if column not in data.columns:
        print(f"The column '{column}' is not found in the dataset.")
        continue

    # Impute gaps with the column mean, then drop rows that fall outside
    # the IQR fences for this column (rows are removed from the whole frame)
    fill_missing_values(data, column)
    data = remove_outliers(data, column)

# Write the cleaned frame to a new CSV, omitting the index column
cleaned_file_path = 'AllDistrict_Production_Cleaned_mean.csv'
data.to_csv(cleaned_file_path, index=False)

print(f"Missing values filled, outliers removed, and cleaned dataset saved to '{cleaned_file_path}'")


# Output: Missing values filled, outliers removed, and cleaned dataset saved to 'AllDistrict_Production_Cleaned_mean.csv'
