# Data Cleaning
## Assignment 
### Process Data Climate Change on Crop
### Step 1: Import Libraries Start by importing the necessary libraries:

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

### Step 2: Load the Dataset Load the dataset into a pandas DataFrame:

In [3]:
# Load the dataset
df = pd.read_csv('climate_change_agriculture_dataset.csv')  # Make sure the file path is correct
# Display the first few rows
df.head()

Unnamed: 0,Temperature,Precipitation,CO2 Levels,Crop Yield,Soil Health,Extreme Weather Events,Crop Disease Incidence,Water Availability,Food Security,Economic Impact
0,7,59,329,483,10,Drought,Low,High,Low,High
1,39,20,426,679,8,Heatwave,High,Low,High,Low
2,18,46,403,587,5,Flood,Low,Medium,Low,Medium
3,9,91,356,220,5,Heatwave,Medium,Medium,High,Medium
4,35,12,325,538,1,Storm,Medium,Medium,High,High


### Step 3: Explore the Dataset Understand the structure of the data:

In [4]:
# Get a summary of the dataset
df.info()

# Get a statistical summary of the numerical columns
df.describe()

# Check for missing values
df.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Temperature             1000 non-null   int64 
 1   Precipitation           1000 non-null   int64 
 2   CO2 Levels              1000 non-null   int64 
 3   Crop Yield              1000 non-null   int64 
 4   Soil Health             1000 non-null   int64 
 5   Extreme Weather Events  1000 non-null   object
 6   Crop Disease Incidence  1000 non-null   object
 7   Water Availability      1000 non-null   object
 8   Food Security           1000 non-null   object
 9   Economic Impact         1000 non-null   object
dtypes: int64(5), object(5)
memory usage: 78.3+ KB


Temperature               0
Precipitation             0
CO2 Levels                0
Crop Yield                0
Soil Health               0
Extreme Weather Events    0
Crop Disease Incidence    0
Water Availability        0
Food Security             0
Economic Impact           0
dtype: int64

In [6]:
# Check for missing values
print(df.isnull().sum())

# Fill missing values for numerical columns with the mean
numeric_cols = df.select_dtypes(include=['number']).columns
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())

# For categorical columns, fill missing values with the mode
for col in df.select_dtypes(include=['object']).columns:
    df[col].fillna(df[col].mode()[0], inplace=True)

Temperature               0
Precipitation             0
CO2 Levels                0
Crop Yield                0
Soil Health               0
Extreme Weather Events    0
Crop Disease Incidence    0
Water Availability        0
Food Security             0
Economic Impact           0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)


In [8]:
from sklearn.preprocessing import StandardScaler

# Initialize the scaler
scaler = StandardScaler()

# Select numerical columns for scaling
numerical_columns = ['Temperature', 'Precipitation', 'CO2 Levels']

# Apply the scaler to the selected columns
df[numerical_columns] = scaler.fit_transform(df[numerical_columns])

# View the scaled data
print(df.head())


   Temperature  Precipitation  CO2 Levels  Crop Yield  Soil Health  \
0    -1.215649       0.372264   -1.277600         483           10   
1     0.930126      -0.936280    0.420428         679            8   
2    -0.478038      -0.063917    0.017803         587            5   
3    -1.081538       1.445941   -0.804953         220            5   
4     0.661905      -1.204699   -1.347622         538            1   

  Extreme Weather Events Crop Disease Incidence Water Availability  \
0                Drought                    Low               High   
1               Heatwave                   High                Low   
2                  Flood                    Low             Medium   
3               Heatwave                 Medium             Medium   
4                  Storm                 Medium             Medium   

  Food Security Economic Impact  
0           Low            High  
1          High             Low  
2           Low          Medium  
3          High       

In [10]:
from scipy import stats
import numpy as np

# Remove rows where Z-score is greater than 3 (or less than -3)
df_no_outliers = df[(np.abs(stats.zscore(df[['Temperature', 'Precipitation', 'CO2 Levels']])) < 3).all(axis=1)]


In [12]:
# Create an interaction feature between Temperature and Precipitation
df['Temp_Precip_Interaction'] = df['Temperature'] * df['Precipitation']

# Optionally, create polynomial features for non-linear relationships
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=2)
X_poly = poly.fit_transform(df[['Temperature', 'Precipitation']])


In [13]:
# One-Hot Encoding for categorical variables
df = pd.get_dummies(df, columns=['Extreme Weather Events', 'Soil Health'], drop_first=True)

# View the updated DataFrame
print(df.head())


   Temperature  Precipitation  CO2 Levels  Crop Yield Crop Disease Incidence  \
0    -1.215649       0.372264   -1.277600         483                    Low   
1     0.930126      -0.936280    0.420428         679                   High   
2    -0.478038      -0.063917    0.017803         587                    Low   
3    -1.081538       1.445941   -0.804953         220                 Medium   
4     0.661905      -1.204699   -1.347622         538                 Medium   

  Water Availability Food Security Economic Impact  Temp_Precip_Interaction  \
0               High           Low            High                -0.452542   
1                Low          High             Low                -0.870859   
2             Medium           Low          Medium                 0.030555   
3             Medium          High          Medium                -1.563839   
4             Medium          High            High                -0.797396   

   Extreme Weather Events_Flood  ...  Extrem

In [14]:
from sklearn.model_selection import train_test_split

# Define features and target variable
X = df[['Temperature', 'Precipitation', 'CO2 Levels', 'Temp_Precip_Interaction']]
y = df['Crop Yield']

# Split the data into training and testing sets (70% train, 30% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [15]:
# Save the processed DataFrame to a CSV file
df.to_csv('processed_data.csv', index=False)