In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Loading the data

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Path of the dataset CSV inside the Kaggle input directory
# (a local file path, despite the variable being called `url`).
url = "/kaggle/input/crop-water-requirement/DATASET - Sheet1.csv"

In [None]:
# Load the dataset CSV into a DataFrame.
df = pd.read_csv(url)

In [None]:
# Display the first five rows of the dataset.
df.head(5)

In [None]:
# Display the last five rows of the dataset.
df.tail(5)

In [None]:
# Display the 'WATER REQUIREMENT' column on its own.
df['WATER REQUIREMENT']

# **Describing the data**

In [None]:
# Summary statistics for the DataFrame's columns (by default pandas describes
# only the numeric columns when the frame also holds non-numeric ones).
df.describe()

In [None]:
# Dimensions of the dataset as a (rows, columns) tuple.
df.shape

In [None]:
# Print a concise summary of the frame: index range, column labels,
# non-null count and dtype of each column, and memory usage.
df.info()

In [None]:
# Count the distinct values in each column (missing values are not counted by default).
df.nunique()

# DATA WRANGLING

In [None]:
# Count the missing values (NaN / None) in each column.
# A bare df.isna() only yields a cell-by-cell boolean mask of the whole frame,
# which does not show the per-column totals this step is meant to report.
df.isna().sum()

In [None]:
# Check for missing data: number of null entries per column
# (isnull() is an alias of isna()).
df.isnull().sum()

# This indicates that there are no missing values 

# Data Analysis

In [None]:
# Partition the columns by dtype: object (text) columns go into one frame,
# every other column into the other.
my_object_df = df.select_dtypes(include=['object'])
my_numeric_df = df.select_dtypes(exclude=['object'])

In [None]:
# Preview the non-object columns; head() limits the output to the first five
# rows instead of rendering the whole frame.
my_numeric_df.head()

In [None]:
# Preview the object-dtype columns; head() limits the output to the first five
# rows instead of rendering the whole frame.
my_object_df.head()

# One-Hot Encoding

In [None]:
# One-hot encode the object-dtype columns. drop_first=True omits the first level
# of each encoded column, so k categories become k-1 indicator columns.
df_object_dummies = pd.get_dummies(my_object_df, drop_first = True)

In [None]:
# Cast the indicator columns to integers so they hold 0/1 values.
df_object_dummies = df_object_dummies.astype(int)

# Preview the first five rows only, rather than rendering the whole encoded frame.
df_object_dummies.head()

In [None]:
# Place the non-object columns and the encoded indicator columns side by side.
final_df = pd.concat(objs=[my_numeric_df, df_object_dummies], axis="columns")

In [None]:
# Preview the combined frame; head() limits the output to the first five rows
# instead of rendering the whole frame.
final_df.head()

In [None]:
# Correlation of every column with 'WATER REQUIREMENT', sorted in ascending order.
final_df.corr().loc[:, 'WATER REQUIREMENT'].sort_values()

**Correlation**

In [None]:
# Pairwise correlations between all columns of the encoded frame.
correlation_matrix = final_df.corr()

# Order the columns by their correlation with 'WATER REQUIREMENT' (ascending).
sorted_correlation = correlation_matrix['WATER REQUIREMENT'].sort_values()

# Rearrange both the rows and the columns of the matrix to follow that order.
subset_corr_matrix = correlation_matrix.loc[sorted_correlation.index, sorted_correlation.index]

# Draw the heatmap on an explicitly created figure and axes.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(subset_corr_matrix, cmap='coolwarm', linewidths=0.5, ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()

# The correlation of water requirement with other columns

In [None]:
# Correlation of each column with 'WATER REQUIREMENT', smallest value first.
correlation_series = final_df.corr()['WATER REQUIREMENT'].sort_values()

# Bar chart of those correlations, drawn on an explicitly created axes.
fig, ax = plt.subplots(figsize=(12, 8))
correlation_series.plot.bar(ax=ax)
ax.set_title('Correlation with WATER REQUIREMENT')
ax.set_xlabel('Column')
ax.set_ylabel('Correlation')
# Keep the column names vertical so long labels do not overlap.
ax.tick_params(axis='x', labelrotation=90)
plt.show()

# Split Data into Training and Testing Sets

In [None]:
# Target: the 'WATER REQUIREMENT' column; features: every remaining column.
y = final_df['WATER REQUIREMENT']
X = final_df.drop(columns='WATER REQUIREMENT')

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest of 100 trees. min_samples_leaf and min_samples_split bound how
# small a leaf / a node that may still be split can get.
# random_state is fixed (same seed as the train/test split) so the bootstrap
# sampling inside the forest is repeatable; without it the fitted model and the
# reported metrics change on every run.
model = RandomForestRegressor(
    n_estimators=100,
    min_samples_leaf=4,
    min_samples_split=10,
    random_state=42,
)


In [None]:
# Fit the model on the training split.
model.fit(X_train, y_train)

# Model Evaluation

In [None]:
from sklearn.metrics import mean_squared_error,mean_absolute_error, r2_score

# Score the fitted model on the held-out test split.
y_pred = model.predict(X_test)

# Compute all three metrics first, then report them together:
#   MAE - mean absolute error
#   MSE - mean squared error
#   R²  - coefficient of determination
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Absolute Error (MAE): {mae}')
print(f'Mean Squared Error (MSE): {mse}')
print(f'R-squared (R²): {r2}')