In [1]:
# Importing the pandas library to work with data structures like DataFrames and Series.
import pandas as pd

In [2]:
# Importing the numpy library for numerical computations and array manipulations.
import numpy as np

In [3]:
# Importing the os module to interact with the operating system for file and directory management.
import os

In [5]:
# Getting the current working directory path; relative paths used in this
# notebook (e.g. "FAOSTAT.csv") are resolved against it.
os.getcwd()

'd:\\1 DS PROJECTS\\Crop prediction'

In [6]:
# Reading the FAOSTAT.csv file (looked up relative to the working directory)
# into a DataFrame named 'data' using pandas.

data=pd.read_csv("FAOSTAT.csv")

In [None]:
# Displaying the first 20 rows of the 'data' DataFrame.
data.head(20)

In [8]:
# Discard, in place, every row of 'data' whose "Element" is "Stocks".
stocks_row_labels = data.loc[data["Element"] == "Stocks"].index
data.drop(index=stocks_row_labels, inplace=True)

In [9]:
# Impute missing 'Value' entries of the 'Area harvested' rows with the most
# frequent 'Value' observed among those same rows.

is_area_harvested = data['Element'] == 'Area harvested'
mode_area_harvested = data.loc[is_area_harvested, 'Value'].mode()[0]
data.loc[is_area_harvested & data['Value'].isna(), 'Value'] = mode_area_harvested

In [10]:
# Filling missing 'Value' entries in rows where 'Element' is 'Yield' with the
# most frequent 'Value' among those rows.
# The mode gets its own name so it is not mistaken for (and does not overwrite)
# the 'Area harvested' mode computed in the previous cell.

is_yield = data['Element'] == 'Yield'
mode_yield = data.loc[is_yield, 'Value'].mode()[0]
data.loc[is_yield & data['Value'].isna(), 'Value'] = mode_yield

In [11]:
# Filling missing 'Value' entries in rows where 'Element' is 'Production' with
# the most frequent 'Value' among those rows.
# The mode gets its own name so it is not mistaken for the 'Area harvested'
# mode computed in an earlier cell.

is_production = data['Element'] == 'Production'
mode_production = data.loc[is_production, 'Value'].mode()[0]
data.loc[is_production & data['Value'].isna(), 'Value'] = mode_production

In [None]:
# Checking the total number of missing (NaN) values in each column of the 'data' DataFrame.

data.isna().sum()

In [None]:
# Counting the number of non-null values in each column of the 'data' DataFrame.

data.count()

In [14]:
# Remove the "Note" column from 'data' (in place).

data.drop("Note", axis=1, inplace=True)

In [15]:
# Remove the "Year Code" column from 'data' (in place).

data.drop("Year Code", axis=1, inplace=True)

In [16]:
# Remove the "Flag" column from 'data' (in place).

data.drop("Flag", axis=1, inplace=True)

In [17]:
# Remove the "Flag Description" column from 'data' (in place).

data.drop("Flag Description", axis=1, inplace=True)

In [None]:
# Displaying the unique values in the "Element" column of the 'data' DataFrame.

data["Element"].unique()

In [20]:
# Discard, in place, the rows of 'data' that still have no 'Value'
# (the earlier mode fills only covered three of the elements).

data.dropna(subset=['Value'], inplace=True)

In [None]:
# Checking the total number of missing (NaN) values in each column of the 'data' DataFrame after dropping rows.

data.isna().sum()

In [22]:
# Spread 'Value' into one column per element: '<element>_Value' carries the
# row's 'Value' when the row's 'Element' equals that element and is left empty
# otherwise.
# Note: rows with Element == "Stocks" were removed in an earlier cell, so the
# 'Stocks_Value' column ends up without any values.

elements = [
    'Area harvested',
    'Yield',
    'Stocks',
    'Producing Animals/Slaughtered',
    'Laying',
    'Yield/Carcass Weight',
    'Milk Animals',
    'Production',
]

for element in elements:
    is_this_element = data['Element'].eq(element)
    data[element + '_Value'] = data['Value'].where(is_this_element, None)

In [23]:
# Give the area, yield and production columns their final names in one pass.
# NOTE(review): "Production in Hectares" looks suspect as a label - production
# is normally a quantity rather than an area; confirm against the 'Unit' column
# before that column is dropped.

data = data.rename(columns={
    "Area harvested_Value": "Area_Harvested_in_Hectares",
    "Yield_Value": "Yield_Value in kg/ha",
    "Production_Value": "Production in Hectares",
})

In [24]:
# Remove the "Element" column from 'data' (its information now lives in the
# per-element '*_Value' columns).

data.drop(columns="Element", inplace=True)

In [25]:
# Remove the "Value" column from 'data' (in place).
data.drop(columns="Value", inplace=True)

In [None]:
# Remove the "Unit" column from 'data' (in place).

data.drop(columns="Unit", inplace=True)

In [28]:
# Remove the "Element Code" column from 'data' (in place).

data.drop(columns="Element Code", inplace=True)

In [None]:
# Displaying the first 25 rows of the 'data' DataFrame.

data.head(25)

In [None]:
# Counting the number of non-null values in each column of the 'data' DataFrame.

data.count()

In [None]:
# Displaying the first 5 rows of the 'data' DataFrame.

data.head()

In [None]:
# Collapse 'data' to one row per (Domain, Area, Item, Item Code, Year) by
# summing each per-element value column, then preview the first 25 rows.
# sum() skips missing values, so a group with no entry for a column gets 0
# there rather than NaN.

group_keys = ['Domain', 'Area', 'Item', 'Item Code (CPC)', 'Year']

summed_columns = [
    'Area_Harvested_in_Hectares',
    'Yield_Value in kg/ha',
    'Producing Animals/Slaughtered_Value',
    'Laying_Value',
    'Yield/Carcass Weight_Value',
    'Milk Animals_Value',
    'Production in Hectares',
]

grouped_crop_Dataframe = (
    data.groupby(group_keys, as_index=False)
        .agg(dict.fromkeys(summed_columns, 'sum'))
)

grouped_crop_Dataframe.head(25)

In [106]:
# Spread of the target column relative to its mean (coefficient of variation).
production_totals = grouped_crop_Dataframe['Production in Hectares']

# Mean and standard deviation of the target column
mean_production = production_totals.mean()
std_production = production_totals.std()

# Standard deviation expressed as a percentage of the mean
std_percentage = std_production / mean_production * 100

In [None]:
std_percentage

In [None]:
# Flag outliers in the target column with the 1.5 * IQR rule.
production = grouped_crop_Dataframe['Production in Hectares']

# Quartiles and interquartile range
Q1 = production.quantile(0.25)
Q3 = production.quantile(0.75)
IQR = Q3 - Q1

# Values outside [lower_bound, upper_bound] count as outliers
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

is_outlier = (production < lower_bound) | (production > upper_bound)
outliers = grouped_crop_Dataframe[is_outlier]

# Report the bounds and the flagged rows
print("Lower bound:", lower_bound)
print("Upper bound:", upper_bound)
print("Outliers:")
print(outliers)

In [None]:
# Displaying the first 5 rows of the 'grouped_crop_Dataframe'.

grouped_crop_Dataframe.head()

In [None]:
# Rows of 'grouped_crop_Dataframe' whose summed "Production in Hectares" is exactly 0.0.
grouped_crop_Dataframe[grouped_crop_Dataframe['Production in Hectares']==0.0]

In [None]:
# Checking the total number of missing (NaN) values in each column of the 'grouped_crop_Dataframe'.

grouped_crop_Dataframe.isnull().sum()

In [113]:
# Shape of the DataFrame as a (number of rows, number of columns) tuple.
grouped_crop_Dataframe.shape

(87097, 12)

In [None]:
# This attribute provides the row index of the DataFrame.
grouped_crop_Dataframe.index

In [None]:
# The column labels of the DataFrame.
grouped_crop_Dataframe.columns

In [None]:
# Shows the data type of each column.
grouped_crop_Dataframe.dtypes 

In [None]:
grouped_crop_Dataframe['Year'].unique()
# Distinct values found in the 'Year' column (unique() is called on a single column).

In [None]:
grouped_crop_Dataframe['Area'].nunique()
# Number of distinct values in the 'Area' column (a count, not the values themselves).

In [None]:
# Displaying the number of unique values in each column of the 'grouped_crop_Dataframe'.
grouped_crop_Dataframe.nunique()

In [None]:
# Counting the number of non-null values in each column of the 'grouped_crop_Dataframe'.
grouped_crop_Dataframe.count()

In [None]:
grouped_crop_Dataframe.value_counts() # called on the whole DataFrame: counts how often each distinct row (combination of all column values) occurs

In [None]:
# Displaying concise summary information about the 'grouped_crop_Dataframe', 
# including the number of non-null entries and data types for each column.
grouped_crop_Dataframe.info()

In [123]:
# Importing the seaborn library for statistical data visualization.

import seaborn as sns

In [None]:
# Displaying the first 5 rows of the 'grouped_crop_Dataframe'.

grouped_crop_Dataframe.head()

In [125]:
# Importing the matplotlib library for creating static, animated, and interactive visualizations in Python.

import matplotlib.pyplot as plt

In [None]:

# Total "Area_Harvested_in_Hectares" per 'Area' (summed over every item and
# year), shown as a bar plot of the 10 largest totals with rotated x-axis
# labels for readability.

# Only the plotted column is aggregated. Summing the whole frame would also
# try to "sum" the text columns (Domain, Item, Item Code) and the Year column,
# which is wasteful and, depending on the pandas version, concatenates or
# silently drops them.
grouped_crop_DF = grouped_crop_Dataframe.groupby('Area', as_index=False)['Area_Harvested_in_Hectares'].sum()

# Get the top 10 areas by 'Area_Harvested_in_Hectares'
top_10_areas = grouped_crop_DF.nlargest(10, 'Area_Harvested_in_Hectares')

# Plot the barplot with the top 10 areas
plt.figure(figsize=(12, 6))
sns.barplot(x="Area", y="Area_Harvested_in_Hectares", data=top_10_areas, palette="viridis")
plt.xticks(rotation=45, ha='right')  # Rotate x-axis labels for better readability
plt.title("Top 10 Areas by Area Harvested in Hectares")
plt.xlabel("Area")
plt.ylabel("Area Harvested in Hectares")
plt.tight_layout()
plt.show()

In [None]:
# Total "Yield_Value in kg/ha" per 'Area' (summed over every item and year),
# shown as a bar plot of the 10 largest totals.

# Only the plotted column is aggregated. Summing the whole frame would also
# try to "sum" the text columns (Domain, Item, Item Code) and the Year column.
grouped_crop_DF = grouped_crop_Dataframe.groupby('Area', as_index=False)['Yield_Value in kg/ha'].sum()

# Get the top 10 areas by 'Yield_Value in kg/ha'
top_10_areas = grouped_crop_DF.nlargest(10, 'Yield_Value in kg/ha')

# Plot the barplot with the top 10 areas
plt.figure(figsize=(12, 6))
sns.barplot(x="Area", y="Yield_Value in kg/ha", data=top_10_areas, palette="viridis")
plt.xticks(rotation=45, ha='right')  # Rotate x-axis labels for better readability
plt.title("Top 10 Areas by Yield_Value in kg/ha")
plt.xlabel("Area")
plt.ylabel("Yield")
plt.tight_layout()
plt.show()

In [None]:
# Total "Production in Hectares" per 'Area' (summed over every item and year),
# shown as a bar plot of the 10 largest totals.

# Only the plotted column is aggregated. Summing the whole frame would also
# try to "sum" the text columns (Domain, Item, Item Code) and the Year column.
grouped_crop_DF = grouped_crop_Dataframe.groupby('Area', as_index=False)['Production in Hectares'].sum()

# Get the top 10 areas by 'Production in Hectares'
top_10_areas = grouped_crop_DF.nlargest(10, 'Production in Hectares')

# Plot the barplot with the top 10 areas
plt.figure(figsize=(12, 6))
sns.barplot(x="Area", y="Production in Hectares", data=top_10_areas, palette="viridis")
plt.xticks(rotation=45, ha='right')  # Rotate x-axis labels for better readability
plt.title("Top 10 Areas by Production in Hectares")
plt.xlabel("Area")
plt.ylabel("Production")
plt.tight_layout()
plt.show()

In [None]:
# Displaying the first 5 rows of the 'grouped_crop_Dataframe'.
grouped_crop_Dataframe.head()

In [None]:
# Displaying the first 5 rows of the 'grouped_crop_Dataframe' again.
# (No columns have been dropped from it: "Domain" and "Area" are still present.)
grouped_crop_Dataframe.head()

In [None]:
# Filtering the 'grouped_crop_Dataframe' to display rows where the "Production in Hectares" column has a value of 0.0.
grouped_crop_Dataframe[grouped_crop_Dataframe["Production in Hectares"]==0.0]

In [132]:
# Creating a new DataFrame 'DF1' that keeps only the rows whose
# "Production in Hectares" value is not 0.0.
# .copy() makes 'DF1' an independent DataFrame, so the column assignments made
# on it later do not operate on a slice of 'grouped_crop_Dataframe'
# (the cause of pandas' SettingWithCopyWarning).
DF1 = grouped_crop_Dataframe[grouped_crop_Dataframe["Production in Hectares"] != 0.0].copy()

In [None]:
# Displaying the content of the 'DF1' DataFrame, which contains rows where
#  "Production in Hectares" is not equal to 0.0.
DF1

In [None]:
# Filtering the 'DF1' DataFrame to display rows where the "Production in Hectares"
#  column has a value of 0.0. Since 'DF1' was already filtered to exclude these rows, 
# the result should be an empty DataFrame.
DF1[DF1["Production in Hectares"]==0.0]

In [None]:
# Displaying the first 5 rows of the 'DF1' DataFrame.
DF1.head()

In [None]:
# Checking the total number of missing (NaN) values in each column of the 'DF1' DataFrame.
DF1.isna().sum()

In [None]:
# Displaying the content of the 'DF1' DataFrame,
#  which contains rows where "Production in Hectares" is not equal to 0.0.
DF1

In [None]:
# Displaying the data types of each column in the 'DF1' DataFrame.
DF1.dtypes

In [None]:
 # Display unique values to identify the issue
DF1['Item Code (CPC)'].unique() 

In [None]:
# Filtering the 'DF1' DataFrame to display rows where the "Item Code (CPC)" column has the value '2351f'.
DF1[DF1['Item Code (CPC)']=='2351f']

In [None]:
# Normalise the one malformed code: entries equal to '2351f' in
# "Item Code (CPC)" become '2351'; every other entry is left untouched.
item_codes = DF1['Item Code (CPC)']
DF1['Item Code (CPC)'] = item_codes.replace({'2351f': '2351'})

In [None]:
# Displaying the data types of each column in the 'DF1' 
# DataFrame after replacing the value in the "Item Code (CPC)" column.
DF1.dtypes

In [None]:
# Cast "Item Code (CPC)" to a floating-point column so it can be used as a
# numeric feature (raises if any entry is still non-numeric).
DF1['Item Code (CPC)'] = DF1['Item Code (CPC)'].astype('float64')

In [None]:
DF1.dtypes

In [None]:
# Inspect the rows of 'DF1' in which every one of the listed feature columns
# equals 0. Nothing is modified here: the rows are only selected and displayed.

columns_to_zero = [
    'Area_Harvested_in_Hectares',
    'Yield_Value in kg/ha',
    'Producing Animals/Slaughtered_Value',
    'Laying_Value',
    'Yield/Carcass Weight_Value',
    'Milk Animals_Value',
]

# True for a row only when all listed columns are 0 in that row
all_listed_columns_zero = (DF1[columns_to_zero] == 0).all(axis=1)

rows_with_zero = DF1[all_listed_columns_zero]

rows_with_zero

In [174]:
# Saving the cleaned 'DF1' DataFrame as "cleansed_crop_data.csv" (path relative
# to the working directory), without writing the index as a column.
DF1.to_csv("cleansed_crop_data.csv",index=False)

In [145]:
# Importing the StandardScaler class from scikit-learn to standardize features by
#  removing the mean and scaling to unit variance.
from sklearn.preprocessing import StandardScaler

In [146]:
# Model inputs (x): the item code plus the six per-element value columns.
# Model target (y): the 'Production in Hectares' column.
feature_columns = [
    'Item Code (CPC)',
    'Area_Harvested_in_Hectares',
    'Yield_Value in kg/ha',
    'Producing Animals/Slaughtered_Value',
    'Laying_Value',
    'Yield/Carcass Weight_Value',
    'Milk Animals_Value',
]
x = DF1[feature_columns]
y = DF1['Production in Hectares']

In [147]:
# Importing the train_test_split function from scikit-learn to split the dataset into training and testing sets.
from sklearn.model_selection import train_test_split

In [148]:
# Importing the LinearRegression model from scikit-learn to perform linear regression for predicting continuous values.
from sklearn.linear_model import LinearRegression

In [149]:
# Initializing a LinearRegression model object to be used for training and making predictions.
model=LinearRegression()

In [150]:
# Splitting the dataset into training and testing sets with 80% for training and 20% for testing, 
# using a fixed random seed for reproducibility.
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)

In [151]:
# Training the LinearRegression model using the training data (x_train and y_train).
model.fit(x_train,y_train)

In [152]:
# Using the trained LinearRegression model to make predictions on the test data (x_test).
y_predict=model.predict(x_test)

In [None]:
# Displaying the predicted values (y_predict) generated by the model for the test data (x_test).
y_predict

In [86]:
# Importing the mean_squared_error and r2_score functions from scikit-learn to evaluate the performance of the model.
from sklearn.metrics import mean_squared_error, r2_score

In [154]:
# Score the linear model on the held-out test set, using its test predictions
# ('y_predict') for both metrics.
r2 = r2_score(y_test, y_predict)  # R-squared score
mse = mean_squared_error(y_test, y_predict)  # Mean Squared Error

In [None]:
r2

In [None]:
mse

In [157]:
# Importing the RandomForestRegressor model from scikit-learn to perform regression using an ensemble of decision trees.
from sklearn.ensemble import RandomForestRegressor

In [158]:
# Initializing a RandomForestRegressor model object to be used for training and
# making predictions with a random forest algorithm.
# random_state is fixed (same seed as the train/test split) so that the fitted
# forest, its scores and the saved model are reproducible across runs.
rf = RandomForestRegressor(random_state=42)

In [159]:
# Training the RandomForestRegressor on the same training split (x_train, y_train)
# that was used for the linear model.
rf.fit(x_train,y_train)

In [161]:
# One hand-written sample (a single row) with the same feature columns, in the
# same order, as the training features.

data_dict = {
    "Item Code (CPC)": [1371],
    "Area_Harvested_in_Hectares": [22134.0],
    "Yield_Value in kg/ha": [1775.9],
    "Producing Animals/Slaughtered_Value": [0.0],
    "Laying_Value": [0.0],
    "Yield/Carcass Weight_Value": [0.0],
    "Milk Animals_Value": [0.0],
}

# Each dictionary key becomes a column of the one-row DataFrame
data_df = pd.DataFrame.from_dict(data_dict)

In [162]:
#Using the trained RandomForestRegressor model to make predictions on the dataset 'data_df'.
y_pred = rf.predict(data_df)

In [163]:
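# Reference row for the sample above (Item Code 1371, Year 2020); the last
# value, 39307.00, appears to be its recorded production, to compare with the prediction:
#  1371	2020	22134.0	1775.9	0.0	0.0	0.0	0.0	39307.00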

In [None]:
# Displaying the predicted values (y_pred) generated by the RandomForestRegressor model for the input data (`data_df`).
y_pred 

In [165]:
# Evaluate the random forest on the held-out test set.
# The MSE must be computed from the forest's own predictions; 'y_predict' holds
# the linear model's predictions and would just repeat the linear model's MSE.
rf_test_predictions = rf.predict(x_test)
ran = rf.score(x_test, y_test)  # R-squared score
mse1 = mean_squared_error(y_test, rf_test_predictions)  # Mean Squared Error

In [None]:
ran

In [None]:
mse1

In [168]:
# Import PolynomialFeatures for generating polynomial feature sets
#from sklearn.preprocessing import PolynomialFeatures

In [169]:
# Create polynomial features of degree 2 and fit them to the training data
#poly = PolynomialFeatures(degree=2)
#X_poly = poly.fit(x_train,y_train)

In [None]:
# Install Streamlit for building web apps and Joblib for saving/loading models.
# %pip (rather than !pip) installs into the environment of the running kernel,
# so the packages are importable from this notebook.
%pip install streamlit joblib

In [172]:
# Import Joblib for saving and loading machine learning models
import joblib

In [None]:
# Save the trained Random Forest model to "model.pkl" using Joblib.
# The path is relative to the working directory (like the CSV export above),
# so the notebook also runs on machines without the original D:\ folder.
joblib.dump(rf, "model.pkl")
