### Imports:

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Apply matplotlib's built-in "ggplot" style to every figure drawn below.
plt.style.use("ggplot")

from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, r2_score, accuracy_score

import warnings

# NOTE: this is a process-wide filter with no category restriction, so it also
# hides deprecation and convergence warnings raised by the libraries used below.
warnings.filterwarnings("ignore")  # Suppress all warnings

In [None]:
# Load the dataset; the relative path is resolved against the current working
# directory.
df = pd.read_csv("yield_df.csv")
# Preview the first five rows.
df.head()

In [None]:
# Remove the "Unnamed: 0" column in place (presumably an index column that was
# written into the CSV -- verify against the raw file), then preview the result.
df.drop(columns=["Unnamed: 0"], inplace=True)
df.head()

In [None]:
# Print a summary of the DataFrame: column names, non-null counts and dtypes.
df.info()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Count fully duplicated rows; `duplicated()` marks every repeat of an earlier
# row, so the first occurrence of each row is not counted.
num_duplicates = df.duplicated().sum()
print(f"The DataFrame contains {num_duplicates} duplicate rows.")

In [None]:
# Drop duplicate rows in place (the first occurrence of each row is kept),
# then re-count to confirm none remain.
df.drop_duplicates(inplace=True)
num_duplicates = df.duplicated().sum()
print(f"The DataFrame now contains {num_duplicates} duplicate rows.")

In [None]:
# (rows, columns) after de-duplication.
df.shape

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric
# columns.
df.describe()

In [None]:
# Spearman rank correlation between the numeric columns; non-numeric columns
# are excluded before the correlation is computed.
numeric_cols = df.select_dtypes(include="number")
corr_matrix = numeric_cols.corr(method="spearman")
corr_matrix

In [None]:
# Data Visualization

In [None]:
# Number of distinct values in the "Area" column.
len(df["Area"].unique())

In [None]:
# Number of distinct values in the "Item" column.
len(df["Item"].unique())

In [None]:
# Horizontal count plot: one bar per "Area" value, bar length = number of rows.
plt.figure(figsize=(15, 20))
sns.countplot(y=df["Area"])
plt.show()

In [None]:
# Horizontal count plot: one bar per "Item" value, bar length = number of rows.
plt.figure(figsize=(15, 20))
sns.countplot(data=df, y="Item")

# Title, axis labels and tick formatting.
plt.title("Distribution of Items")
plt.ylabel("Crop Types")
plt.xlabel("Number of entries")
plt.xticks(rotation=45)  # tilt the x-axis tick labels
plt.tight_layout()  # tighten the spacing around the axes

plt.show()

In [None]:
# Number of "Area" values that occur in fewer than 400 rows.
(df["Area"].value_counts() < 400).sum()

In [None]:
# Total "hg/ha_yield" for every area, in order of first appearance in `df`.
country = df["Area"].unique()
yield_per_country = [
    df.loc[df["Area"] == area, "hg/ha_yield"].sum() for area in country
]

In [None]:
# Sum of the "hg/ha_yield" column over the whole dataset.
df["hg/ha_yield"].sum()

In [None]:
# Display the per-area totals (same order as `country`).
yield_per_country

In [None]:
# Horizontal bar chart of the summed yield for each area.
plt.figure(figsize=(15, 20))
sns.barplot(y=country, x=yield_per_country)
plt.show()

In [None]:
# Total "hg/ha_yield" for every item, in order of first appearance in `df`.
crops = df["Item"].unique()
yield_per_crop = [
    df.loc[df["Item"] == item, "hg/ha_yield"].sum() for item in crops
]

In [None]:
# Horizontal bar chart of the summed yield for each item.
plt.figure(figsize=(15, 20))
sns.barplot(y=crops, x=yield_per_crop)
plt.show()

In [None]:
# List the current column names.
df.columns

In [None]:
# Reorder the columns: four numeric features, then the two string-valued
# columns, then the target. The ColumnTransformer built later in this
# notebook addresses the features by position ([0-3] scaled, [4-5] one-hot
# encoded), so this order must not change.
col = [
    "Year",
    "average_rain_fall_mm_per_year",
    "pesticides_tonnes",
    "avg_temp",
    "Area",
    "Item",
    "hg/ha_yield",
]
df = df[col]
df.head()

In [None]:
# Separate the target column from the feature matrix and print both shapes.
y = df["hg/ha_yield"]
X = df.drop(columns="hg/ha_yield")
print(X.shape, y.shape, sep="\n")

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed random_state makes the
# shuffled split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, shuffle=True
)

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Columns are addressed by position, which relies on the feature order
# Year, average_rain_fall_mm_per_year, pesticides_tonnes, avg_temp, Area, Item.
ohe = OneHotEncoder(drop="first")
scale = StandardScaler()

preprocesser = ColumnTransformer(
    transformers=[
        ("StandardScale", scale, [0, 1, 2, 3]),  # numeric features
        ("OneHotEncode", ohe, [4, 5]),  # Area, Item
    ],
    remainder="passthrough",
)

# Fit the scaler and encoder on the training split ONLY, then reuse those
# fitted parameters for the test split. Calling fit_transform on the test
# data would re-fit the preprocesser on it: that leaks test statistics, can
# yield a different one-hot column layout than the models were trained on,
# and leaves `preprocesser` fitted on the wrong split for later predictions.
# NOTE(review): with this encoder configuration, transform() raises if the
# test split contains an Area/Item value never seen in training.
X_train_dummy = preprocesser.fit_transform(X_train)
X_test_dummy = preprocesser.transform(X_test)


# Candidate regressors, each with its default hyper-parameters.
models = {
    "LINEAR REGRESSION": LinearRegression(),
    "LASSO": Lasso(),
    "RIDGE": Ridge(),
    "DECISION TREE": DecisionTreeRegressor(),
    "KNN": KNeighborsRegressor(),
}

# Fit every candidate on the encoded training data and report its mean
# absolute error and R^2 on the encoded test data.
for name, md in models.items():
    y_pred = md.fit(X_train_dummy, y_train).predict(X_test_dummy)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"{name}\n MAE: {mae} score: {r2}")

In [None]:
# Train a standalone decision tree on the encoded training data. This `dtr`
# instance is the model that prediction() calls and that is pickled at the end
# of the notebook.
dtr = DecisionTreeRegressor()
dtr.fit(X_train_dummy, y_train)
# Display the predictions for the encoded test set.
dtr.predict(X_test_dummy)

In [None]:
# Predictive System


def prediction(
    Year, average_rain_fall_mm_per_year, pesticides_tonnes, avg_temp, Area, Item
):
    """Predict the yield for a single observation.

    The six arguments are packed into one row, in the same column order the
    module-level ``preprocesser`` was fitted with, transformed by it, and
    passed to the module-level ``dtr`` model.

    Args:
        Year: Value for the "Year" feature.
        average_rain_fall_mm_per_year: Value for the rainfall feature.
        pesticides_tonnes: Value for the pesticides feature.
        avg_temp: Value for the average-temperature feature.
        Area: Value for the "Area" feature (a string such as "Albania").
        Item: Value for the "Item" feature (a string such as "Maize").

    Returns:
        The single value predicted by ``dtr`` for this row.
    """
    row = [
        Year,
        average_rain_fall_mm_per_year,
        pesticides_tonnes,
        avg_temp,
        Area,
        Item,
    ]
    # dtype=object keeps the numeric and string values in one 2-D array
    # without coercing the numbers to strings.
    features = np.array([row], dtype=object)
    encoded = preprocesser.transform(features)
    return dtr.predict(encoded).ravel()[0]

In [None]:
# Example call: predict the yield for one hand-written observation.
result = prediction(1990, 1485.0, 121.0, 16.37, "Albania", "Maize")
result

In [None]:
import pickle

# Persist the trained model and the fitted preprocesser. The `with` blocks
# guarantee each file is flushed and closed even if pickling raises; passing
# a bare open(...) to pickle.dump leaves the handle open.
with open("dtr.pkl", "wb") as model_file:
    pickle.dump(dtr, model_file)
with open("preprocesser.pkl", "wb") as preprocesser_file:
    pickle.dump(preprocesser, preprocesser_file)