In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from scipy.stats import randint
import matplotlib.pyplot as plt

In [3]:
data = pd.read_excel(r"C:\Users\Priyanshul\OneDrive\Desktop\Hackathon\crop_yield_new\crop yield data sheet.xlsx")
data.head()

Unnamed: 0,Rain Fall (mm),Fertilizer,Temperatue,Nitrogen (N),Phosphorus (P),Potassium (K),Yeild (Q/acre)
0,1230.0,80.0,28,80.0,24.0,20.0,12.0
1,480.0,60.0,36,70.0,20.0,18.0,8.0
2,1250.0,75.0,29,78.0,22.0,19.0,11.0
3,450.0,65.0,35,70.0,19.0,18.0,9.0
4,1200.0,80.0,27,79.0,22.0,19.0,11.0


In [4]:
data.describe()

Unnamed: 0,Rain Fall (mm),Fertilizer,Nitrogen (N),Phosphorus (P),Potassium (K),Yeild (Q/acre)
count,99.0,99.0,99.0,99.0,99.0,99.0
mean,849.79798,67.808081,70.737374,21.131313,18.060606,9.050505
std,400.042676,10.028202,6.677079,1.951695,1.817254,1.965902
min,400.0,50.0,59.0,18.0,15.0,5.5
25%,450.0,60.0,65.0,19.5,16.0,7.0
50%,1150.0,70.0,71.0,21.0,19.0,9.0
75%,1237.5,77.0,77.0,23.0,19.0,11.0
max,1300.0,80.0,80.0,25.0,22.0,12.0


In [5]:
data.info()
data.columns

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 109 entries, 0 to 108
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Rain Fall (mm)  99 non-null     float64
 1   Fertilizer      99 non-null     float64
 2   Temperatue      100 non-null    object 
 3   Nitrogen (N)    99 non-null     float64
 4   Phosphorus (P)  99 non-null     float64
 5   Potassium (K)   99 non-null     float64
 6   Yeild (Q/acre)  99 non-null     float64
dtypes: float64(6), object(1)
memory usage: 6.1+ KB


Index(['Rain Fall (mm)', 'Fertilizer', 'Temperatue', 'Nitrogen (N)',
       'Phosphorus (P)', 'Potassium (K)', 'Yeild (Q/acre)'],
      dtype='object')

In [6]:
data.dropna(inplace=True)

In [7]:
X = data.drop('Yeild (Q/acre)', axis=1)
y = data['Yeild (Q/acre)']

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
model = DecisionTreeRegressor(random_state=42)
model.fit(X_train, y_train)

In [10]:
y_pred = model.predict(X_test)

In [11]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R-squared:", r2)

Mean Squared Error: 1.1125
R-squared: 0.7623815244960619


In [12]:
param_grid = {
    'max_depth': [3, 5, 7, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None]
}

model_tune = DecisionTreeRegressor(random_state=42)
grid_search = GridSearchCV(estimator=model_tune, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')

#finding best parameters using GridSearchCV
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_

In [13]:
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

Best Hyperparameters: {'max_depth': 3, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 2}


In [14]:
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

In [15]:
mse_tune = mean_squared_error(y_test, y_pred)
r2_tune = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R-squared:", r2)

Mean Squared Error: 1.1125
R-squared: 0.7623815244960619


In [16]:
#xx=[1230.0	,80.0,	28,	80.0,	24.0	,20.0]
#xx=[480.0,	60.0,	36,	70.0,	20.0,	18.0	]
xx=[1200.0,	80.0,	27	,79.0,	22.0,	19.0	]
xx=np.array(xx)
xx=xx.reshape(-1,6)
y_pred = best_model.predict(xx)



In [19]:
y_pred

array([10.88235294])

In [20]:
import pickle

# Assuming model is your trained model object
# Save the model to a .pkl file
with open("C:\\Users\\Priyanshul\\OneDrive\\Desktop\\Hackathon\\crop12.pkl", "wb") as f:
    pickle.dump(model, f)