Analyzing the Relationship Between Temperature and Crop Yield Using the Crop Yield Dataset

In [67]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [68]:
# Read the crop-yield CSV straight from its public GitHub location
DATA_URL = "https://raw.githubusercontent.com/Explore-AI/Public-Data/master/Data/Python/crop_yield_dataset.csv"
df = pd.read_csv(DATA_URL)

In [69]:
# Preview the first five rows
df.head(n=5)

Unnamed: 0,Temperature,Crop_Yield
0,27.483571,58.922301
1,24.308678,44.07042
2,28.238443,63.490857
3,32.615149,58.221043
4,23.829233,50.592752


In [70]:
# Count missing values in each column
df.isna().sum()

Temperature    0
Crop_Yield     0
dtype: int64

In [71]:
# Descriptive statistics, transposed so each column becomes one row
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Temperature,120.0,24.604155,4.626583,11.901274,22.140623,24.63386,26.913931,37.316211
Crop_Yield,120.0,49.580131,10.975299,24.409459,42.875481,49.548066,56.533272,74.396332


In [72]:
# Separate the predictor (kept as a one-column DataFrame) from the target Series
FEATURE_COLUMNS = ['Temperature']
TARGET_COLUMN = 'Crop_Yield'
X = df[FEATURE_COLUMNS]
y = df[TARGET_COLUMN]

In [73]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [74]:
def train_linear_regression(X_train, y_train):
    """
    Fit a LinearRegression estimator to the supplied training data.

    Parameters
    ----------
    X_train : training features, passed unchanged to ``fit``
    y_train : training target values, passed unchanged to ``fit``

    Returns
    -------
    The fitted LinearRegression instance.
    """
    # fit() hands back the estimator itself, so building and fitting
    # can be chained into a single expression.
    return LinearRegression().fit(X_train, y_train)

In [75]:
def evaluate_model_on_train(model, X_train, y_train):
    """
    Evaluates the performance of the trained linear regression model on the training set.

    The Mean Squared Error and R-squared score are printed and also returned,
    so callers can reuse or compare the numbers instead of re-reading stdout.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``
    X_train : training features passed to ``model.predict``
    y_train : actual training target values

    Returns
    -------
    tuple
        ``(train_mse, train_r2)``
    """
    # Generate predictions on the training set
    y_train_pred = model.predict(X_train)

    # Calculate the Mean Squared Error (MSE)
    train_mse = mean_squared_error(y_train, y_train_pred)

    # Calculate the R-squared
    train_r2 = r2_score(y_train, y_train_pred)

    # Print the training MSE and R-squared score
    print("Training MSE:", train_mse)
    print("Training R-squared:", train_r2)

    # Hand the metrics back as well; callers that ignore the return value are unaffected
    return train_mse, train_r2



In [76]:
def evaluate_model_on_test(model, X_test, y_test):
    """
    Evaluates the performance of the trained linear regression model on the testing set.

    The Mean Squared Error and R-squared score are printed and also returned,
    so callers can reuse or compare the numbers instead of re-reading stdout.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``
    X_test : testing features passed to ``model.predict``
    y_test : actual testing target values

    Returns
    -------
    tuple
        ``(test_mse, test_r2)``
    """
    # Generate predictions on the testing set
    y_test_pred = model.predict(X_test)

    # Calculate the Mean Squared Error (MSE)
    test_mse = mean_squared_error(y_test, y_test_pred)

    # Calculate the R-squared
    test_r2 = r2_score(y_test, y_test_pred)

    # Print the testing MSE and R-squared score
    print("Testing MSE:", test_mse)
    print("Testing R-squared:", test_r2)

    # Hand the metrics back as well; callers that ignore the return value are unaffected
    return test_mse, test_r2




In [77]:
# Prediction Function
def predict(model, X_test, y_true=None):
    """
    Generates predictions for X_test, prints them next to the actual values,
    and returns the predictions.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``
    X_test : features passed to ``model.predict``
    y_true : optional actual target values, one per row of ``X_test``.
        When omitted, the notebook-level ``y_test`` variable is used, which
        keeps the older two-argument call ``predict(model, X_test)`` working.

    Returns
    -------
    The value returned by ``model.predict(X_test)``.
    """
    y_pred = model.predict(X_test)

    if y_true is None:
        # Legacy fallback: read the actual values from the notebook global.
        # Pass y_true explicitly when predicting on any other data, otherwise
        # the comparison table is built against the wrong target values.
        y_true = y_test

    results = pd.DataFrame({'Actual': y_true, 'Predicted': y_pred})
    print(results)
    return y_pred

In [78]:
# Train the model
model = train_linear_regression(X_train=X_train, y_train=y_train)

# Report fit quality on the data the model was trained on
evaluate_model_on_train(model=model, X_train=X_train, y_train=y_train)

# Report fit quality on the held-out split
evaluate_model_on_test(model=model, X_test=X_test, y_test=y_test)

# Print actual vs. predicted yields for the held-out split and keep the predictions
y_pred = predict(model=model, X_test=X_test)

Training MSE: 22.152323850480098
Training R-squared: 0.8025918031520605
Testing MSE: 37.75854546183867
Testing R-squared: 0.7167858892114612
        Actual  Predicted
44   40.031661  34.893598
47   70.055187  61.030246
4    50.592752  47.720170
55   63.448717  59.733104
26   32.305311  38.269661
64   57.007944  58.509020
73   59.417743  66.261622
10   37.612506  45.356983
40   52.511257  57.745640
107  46.357054  51.933247
18   44.987345  40.774116
62   33.582188  38.729988
11   45.685517  45.333151
36   61.417509  52.286654
89   74.396332  55.424356
91   65.364278  60.118249
109  52.653530  49.366386
0    58.922301  55.253730
88   47.277636  44.673145
104  46.027484  48.471268
65   67.132403  64.113461
45   44.865463  42.713817
31   70.255023  69.226471
70   51.381381  53.858908
