In [1]:
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import sklearn
import warnings
from pandas import read_csv

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation/future warnings raised by pandas and
# scikit-learn in the cells below — consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [2]:
# Load the cleaned dataset.
# `na_values` entries are compared literally (they are not regular expressions), so the
# former '\s+' entry never matched blank cells and was also an invalid escape sequence in
# a plain string literal. Whitespace-only cells are therefore converted to NaN explicitly
# with a regex replace after loading.
df = (
    pd.read_csv('clean_data.csv', na_values=['n/a', 'N/A'], delimiter=',')
    .replace(r'^\s+$', np.nan, regex=True)
)

In [3]:
# Print the column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120 entries, 0 to 119
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Year             120 non-null    int64  
 1   Location         120 non-null    object 
 2   Heat Wave        120 non-null    object 
 3   Dry Spell        120 non-null    object 
 4   Cold Wave        120 non-null    object 
 5   Wet Spell        120 non-null    object 
 6   Avg Min Temp *C  120 non-null    float64
 7   Yr Rain mm       120 non-null    float64
 8   Irrigation       120 non-null    object 
 9   Crop Type        120 non-null    object 
 10  Crop Damage      120 non-null    object 
 11  Yield            120 non-null    float64
 12  Observer         120 non-null    object 
dtypes: float64(3), int64(1), object(9)
memory usage: 12.3+ KB


In [4]:
# Preview the first rows of the raw data.
df.head()

Unnamed: 0,Year,Location,Heat Wave,Dry Spell,Cold Wave,Wet Spell,Avg Min Temp *C,Yr Rain mm,Irrigation,Crop Type,Crop Damage,Yield,Observer
0,1980,Lake Meganhaven,N,N,Y,N,19.7,881.87,Y,Canola,Y,1595.59,Kiara Miller
1,1980,East Stevenside,N,N,Y,N,23.1,617.83,Y,Wheat,Y,1427.49,Eric Le
2,1980,Davidfurt,N,N,N,N,21.2,1033.66,N,Soy,N,795.43,Taylor Robinson
3,1981,Lake Meganhaven,N,N,Y,N,20.3,802.37,Y,Canola,Y,1621.85,Kevin Walters
4,1981,East Stevenside,N,N,Y,N,22.7,660.74,Y,Wheat,Y,1414.95,Eric Le


In [5]:
# Drop the Observer column because we assume it has no bearing on the "Yield" target.
df.drop(columns='Observer', inplace=True)

In [6]:
# Convert the binary Y/N categorical columns into numeric 1/0 columns.
# Each source column is kept and a new "<column>_transform" column is added next to it.
cat_feature = [
    'Heat Wave',
    'Dry Spell',
    'Cold Wave',
    'Wet Spell',
    'Irrigation',
    'Crop Damage',
]

# Explicit mapping instead of Series.replace: replace() relied on silently downcasting
# the object column to integers, which newer pandas deprecates (the result would stay
# object dtype). map() yields a numeric column directly.
yes_no_codes = {'Y': 1, 'N': 0}

for flag_col in cat_feature:
    # Fail loudly on anything other than Y/N (missing values are allowed and stay NaN),
    # rather than letting stray strings leak into a supposedly numeric feature.
    unexpected = set(df[flag_col].dropna().unique()) - set(yes_no_codes)
    if unexpected:
        raise ValueError(
            f"Column {flag_col!r} contains values other than 'Y'/'N': "
            f"{sorted(map(str, unexpected))}"
        )
    df[flag_col + '_transform'] = df[flag_col].map(yes_no_codes)

In [7]:
#feature encoding to change catagorical column to numerical column with OneHotEncoder 
ohe = OneHotEncoder(sparse = False)

In [17]:
# One-hot encode the Crop Type and Location columns.
columns_encode = ['Crop Type', 'Location']
transformed_columns = ohe.fit_transform(df[columns_encode])

# Indicator columns are named "<source column>_<category>", following the order of
# ohe.categories_. The frame reuses df's index so that a later column-wise concat
# (which aligns on index) lines the rows up even if df does not carry the default
# RangeIndex.
encoded_df = pd.DataFrame(
    transformed_columns,
    columns=[
        f"{col}_{val}"
        for col, vals in zip(columns_encode, ohe.categories_)
        for val in vals
    ],
    index=df.index,
)
encoded_df.head()

Unnamed: 0,Crop Type_Canola,Crop Type_Soy,Crop Type_Wheat,Location_Davidfurt,Location_East Stevenside,Location_Lake Meganhaven
0,1.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,1.0,0.0,1.0,0.0
2,0.0,1.0,0.0,1.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,1.0,0.0,1.0,0.0


In [18]:
# Append the one-hot indicator columns to the right of the original frame
# (rows are matched on index).
df_transformed = pd.concat((df, encoded_df), axis='columns')

In [19]:
# Preview the combined frame (original columns plus encoded columns).
df_transformed.head()

Unnamed: 0,Year,Location,Heat Wave,Dry Spell,Cold Wave,Wet Spell,Avg Min Temp *C,Yr Rain mm,Irrigation,Crop Type,...,Cold Wave_transform,Wet Spell_transform,Irrigation_transform,Crop Damage_transform,Crop Type_Canola,Crop Type_Soy,Crop Type_Wheat,Location_Davidfurt,Location_East Stevenside,Location_Lake Meganhaven
0,1980,Lake Meganhaven,N,N,Y,N,19.7,881.87,Y,Canola,...,1,0,1,1,1.0,0.0,0.0,0.0,0.0,1.0
1,1980,East Stevenside,N,N,Y,N,23.1,617.83,Y,Wheat,...,1,0,1,1,0.0,0.0,1.0,0.0,1.0,0.0
2,1980,Davidfurt,N,N,N,N,21.2,1033.66,N,Soy,...,0,0,0,0,0.0,1.0,0.0,1.0,0.0,0.0
3,1981,Lake Meganhaven,N,N,Y,N,20.3,802.37,Y,Canola,...,1,0,1,1,1.0,0.0,0.0,0.0,0.0,1.0
4,1981,East Stevenside,N,N,Y,N,22.7,660.74,Y,Wheat,...,1,0,1,1,0.0,0.0,1.0,0.0,1.0,0.0


In [20]:
# Remove the raw categorical columns, leaving only their encoded counterparts.
df_transformed.drop(
    columns=[
        'Heat Wave',
        'Dry Spell',
        'Cold Wave',
        'Wet Spell',
        'Irrigation',
        'Crop Damage',
        'Location',
        'Crop Type',
    ],
    inplace=True,
)

In [21]:
# Preview the frame after the raw categorical columns have been dropped.
df_transformed.head()

Unnamed: 0,Year,Avg Min Temp *C,Yr Rain mm,Yield,Heat Wave_transform,Dry Spell_transform,Cold Wave_transform,Wet Spell_transform,Irrigation_transform,Crop Damage_transform,Crop Type_Canola,Crop Type_Soy,Crop Type_Wheat,Location_Davidfurt,Location_East Stevenside,Location_Lake Meganhaven
0,1980,19.7,881.87,1595.59,0,0,1,0,1,1,1.0,0.0,0.0,0.0,0.0,1.0
1,1980,23.1,617.83,1427.49,0,0,1,0,1,1,0.0,0.0,1.0,0.0,1.0,0.0
2,1980,21.2,1033.66,795.43,0,0,0,0,0,0,0.0,1.0,0.0,1.0,0.0,0.0
3,1981,20.3,802.37,1621.85,0,0,1,0,1,1,1.0,0.0,0.0,0.0,0.0,1.0
4,1981,22.7,660.74,1414.95,0,0,1,0,1,1,0.0,0.0,1.0,0.0,1.0,0.0


In [22]:
# Task A: the target is Yield; every other column is used as a feature.
target_a = df_transformed['Yield']
dataset_a = df_transformed.drop(columns=['Yield'])

# 80/20 train/test split with a fixed seed so the partition is reproducible.
X_train_a, X_test_a, y_train_a, y_test_a = train_test_split(
    dataset_a,
    target_a,
    test_size=0.2,
    random_state=42,
)

# Write the four partitions to CSV without the index column.
for split_frame, csv_name in (
    (X_train_a, 'a_X_train.csv'),
    (y_train_a, 'a_y_train.csv'),
    (X_test_a, 'a_X_test.csv'),
    (y_test_a, 'a_y_test.csv'),
):
    split_frame.to_csv(csv_name, index=False)

In [23]:
# Task B: the target is the encoded Crop Damage flag; every other column
# (including Yield) is used as a feature.
target_b = df_transformed['Crop Damage_transform']
dataset_b = df_transformed.drop(columns=['Crop Damage_transform'])

# 80/20 train/test split with a fixed seed so the partition is reproducible.
X_train_b, X_test_b, y_train_b, y_test_b = train_test_split(
    dataset_b,
    target_b,
    test_size=0.2,
    random_state=42,
)

# Write the four partitions to CSV without the index column.
for split_frame, csv_name in (
    (X_train_b, 'b_X_train.csv'),
    (y_train_b, 'b_y_train.csv'),
    (X_test_b, 'b_X_test.csv'),
    (y_test_b, 'b_y_test.csv'),
):
    split_frame.to_csv(csv_name, index=False)