In [69]:
#!pip install scikit-learn

In [70]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score, KFold, cross_validate, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score, make_scorer
from sklearn.datasets import make_regression
import pandas as pd
import numpy as np
import csv, sklearn, os

filename = "Wheat.csv"

def load_csv(csv_file):
    data_dir = os.path.join(os.getcwd(), 'data', csv_file)

    with open(data_dir, 'r') as file:
        df = pd.read_csv(file)
        df.rename(columns={'value': 'temperature'}, inplace=True)
        return df

df=load_csv(filename)

def normalize_column(data, column_name):
    """
    Normalizes the specified column in the DataFrame using Min-Max scaling.

    Parameters:
    - data: Pandas DataFrame containing the data.
    - column_name: The name of the column to be normalized.

    Returns:
    - A Pandas DataFrame with the specified column normalized.
    """
    min_val = data[column_name].min()
    max_val = data[column_name].max()
    data[column_name + '_normalized'] = (data[column_name] - min_val) / (max_val - min_val)
    return data

# Normalize the 'price', 'production', and 'value' (temperature) columns
norm_df= df.copy()
norm_df['priceXproduction'] = norm_df['price'] * norm_df['production']
norm_df['price2'] = norm_df['price'] ** 2
norm_df['production2'] = norm_df['production'] ** 2
norm_df['temperature2'] = norm_df['temperature'] ** 2
norm_df['priceXproduction2'] = norm_df['priceXproduction'] ** 2
norm_df['price3'] = norm_df['price'] ** 3
norm_df['production3'] = norm_df['production'] ** 3
norm_df['temperature3'] = norm_df['temperature'] ** 3
norm_df['priceXproduction3'] = norm_df['priceXproduction'] ** 3
norm_cols = norm_df.columns[1:]
for column in norm_cols:
    norm_df = normalize_column(norm_df, column)
    
norm_df = norm_df.filter(like='_normalized')
norm_df['weighted_score'] = norm_df.mean(axis=1)

norm_df.head()



Unnamed: 0,temperature_normalized,production_normalized,price_normalized,priceXproduction_normalized,price2_normalized,production2_normalized,temperature2_normalized,priceXproduction2_normalized,price3_normalized,production3_normalized,temperature3_normalized,priceXproduction3_normalized,weighted_score
0,0.0,0.616553,0.224839,0.634733,0.201437,0.445617,0.002235,0.464424,0.179461,0.309059,0.0,0.326993,0.283779
1,0.042947,1.0,0.162741,1.0,0.144445,1.0,0.0,1.0,0.127467,1.0,0.000106,1.0,0.539809
2,0.297433,0.182375,1.0,0.281432,1.0,0.074561,0.069785,0.132881,1.0,0.026859,0.018541,0.056574,0.345037
3,0.511454,0.081073,0.794433,0.13984,0.772505,0.027207,0.238462,0.051482,0.74969,0.007854,0.116544,0.016481,0.292252
4,0.747821,0.423607,0.956103,0.571659,0.950467,0.247069,0.541523,0.391788,0.944442,0.133999,0.398564,0.256058,0.546925


In [71]:
# Calculate the correlation matrix
corr_matrix = norm_df.corr().abs()

# Select upper triangle of correlation matrix
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))

# Find index of feature columns with correlation greater than 0.9
to_drop = [column for column in upper.columns if any(upper[column] > 0.9)]

# Drop highly correlated columns
norm_df_reduced = norm_df.drop(to_drop, axis=1)

norm_df_reduced

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))


Unnamed: 0,temperature_normalized,production_normalized,price_normalized,weighted_score
0,0.0,0.616553,0.224839,0.283779
1,0.042947,1.0,0.162741,0.539809
2,0.297433,0.182375,1.0,0.345037
3,0.511454,0.081073,0.794433,0.292252
4,0.747821,0.423607,0.956103,0.546925
5,0.937377,0.120499,0.978587,0.504145
6,1.0,0.398637,0.109208,0.396515
7,0.990388,0.0,0.0,0.245038
8,0.856253,0.063448,0.063169,0.21141
9,0.567164,0.05292,0.388651,0.190379


In [72]:
corr_matrix

Unnamed: 0,temperature_normalized,production_normalized,price_normalized,priceXproduction_normalized,price2_normalized,production2_normalized,temperature2_normalized,priceXproduction2_normalized,price3_normalized,production3_normalized,temperature3_normalized,priceXproduction3_normalized,weighted_score
temperature_normalized,1.0,0.530007,0.158106,0.506351,0.169249,0.536337,0.963445,0.5199,0.179416,0.50966,0.920488,0.501434,0.247581
production_normalized,0.530007,1.0,0.15937,0.988079,0.16974,0.972608,0.446333,0.978231,0.178869,0.90909,0.390238,0.926599,0.518812
price_normalized,0.158106,0.15937,1.0,0.016332,0.999605,0.22309,0.005095,0.117209,0.998486,0.24198,0.083132,0.170415,0.513239
priceXproduction_normalized,0.506351,0.988079,0.016332,1.0,0.02641,0.946473,0.445973,0.972606,0.035356,0.875405,0.402228,0.908662,0.600995
price2_normalized,0.169249,0.16974,0.999605,0.02641,1.0,0.232039,0.009053,0.125849,0.999637,0.249196,0.067956,0.177492,0.513975
production2_normalized,0.536337,0.972608,0.22309,0.946473,0.232039,1.0,0.446104,0.990094,0.239853,0.980587,0.389049,0.984068,0.491842
temperature2_normalized,0.963445,0.446333,0.005095,0.445973,0.009053,0.446104,1.0,0.44698,0.02232,0.421921,0.990016,0.425424,0.235233
priceXproduction2_normalized,0.5199,0.978231,0.117209,0.972606,0.125849,0.990094,0.44698,1.0,0.133432,0.9602,0.400281,0.98018,0.563007
price3_normalized,0.179416,0.178869,0.998486,0.035356,0.999637,0.239853,0.02232,0.133432,1.0,0.25543,0.053577,0.183629,0.514683
production3_normalized,0.50966,0.90909,0.24198,0.875405,0.249196,0.980587,0.421921,0.9602,0.25543,1.0,0.36818,0.992771,0.470219


In [73]:
# 1. Prepare the data
X = norm_df_reduced.drop(columns=['weighted_score'])  # Features (remove the target column and any non-feature columns)
y = norm_df_reduced['weighted_score']  # Target variable

# 2. Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# 3. Create the Random Forest model
rf_model = RandomForestRegressor(n_estimators=1000) # Use RandomForestRegressor if it's a regression problem

# 4. Train the model
rf_model.fit(X_train, y_train)

# 5. Evaluate the model
y_pred = rf_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2s = r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse} \nRoot Mean Squared Error: {rmse} \nR^2 Score: {r2s}")

Mean Squared Error: 0.03941031700558489 
Root Mean Squared Error: 0.19852031887337096 
R^2 Score: 0.12490911845013708


In [74]:
X = norm_df_reduced.drop(columns=['weighted_score'])  # Features (remove the target column and any non-feature columns)
y = norm_df_reduced['weighted_score']  # Target variable

# Create a random forest regressor model
model = RandomForestRegressor(n_estimators=1000)

# Configure the cross-validation procedure
cv = KFold(n_splits=3, shuffle=True)


# Define multiple scoring metrics
scoring = {'MSE': 'neg_mean_squared_error', 'R2': 'r2'}

# Execute the cross-validation procedure using mean squared error
scores = cross_validate(model, X, y, scoring=scoring, cv=cv, n_jobs=-1)

# Convert scores to positive as cross_val_score returns negative values for MSE to optimize towards zero
mse_scores = -scores['test_MSE']

# Report performance
print(f'Mean Squared Error: {mse_scores.mean():.3f} (+/- {mse_scores.std():.3f})')
print(f'Root Mean Squared Error: {np.sqrt(mse_scores).mean():.3f} (+/- {np.sqrt(mse_scores).std():.3f})')
print(f"R^2 Score: {scores['test_R2'].mean():.3f} (std: {scores['test_R2'].std():.3f})")

Mean Squared Error: 0.020 (+/- 0.013)
Root Mean Squared Error: 0.135 (+/- 0.047)
R^2 Score: -0.049 (std: 0.569)


In [92]:
X = norm_df_reduced.drop(columns=['weighted_score'])  # Features (remove the target column and any non-feature columns)
y = norm_df_reduced['weighted_score']  # Target variable

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Create a random forest regressor model
rf = RandomForestRegressor()

# Define a grid of parameters to search over
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Define multiple scoring metrics
scoring = {'MSE': 'neg_mean_squared_error', 'R2': 'r2'}

# Set up the grid search with 5-fold cross-validation
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2, scoring="r2")

# Perform the grid search on the training data
grid_search.fit(X_train, y_train)

# Print out the best parameters
print("Best parameters found: ", grid_search.best_params_)

# Get the best estimator and evaluate it on the test set
best_rf = grid_search.best_estimator_
best_rf_score = -best_rf.score(X_test, y_test)
print("Test set score of best estimator: ", best_rf_score)

Fitting 5 folds for each of 216 candidates, totalling 1080 fits
Best parameters found:  {'bootstrap': False, 'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}
Test set score of best estimator:  -2.525622023094225


In [90]:
rmse = np.sqrt(best_rf_score)
rmse

1.4625425400298087

In [None]:
predictions = best_rf.predict(X_train)
predictions, X_train, y_train, X_test, y_test

(array([0.26845526, 0.45561831, 0.17503327, 0.45136418, 0.33351356,
        0.22772309, 0.25452028, 0.28725225, 0.28413144, 0.426557  ]),
     temperature_normalized  production_normalized  price_normalized
 12                0.050866               0.616553          0.224839
 4                 0.747821               0.423607          0.956103
 11                0.007045               0.046697          0.039615
 5                 0.937377               0.120499          0.978587
 2                 0.297433               0.182375          1.000000
 9                 0.567164               0.052920          0.388651
 0                 0.000000               0.616553          0.224839
 10                0.252539               0.509090          0.304069
 3                 0.511454               0.081073          0.794433
 6                 1.000000               0.398637          0.109208,
 12    0.287843
 4     0.546925
 11    0.020180
 5     0.504145
 2     0.345037
 9     0.190379
 0    

In [None]:
predictions = best_rf.predict(X_train)
predictions, y_train

(array([0.26845526, 0.45561831, 0.17503327, 0.45136418, 0.33351356,
        0.22772309, 0.25452028, 0.28725225, 0.28413144, 0.426557  ]),
 12    0.287843
 4     0.546925
 11    0.020180
 5     0.504145
 2     0.345037
 9     0.190379
 0     0.283779
 10    0.275468
 3     0.292252
 6     0.396515
 Name: weighted_score, dtype: float64)

In [None]:
df=load_csv("Soybeans.csv")
for column in df.columns[1:]:
    df = normalize_column(df, column)
    
df = df.filter(like='_normalized')
#df['weighted_score'] = df.mean(axis=1)

df["predictions"] = best_rf.predict(df)
df['actual_weighted_score'] = df.mean(axis=1)

df

Unnamed: 0,temperature_normalized,production_normalized,price_normalized,predictions,actual_weighted_score
0,0.049843,0.668476,0.0,0.268455,0.246694
1,0.091407,1.0,0.456452,0.273697,0.455389
2,0.307906,0.037181,0.832258,0.233417,0.35269
3,0.480675,0.106476,0.725806,0.305337,0.404574
4,0.730981,0.0,0.856452,0.347478,0.483728
5,0.936296,0.132398,1.0,0.456857,0.631388
6,1.0,0.372211,0.612903,0.431141,0.604064
7,0.955764,0.118434,0.667742,0.423952,0.541473
8,0.844343,0.200646,0.509677,0.425096,0.494941
9,0.57029,0.120742,0.182258,0.304727,0.294504


In [100]:
seed = 7

df_soy=load_csv("Soybeans.csv")
df_corn=load_csv("Corn.csv")
df_wheat=load_csv("Wheat.csv")
df_animal_products=load_csv("Animal_Products.csv")
df_vegetables=load_csv("Vegetables.csv")
df_fruit=load_csv("Fruit.csv")
df_fish=load_csv("Fish.csv")
df_nuts=load_csv("Nuts.csv")

train_df = pd.concat([df_soy, df_corn, df_wheat, df_animal_products, df_vegetables, df_fruit, df_fish, df_nuts], ignore_index=True)
train_df.drop(columns=['date'], inplace=True)


train_df['priceXproduction'] = train_df['price'] * train_df['production']
train_df['price2'] = train_df['price'] ** 2
train_df['production2'] = train_df['production'] ** 2
train_df['temperature2'] = train_df['temperature'] ** 2
train_df['priceXproduction2'] = train_df['priceXproduction'] ** 2
train_df['price3'] = train_df['price'] ** 3
train_df['production3'] = train_df['production'] ** 3
train_df['temperature3'] = train_df['temperature'] ** 3
train_df['priceXproduction3'] = train_df['priceXproduction'] ** 3


for col in train_df.columns:
    train_df = normalize_column(train_df, col)
train_df = train_df.filter(like='_normalized')
train_df['weighted_score'] = train_df.mean(axis=1)


corr_matrix = train_df.corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
to_drop = [column for column in upper.columns if any(upper[column] > 0.9)]
train_df_reduced = train_df.drop(to_drop, axis=1)

X = train_df_reduced.drop(columns=['weighted_score'])
y = train_df_reduced['weighted_score']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)

rf = RandomForestRegressor(random_state=seed)

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}
#mse = make_scorer(mean_squared_error,greater_is_better=False)
cv = KFold(n_splits=5, shuffle=True, random_state=seed)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=cv, n_jobs=-1, verbose=2, scoring="r2")

grid_search.fit(X_train, y_train)


print("Best parameters found: ", grid_search.best_params_)

best_rf = grid_search.best_estimator_
best_rf_score = best_rf.score(X_test, y_test)
print("Test set score of best estimator: ", best_rf_score)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))


Fitting 5 folds for each of 216 candidates, totalling 1080 fits
Best parameters found:  {'bootstrap': True, 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Test set score of best estimator:  0.9538114748105538


In [None]:
predictions = best_rf.predict(X_train)
predictions, y_train

(array([0.07024087, 0.20268178, 0.29650004, 0.23265809, 0.32050848,
        0.05648228, 0.68093856, 0.13545592, 0.12850788, 0.07215858,
        0.16088167, 0.16325319, 0.09280904, 0.1760565 , 0.61038681,
        0.32175885, 0.35676043, 0.21065099, 0.27955626, 0.33062957,
        0.1361623 , 0.15234897, 0.15154032, 0.09748629, 0.08344328,
        0.16425968, 0.17257984, 0.08553895, 0.24235581, 0.26336236,
        0.11198769, 0.28692922, 0.38381554, 0.08802235, 0.23984536,
        0.15336905, 0.16190405, 0.23390198, 0.11153228, 0.13479666,
        0.21624766, 0.25867832, 0.0840534 , 0.18449837, 0.26596031,
        0.1823874 , 0.22778317, 0.27220951, 0.06618413, 0.11762788,
        0.0728097 , 0.16984447, 0.16935557, 0.11313198, 0.47471138,
        0.08085396, 0.24573413, 0.17214229, 0.34896132, 0.32883349,
        0.09318417, 0.16688742, 0.1403467 , 0.32184285, 0.31678009,
        0.19495261, 0.05182626, 0.12224666, 0.26522466, 0.11527012,
        0.20852616, 0.1037261 , 0.67124227, 0.14

In [None]:
predictions = best_rf.predict(X_test)
predictions, y_test

(array([0.24132384, 0.28668003, 0.13977909, 0.08165749, 0.29977554,
        0.05800647, 0.09100986, 0.18153106, 0.29679016, 0.11581563,
        0.1009102 , 0.49319081, 0.06147618, 0.14153214, 0.1412348 ,
        0.14295407, 0.21251632, 0.16386876, 0.17528166, 0.32508692,
        0.3278269 ]),
 22     0.232628
 46     0.332434
 80     0.134612
 93     0.073851
 43     0.268878
 103    0.052760
 78     0.086245
 26     0.178760
 30     0.352796
 88     0.122900
 95     0.100653
 73     0.431646
 91     0.059550
 13     0.140882
 12     0.129572
 51     0.145900
 15     0.208457
 11     0.145016
 37     0.161883
 20     0.324003
 18     0.341519
 Name: weighted_score, dtype: float64)

In [None]:
filename = "Wheat.csv"

def predict_weighted_score(csv_file):
    df=load_csv(csv_file)
    for column in df.columns[1:]:
        df = normalize_column(df, column)
        
    df = df.filter(like='_normalized')

    df["predictions"] = best_rf.predict(df)
    df['actual_weighted_score'] = df.mean(axis=1)
    return df

df_wheat_weighted = predict_weighted_score(filename)
df_wheat_weighted

Unnamed: 0,temperature_normalized,production_normalized,price_normalized,predictions,actual_weighted_score
0,0.0,0.616553,0.224839,0.253672,0.273766
1,0.042947,1.0,0.162741,0.523014,0.432175
2,0.297433,0.182375,1.0,0.234033,0.42846
3,0.511454,0.081073,0.794433,0.243892,0.407713
4,0.747821,0.423607,0.956103,0.301633,0.607291
5,0.937377,0.120499,0.978587,0.376986,0.603362
6,1.0,0.398637,0.109208,0.260099,0.441986
7,0.990388,0.0,0.0,0.237793,0.307045
8,0.856253,0.063448,0.063169,0.239793,0.305666
9,0.567164,0.05292,0.388651,0.13838,0.286779


In [None]:
AvgWeighted = {}
for csv in ["Corn.csv", "Soybeans.csv", "Wheat.csv", "Animal_Products.csv", "Corn.csv", "Fish.csv", "Fruit.csv", "Nuts.csv", "Soybeans.csv", "Vegetables.csv", "Wheat.csv"]:
    df_modeled = predict_weighted_score(csv)
    AvgWeighted[csv[:-4]] = df_modeled['predictions'].mean()
AvgWeighted

{'Corn': 0.26061095473868,
 'Soybeans': 0.30262461570911575,
 'Wheat': 0.2539458477219761,
 'Animal_Products': 0.3202712022131606,
 'Fish': 0.3601322348478091,
 'Fruit': 0.34098940106899717,
 'Nuts': 0.3176466694389095,
 'Vegetables': 0.31151827964578904}

In [None]:
final_rankings = [key for key, val in sorted(AvgWeighted.items(), key=lambda item: item[1], reverse=True)]
final_rankings

['Fish',
 'Fruit',
 'Animal_Products',
 'Nuts',
 'Vegetables',
 'Soybeans',
 'Corn',
 'Wheat']