<a href="https://colab.research.google.com/github/AndreGulyi/ML_projects/blob/main/ag_predicting__price.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install xgboost
!pip install lightgbm
!pip install google.colab
!pip install tensorflow

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import xgboost
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
import sklearn
from sklearn import tree
from sklearn import svm
from sklearn import neighbors
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Google Drive share link of the CSV (the stray trailing quote has been removed).
share_link = 'https://drive.google.com/file/d/1CF5wtIXc3Pi-BW0F6_XjblPfZ-iy28xP/view?usp=share_link'
# The file id is the second-to-last path segment of the share link.
file_id = share_link.split('/')[-2]
# Direct-download form of the link that pandas can read.
url = 'https://drive.google.com/uc?id=' + file_id
df = pd.read_csv(url)

In [None]:
# Column names, dtypes and non-null counts of the loaded frame.
df.info()

In [None]:
# Summary statistics of the numeric columns.
df.describe()

In [None]:
# Preview the first ten rows.
df.head(10)

Data Manipulation

In [None]:
# Frequency of each distinct loc1 value (used to spot non-numeric entries).
df["loc1"].value_counts()

In [None]:
# Frequency of each distinct loc2 value.
df["loc2"].value_counts()

In [None]:
# Keep only rows whose loc1 contains neither "S" nor "T".
# na=True treats a missing loc1 as a match, so such rows are dropped exactly as
# the former `== False` comparison dropped them.
loc1_has_letter = df["loc1"].str.contains("S|T", na=True)
# .copy() makes the filtered frame independent of the original, so later
# column assignments do not raise SettingWithCopyWarning.
df = df[~loc1_has_letter].copy()
df.shape

In [None]:
# Convert both location columns to numbers (unparseable values become NaN) and
# drop every row that has a missing value in any column.
# assign()/dropna() return new frames, so nothing is written into a slice and
# no in-place mutation is needed.
df = df.assign(
    loc1=pd.to_numeric(df["loc1"], errors="coerce"),
    loc2=pd.to_numeric(df["loc2"], errors="coerce"),
).dropna()
df.shape

One-hot encoding the day of week (`dow`)

In [None]:
# One-hot encode the day-of-week column.
# dtype=int forces 0/1 integer columns: recent pandas versions default to bool
# dummies, which would leave the joined frame with mixed bool/float columns
# that cannot be converted to a single numeric array further down.
days_dummies = pd.get_dummies(df.dow, dtype=int)
days_dummies.head()

In [None]:
# New frame without the raw day-of-week column; df itself is left untouched.
df2 = df.drop(columns='dow')

In [None]:
# Attach the one-hot day columns to the remaining features (joined on the index).
result = df2.join(days_dummies)
result.head()

Checking Outliers and Correlations

In [None]:
from pandas.plotting import scatter_matrix
# Pairwise scatter plots of the first seven columns of result.
# The return value is bound to `_` so the axes array is not echoed as cell output.
_ = scatter_matrix(result.iloc[:,0:7], figsize=(12, 8))

In [None]:
# Correlation of every column with price, highest first, rounded to 2 decimals.
price_corr = result.corr()[['price']]
price_corr.sort_values(by='price', ascending=False).round(2)

In [None]:
# Show the first six columns of result.
result.iloc[:,0:6]

In [None]:
# Histogram (50 bins) for each of the first six columns.
result.iloc[:,0:6].hist(bins=50, figsize=(20,15))
plt.show()

In [None]:
# Summary statistics of the first six columns.
result.iloc[:,0:6].describe()

In [None]:
# Frequency of each para1 value.
result['para1'].value_counts()

In [None]:
# Remove outliers: keep only the rows where para1 is below 10.
result = result[result['para1']<10]

# Feature Selection

Top 5 features by correlation with price

In [None]:
five_best = []
# Correlation of every column with price, highest first.
df_5 = pd.DataFrame(result.corr()['price']).sort_values(by='price', ascending=False)
# Remove price's self-correlation by label, so the right row is dropped even
# if another column ties with it at the top of the ranking.
df_5 = df_5.drop(index='price').head(5)
df_5

In [None]:
# Names of the selected features. The list is rebuilt from scratch so that
# re-running this cell does not append the same names a second time.
five_best = df_5.index.tolist()

In [None]:
# Display the selected feature names.
five_best

Top 3 features by correlation with price

In [None]:
# Correlation of every column with price, highest first.
df_3 = pd.DataFrame(result.corr()['price']).sort_values(by='price', ascending=False)
# Remove price's self-correlation by label (not by position) and keep the top three.
df_3 = df_3.drop(index='price').head(3)
# Names of the three selected features.
three_best = df_3.index.tolist()

three_best

# Machine Learning
Regression Models

In [None]:
# Candidate feature matrices, keyed by the name used in the result tables:
# every column except the target, and the two correlation-based subsets.
feature_sets = {
    "full_dataset": result.drop(columns=['price']),
    "three_best": result[three_best],
    "five_best": result[five_best],  
}

In [None]:
# Regressors to compare, keyed by the label shown in the reports and plots.
# random_state is fixed wherever the estimator accepts it so runs are repeatable.
regression_models = {
    'Linear':linear_model.LinearRegression(),
    'Lasso':linear_model.Lasso(random_state=8),
    'LassoCV':linear_model.LassoCV(random_state=8),
    'ElasticNet':linear_model.ElasticNet(random_state=8),
    'LassoLars':linear_model.LassoLars(random_state=8),
    'BayesianRidge':linear_model.BayesianRidge(),
    'Ridge':linear_model.Ridge(random_state=8),
    'DecisionTree':tree.DecisionTreeRegressor(random_state=8, max_depth=5),
    'RandomForest':RandomForestRegressor(random_state=8),
    'XGBoost': XGBRegressor(random_state=8),
    # Label corrected from the misspelled 'LGMB'.
    'LGBM': LGBMRegressor(random_state=8),
    'MLP':MLPRegressor(random_state=8),
}

In [None]:
def make_regression(x_train, y_train, x_test, y_test, model, model_name, verbose=True):
    """Fit ``model`` on the training split and measure its errors.

    Parameters
    ----------
    x_train, y_train
        Training features and target passed to ``model.fit``.
    x_test, y_test
        Held-out features and target, used only for the test error.
    model
        Estimator exposing ``fit`` and ``predict``; it is fitted in place.
    model_name
        Label printed in the report header.
    verbose
        When True, print the rounded metrics.

    Returns
    -------
    tuple
        ``(trained_model, y_predict, train_error, test_error, r2)``.
        ``trained_model`` is the same, now fitted, ``model`` object;
        ``y_predict`` holds the predictions for ``x_train``; both errors are
        root-mean-squared errors; ``r2`` is the R^2 score on the *training*
        split (not on the test split).
    """
    model.fit(x_train, y_train)

    # Predict each split once; the training predictions are reused for R^2.
    y_predict = model.predict(x_train)
    test_predict = model.predict(x_test)

    # RMSE is computed as sqrt(MSE) explicitly: the `squared=False` argument of
    # mean_squared_error is no longer available in current scikit-learn.
    train_error = np.sqrt(mean_squared_error(y_train, y_predict))
    test_error = np.sqrt(mean_squared_error(y_test, test_predict))

    r2 = r2_score(y_train, y_predict)

    if verbose:
        # Built-in round() works for both numpy scalars and plain Python floats.
        print("----Model name = {}-----".format(model_name))
        print("Train error = {}".format(round(train_error, 1)))
        print("Test error = {}".format(round(test_error, 1)))
        print("r2_score = {}".format(round(r2, 2)))
        print("--------------------------------")

    trained_model = model

    return trained_model, y_predict, train_error, test_error, r2

In [None]:
# Column-oriented accumulator for the experiment results: one list per column,
# filled in parallel with one entry per (model, feature set) run.
pred_dict = {
    "regression_model": [],
    "feature_set": [],
    "Train Error": [],
    "Test Error": [],
    "R2" : []
}

In [None]:
# Start from empty result lists so that re-running this cell does not add a
# second copy of every row to pred_dict.
for collected in pred_dict.values():
    collected.clear()

# The target is the same for every run.
y = result["price"]

for feature_set_name, feature_set in feature_sets.items():

    print("Included columns are {}".format(feature_set_name))

    # The split depends only on the feature set and the fixed seed, so it is
    # done once per feature set instead of once per model.
    x = feature_set
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=8)

    for model_name, regressor in regression_models.items():

        trained_model, y_predict, train_error, test_error, r2 = make_regression(x_train, y_train, x_test, y_test, regressor, model_name, verbose=True)

        # Record one result row for this (model, feature set) pair.
        pred_dict["regression_model"].append(model_name)
        pred_dict["feature_set"].append(feature_set_name)
        pred_dict["Train Error"].append(train_error)
        pred_dict["Test Error"].append(test_error)
        pred_dict["R2"].append(r2)

In [None]:
# Tabulate the collected metrics: one row per (model, feature set) run.
pred_df = pd.DataFrame(pred_dict)
pred_df.head(30)

In [None]:
# Short feature-set tag: the text before the first underscore of the set name.
pred_df['feature_set_2'] = pred_df['feature_set'].str.split('_').str[0]
pred_df.head(5)

In [None]:
# Combined "<model>_<feature set tag>" label used on the plot axes.
pred_df["Model_with_Dataset"] = pred_df['regression_model'].str.cat(pred_df['feature_set_2'], sep="_")
pred_df.head(5)

In [None]:
# Frames used for plotting: all three metrics plus the combined label, and
# label/metric pairs for the train and test errors.
df_show = pred_df[['Train Error', 'Test Error', 'R2', "Model_with_Dataset"]]
df_train_error = df_show[['Model_with_Dataset','Train Error']]
df_test_error = df_show[['Model_with_Dataset','Test Error']]

In [None]:
# Create one figure with three side-by-side panels (R2, train error, test error)
fig, (ax2, ax3, ax4) = plt.subplots(1, 3, figsize=(14, 6))


# First panel: R2 per model/feature-set combination
df_show.plot(kind='barh', x='Model_with_Dataset', y='R2', color='red', ax=ax2, legend=False)
ax2.set_xlabel('R Squared')
ax2.set_ylabel('Model')
ax2.set_title('R-squared')

# Second panel: train error per combination
df_train_error.plot(kind='barh', x='Model_with_Dataset', y='Train Error', color='blue', ax=ax3, legend=False)
ax3.set_xlabel('Train Error')
ax3.set_ylabel('Model')
ax3.set_title('Train Error')


# Third panel: test error per combination
df_test_error.plot(kind='barh', x='Model_with_Dataset', y='Test Error', color='green', ax=ax4, legend=False)
ax4.set_xlabel('Test Error')
ax4.set_ylabel('Model')
ax4.set_title('Test Error')

# Adjust spacing so the panels and their labels do not overlap
plt.tight_layout()

# Show the figure
plt.show()

In [None]:
# Remove the plotting helper columns. Rebinding the name and ignoring columns
# that are already gone keeps this cell safe to run more than once.
pred_df = pred_df.drop(columns=['feature_set_2','Model_with_Dataset'], errors='ignore')

# Model Evaluation

Highest R Squared

In [None]:
# Ten runs with the highest R2. Note that make_regression computes R2 on the
# training split, so this ranking reflects fit to the training data.
pred_df.sort_values(by='R2', ascending=False).head(10)

Min Test Error

In [None]:
# Five runs with the lowest test error.
pred_df.sort_values(by='Test Error', ascending=True).head(5)

Min Train Error

In [None]:
# Five runs with the lowest train error.
pred_df.sort_values(by='Train Error', ascending=True).head(5)

# Deep Learning

In [None]:
# Target and full feature matrix (every column except price) for the network.
y = result['price']
x = result.drop(columns=['price'])

In [None]:
# 80/20 train/test split with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=8)

In [None]:
import tensorflow as tf
from tensorflow.keras import regularizers
import time

In [None]:
start = time.time()

tf.random.set_seed(42)

# Training length and the spacing of the epochs shown in the plot; the plotted
# epochs are derived from these so the two can never disagree.
EPOCHS = 1000
PLOT_EVERY = 100

# Hand Keras a homogeneous float32 matrix: a DataFrame mixing bool/int/float
# columns may otherwise be converted to an object array that TensorFlow rejects.
x_train_nn = x_train.astype('float32')
x_test_nn = x_test.astype('float32')

# Fully connected network: four ReLU hidden layers (64, 32, 16, 8 units), each
# with L2 weight regularisation, followed by a single linear output unit.
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(units=64, activation='relu', input_shape=[x_train_nn.shape[1]], kernel_regularizer=regularizers.l2(0.01)),
    tf.keras.layers.Dense(units=32, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
    tf.keras.layers.Dense(units=16, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
    tf.keras.layers.Dense(units=8, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
    tf.keras.layers.Dense(units=1)
])

# Compile with Adam at learning rate 0.01, MSE loss and MAE as the tracked metric.
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.01),
              loss=tf.losses.MeanSquaredError(),
              metrics=[tf.metrics.MeanAbsoluteError()])


# Fit on the training split; the test split is passed as validation data so
# that its MAE is recorded after every epoch.
history = model.fit(x_train_nn, y_train, epochs=EPOCHS, batch_size=32, verbose = 0,
                    validation_data=(x_test_nn, y_test))

# R^2 of the fitted network on the training split.
y_train_pred = model.predict(x_train_nn)
r2_second = r2_score(y_train, y_train_pred)

# Sample the training and validation MAE every PLOT_EVERY epochs
# (history lists are 0-indexed, hence `epoch - 1`).
epochs_to_plot = list(range(PLOT_EVERY, EPOCHS + 1, PLOT_EVERY))
mae_second = [history.history['mean_absolute_error'][epoch - 1] for epoch in epochs_to_plot]
val_mae_second = [history.history['val_mean_absolute_error'][epoch - 1] for epoch in epochs_to_plot]

# Plot the MAE
plt.plot(epochs_to_plot, mae_second, 'b', label=f'Training MAE: {mae_second[-1]:.3f}')

# Plot the val_MAE
plt.plot(epochs_to_plot, val_mae_second, 'r', label=f'Test MAE: {val_mae_second[-1]:.3f}')
plt.xlabel('Epoch')
plt.ylabel('Mean absolute error')
plt.title('Neural network MAE by epoch')
plt.legend()
plt.show()

# Wall-clock time of the whole cell (model build, training, prediction, plot).
end = time.time()
elapsed_time_seconds = end - start
elapsed_time = (elapsed_time_seconds) / 60
elapsed_time_seconds = round(elapsed_time_seconds,2)
elapsed_time = round(elapsed_time,2)
print('Execution time:', elapsed_time_seconds, 'seconds which is:', elapsed_time, 'minutes.' )