In [2]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv('dataset2.csv')

In [4]:
# Convert the data types of the columns as needed
df['Fats (g)'] = df['Fats (g)'].astype(float)
df['Sugars (g)'] = df['Sugars (g)'].astype(float)
df['Carbohydrates (g)'] = df['Carbohydrates (g)'].astype(float)
df['Fiber (g)'] = df['Fiber (g)'].astype(float)
df['Protein (g)'] = df['Protein (g)'].astype(float)
df['Salts (g)'] = df['Salts (g)'].astype(float)
df['Energy (kcal)'] = df['Energy (kcal)'].astype(float)
df['target'] = df['target'].astype(float)

In [5]:
from sklearn.preprocessing import OneHotEncoder

# Create a OneHotEncoder object
one_hot_encoder = OneHotEncoder(sparse=False)

# Fit and transform the 'Product Name' column
one_hot_encoded = one_hot_encoder.fit_transform(df[['Product Name']])

# Create a DataFrame from the one-hot encoded array
feature_names = one_hot_encoder.categories_[0]
one_hot_df = pd.DataFrame(one_hot_encoded, columns=feature_names)

# Concatenate the original DataFrame with the one-hot encoded DataFrame
df_encoded = pd.concat([df.drop('Product Name', axis=1), one_hot_df], axis=1)

# Now 'Product Name' is one-hot encoded



In [6]:
df.to_csv('dataset2.csv.csv', index=False)

In [7]:
df.info()

df.groupby(['Allergies', 'Diseases']).size()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48 entries, 0 to 47
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Product Name       48 non-null     object 
 1   Fats (g)           48 non-null     float64
 2   Sugars (g)         48 non-null     float64
 3   Carbohydrates (g)  48 non-null     float64
 4   Fiber (g)          48 non-null     float64
 5   Protein (g)        48 non-null     float64
 6   Salts (g)          48 non-null     float64
 7   Energy (kcal)      48 non-null     float64
 8   Allergies          48 non-null     int64  
 9   Diseases           48 non-null     int64  
 10  target             48 non-null     float64
dtypes: float64(8), int64(2), object(1)
memory usage: 4.3+ KB


Allergies  Diseases
0          3            1
1          2            1
2          3            1
3          3            3
4          3            3
5          3            2
6          0            1
           1            2
           3           26
           4            1
7          3            3
8          3            2
9          3            2
dtype: int64

In [8]:
# Check if there are any non-numeric columns in the DataFrame
non_numeric_columns = df.select_dtypes(exclude=['int64', 'float64']).columns

if non_numeric_columns.empty:
    print("All data is properly encoded.")
else:
    print(f"The following columns have non-numeric data: {non_numeric_columns}")

The following columns have non-numeric data: Index(['Product Name'], dtype='object')


In [9]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np
# One-hot encode the categorical features
df_encoded = pd.get_dummies(df)

# Now split the data into features and target variable
X = df_encoded.drop('target', axis=1)
y = df_encoded['target']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a Random Forest Regressor
reg = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model
reg.fit(X_train, y_train)

# Make predictions
y_pred = reg.predict(X_test)

# Calculate the root mean squared error
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(f'Root Mean Squared Error: {rmse}')

Root Mean Squared Error: 1.7831965679643957


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300, 400,500],
    'max_depth': [None, 10, 20, 30, 40],
    'min_samples_split': [2, 5, 10,20],
    'min_samples_leaf': [1, 2, 4, 8, 16],
    'bootstrap': [True, False]
}

# Create a base model
rf = RandomForestRegressor()

# Instantiate the grid search model
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_

print(f'Best parameters: {best_params}')

In [None]:
# Create a new model with the best parameters
best_rf = RandomForestRegressor(
    bootstrap=True,
    max_depth=20,
    min_samples_leaf=2,
    min_samples_split=10,
    n_estimators=100,
    random_state=42  # for reproducibility
)



# Train the model
best_rf.fit(X_train, y_train)

# Make predictions on the training set
y_train_pred = best_rf.predict(X_train)

# Make predictions on the test set
y_test_pred = best_rf.predict(X_test)

# Calculate the root mean squared error for the training set
rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))

# Calculate the root mean squared error for the test set
rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))

print(f'Training RMSE: {rmse_train}')
print(f'Test RMSE: {rmse_test}')

Training RMSE: 1.461768336620691
Test RMSE: 1.7356853836232446


In [13]:
# Create a new model with the best parameters
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score

best_rf = RandomForestRegressor(
    bootstrap=False,
    max_depth=None,
    min_samples_leaf=8,
    min_samples_split=20,
    n_estimators=100,
    random_state=42  # for reproducibility
)



# Train the model
best_rf.fit(X_train, y_train)

# Make predictions on the training set
y_train_pred = best_rf.predict(X_train)

# Make predictions on the test set
y_test_pred = best_rf.predict(X_test)

# Calculate the root mean squared error for the training set
rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))

# Calculate the root mean squared error for the test set
rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))

rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(f'Root Mean Squared Error: {rmse}')

print(f'Training RMSE: {rmse_train}')
print(f'Test RMSE: {rmse_test}')

# Calculate the R^2 score
r2 = r2_score(y_test, y_pred)

# Print the R^2 score
print("R^2 score:", r2)

Root Mean Squared Error: 1.7831965679643957
Training RMSE: 1.6211492836873476
Test RMSE: 1.7684032345593588
R^2 score: 0.2773204545454546


In [None]:
# Assuming new_data is a DataFrame with the same structure as your training data
new_data = pd.DataFrame({
    'Product Name': ['Almonds', 'Apple', 'Avocado', 'Black beans (canned)', 'Blackberries'],
    'Fats (g)': [2.5, 0.3, 15, 0.5, 0.5],
    'Carbohydrates (g)': [23.4, 11.4, 8.5, 7.5, 10.2],
    'Sugars (g)': [2.5, 10.4, 0.7, 0.0, 4.9],
    'Protein (g)': [1.5, 0.3, 2.0, 1.5, 1.4],
    'Energy (kcal)': [100, 52, 160, 90, 43],
    'Allergies': [0, 0, 0, 0, 0],
    'Diseases': [0, 0, 0, 0, 0],
    'Fiber (g)': [0, 2.4, 6.7, 6.0, 5.3],
})

# One-hot encode the 'Product Name' column
new_data_encoded = pd.get_dummies(new_data, columns=['Product Name'])

# Make sure the new data has the same columns as the training data
missing_cols = set(X_train.columns) - set(new_data_encoded.columns)
for c in missing_cols:
    new_data_encoded[c] = 0
new_data_encoded = new_data_encoded[X_train.columns]

# Now you can use the encoded data to make a prediction
new_pred = best_rf.predict(new_data_encoded)

print(f'Prediction for the new data: {new_pred}')

Prediction for the new data: [7.9 7.9 7.9 7.9 7.9]
