In [None]:
# starter notebook content:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error

# Load the dataset
data = pd.read_csv('https://raw.githubusercontent.com/Etie043/Schulich_data_science-/refs/heads/main/Recipe%20Reviews%20and%20User%20Feedback%20Dataset.csv')

# Aggregate data by recipe -- EXAMPLE (adjust as needed)
agg_data = data.groupby('recipe_code').agg({
    'stars': 'mean',  # Average rating
    'comment_id': 'count',  # Count of comments as a proxy for engagement
    'best_score': 'mean'  # Average best score
}).reset_index()

# Rename columns to reflect theiry_predeaning
agg_data.rename(columns={
    'stars': 'avg_stars', 
    'comment_id': 'comment_count', 
    'best_score': 'avg_best_score'
}, inplace=True)

# Create a new 'popularity' score, possibly weighted
agg_data['popularity'] = agg_data['avg_stars'] * 0.5 + agg_data['comment_count'] * 0.3 + agg_data['avg_best_score'] * 0.2

# Normalize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(agg_data[['avg_stars', 'comment_count', 'avg_best_score']])
y = agg_data['popularity']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Step 7: Deep Learning Model (Starter Code)
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

def build_model(input_dim):
    model = Sequential([
        Dense(128, input_dim=input_dim, activation='relu'),  # ReLU activation function for hidden layer
        Dropout(0.2),  # Dropout for regularization
        Dense(64, activation='relu'),  # Another hidden layer
        Dropout(0.2),
        Dense(1, activation='linear')  # Linear activation for a regression output
    ])
    # Compile the model with an optimizer and loss function
    model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mean_absolute_error'])
    return model

# Instantiate and train the model
model = build_model(X_train.shape[1])
history = model.fit(X_train, y_train, validation_split=0.2, epochs=50, batch_size=32)

# Step 8: Model Evaluation
y_pred = model.predict(X_test).flatten()
print("\nMean Absolute Error (MAE):")
print(mean_absolute_error(y_test, y_pred))

Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 143ms/step - loss: 8941.5947 - mean_absolute_error: 88.0950 - val_loss: 7557.6807 - val_mean_absolute_error: 85.6930
Epoch 2/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step - loss: 8457.0938 - mean_absolute_error: 86.7262 - val_loss: 7537.5928 - val_mean_absolute_error: 85.5794
Epoch 3/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step - loss: 8340.0156 - mean_absolute_error: 86.0946 - val_loss: 7517.6045 - val_mean_absolute_error: 85.4657
Epoch 4/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 66ms/step - loss: 8153.7505 - mean_absolute_error: 84.8097 - val_loss: 7497.9717 - val_mean_absolute_error: 85.3534
Epoch 5/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step - loss: 8372.0420 - mean_absolute_error: 86.2722 - val_loss: 7478.6997 - val_mean_absolute_error: 85.2429
Epoch 6/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1

In [3]:
agg_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   recipe_code     100 non-null    int64  
 1   avg_stars       100 non-null    float64
 2   comment_count   100 non-null    int64  
 3   avg_best_score  100 non-null    float64
 4   popularity      100 non-null    float64
dtypes: float64(3), int64(2)
memory usage: 4.0 KB


In [4]:
#Dropping variables
agg_data.isnull().sum()

recipe_code       0
avg_stars         0
comment_count     0
avg_best_score    0
popularity        0
dtype: int64

In [5]:
agg_data

Unnamed: 0,recipe_code,avg_stars,comment_count,avg_best_score,popularity
0,386,4.290179,224,167.714286,102.887946
1,414,4.179641,167,198.958084,91.981437
2,957,4.303318,211,139.559242,93.363507
3,1063,4.493671,158,135.341772,76.715190
4,1081,4.141892,148,129.216216,72.314189
...,...,...,...,...,...
95,74724,4.224490,147,118.734694,69.959184
96,82745,4.369748,119,208.865546,79.657983
97,100276,4.317073,164,161.817073,83.721951
98,141947,4.570175,114,219.684211,80.421930


In [6]:
agg_data.describe()

Unnamed: 0,recipe_code,avg_stars,comment_count,avg_best_score,popularity
count,100.0,100.0,100.0,100.0,100.0
mean,24356.35,4.252851,181.82,156.193806,87.911186
std,27500.396712,0.32498,106.803641,33.641169,31.738382
min,386.0,2.614973,31.0,105.514019,48.309677
25%,7741.0,4.098373,128.0,131.082339,70.803335
50%,17568.0,4.31732,149.0,148.432197,81.864039
75%,33894.0,4.455943,191.75,178.319126,93.020289
max,191775.0,4.731343,725.0,253.015748,247.060828


In [7]:
# Define bin edges based on min, percentiles, and max
bin_edges = [agg_data['popularity'].min(), 70.803335, 93.020289, agg_data['popularity'].max()]
bin_labels = ['Low', 'Medium', 'High']

# Bin the popularity scores into Low, Medium, and High categories
agg_data['popularity_bin'] = pd.cut(agg_data['popularity'], bins=bin_edges, labels=bin_labels, include_lowest=True)

# Convert the bins to numeric for modeling (Low=0, Medium=1, High=2)
agg_data['popularity_bin'] = agg_data['popularity_bin'].map({'Low': 0, 'Medium': 1, 'High': 2})

# Print to verify the binning
print(agg_data[['popularity', 'popularity_bin']].head())


   popularity popularity_bin
0  102.887946              2
1   91.981437              1
2   93.363507              2
3   76.715190              1
4   72.314189              1


In [8]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import classification_report

# Define EarlyStopping to prevent overfitting
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=3,  # Stop training after 3 epochs of no improvement
    restore_best_weights=True  # Restore the best weights after training
)

# Convert the labels to one-hot encoding
y_train_encoded = to_categorical(y_train, num_classes=3)
y_test_encoded = to_categorical(y_test, num_classes=3)

# Define the RNN model for multi-class classification
def build_rnn_model(input_dim, input_length):
    model = Sequential([
        Embedding(input_dim=10000, output_dim=128, input_length=input_length),  # Embedding layer
        LSTM(64, return_sequences=False),  # LSTM layer
        Dropout(0.5),  # Dropout for regularization
        Dense(32, activation='relu'),  # Fully connected layer
        BatchNormalization(),  # Batch normalization
        Dense(3, activation='softmax')  # Output layer for multi-class classification
    ])
    model.compile(
        optimizer=Adam(learning_rate=0.001),
        loss='categorical_crossentropy',  # Multi-class classification loss
        metrics=['accuracy']  # Accuracy metric
    )
    return model

# Build the RNN model
input_length = X_train.shape[1]  # Number of input features
rnn_model = build_rnn_model(input_dim=10000, input_length=input_length)

# Train the model
history = rnn_model.fit(
    X_train, y_train_encoded,  # Ensure X_train is numerical and y_train_encoded is one-hot encoded
    validation_data=(X_test, y_test_encoded),  # Validation set
    epochs=10,  # Adjust as needed
    batch_size=32,  # Adjust based on dataset size
    callbacks=[early_stopping],
    verbose=1
)

# Evaluate the model
y_pred_probs = rnn_model.predict(X_test)
y_pred = y_pred_probs.argmax(axis=1)

print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=['Low', 'Medium', 'High']))


IndexError: index 129 is out of bounds for axis 1 with size 3

In [9]:
print(agg_data['popularity_bin'].value_counts())


popularity_bin
1    50
0    25
2    25
Name: count, dtype: int64


In [12]:
# Correct binning logic if needed
bin_edges = [agg_data['popularity'].min(), 70.803335, 93.020289, agg_data['popularity'].max()]
bin_labels = ['Low', 'Medium', 'High']
agg_data['popularity_bin'] = pd.cut(agg_data['popularity'], bins=bin_edges, labels=bin_labels, include_lowest=True)
agg_data['popularity_bin'] = agg_data['popularity_bin'].map({'Low': 0, 'Medium': 1, 'High': 2})


In [None]:
X = agg_data[['avg_stars', 'comment_count', 'avg_best_score']].values
y = agg_data['popularity_bin'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [14]:
y_train_encoded = to_categorical(y_train, num_classes=3)
y_test_encoded = to_categorical(y_test, num_classes=3)


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import classification_report

# Define EarlyStopping to prevent overfitting
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=3,  # Stop training after 3 epochs of no improvement
    restore_best_weights=True  # Restore the best weights after training
)

# Convert the labels to one-hot encoding
y_train_encoded = to_categorical(y_train, num_classes=3)
y_test_encoded = to_categorical(y_test, num_classes=3)

# Define the RNN model for multi-class classification
def build_rnn_model(input_dim, input_length):
    model = Sequential([
        Embedding(input_dim=10000, output_dim=128, input_length=input_length),  # Embedding layer
        LSTM(64, return_sequences=False),  # LSTM layer
        Dropout(0.5),  # Dropout for regularization
        Dense(32, activation='relu'),  # Fully connected layer
        BatchNormalization(),  # Batch normalization
        Dense(3, activation='softmax')  # Output layer for multi-class classification
    ])
    model.compile(
        optimizer=Adam(learning_rate=0.001),
        loss='categorical_crossentropy',  # Multi-class classification loss
        metrics=['accuracy']  # Accuracy metric
    )
    return model

# Build the RNN model
input_length = X_train.shape[1]  # Number of input features
rnn_model = build_rnn_model(input_dim=10000, input_length=input_length)

# Train the model
history = rnn_model.fit(
    X_train, y_train_encoded,  # Ensure X_train is numerical and y_train_encoded is one-hot encoded
    validation_data=(X_test, y_test_encoded),  # Validation set
    epochs=10,  # Adjust as needed
    batch_size=32,  # Adjust based on dataset size
    callbacks=[early_stopping],
    verbose=1
)

# Evaluate the model
y_pred_probs = rnn_model.predict(X_test)
y_pred = y_pred_probs.argmax(axis=1)

print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=['Low', 'Medium', 'High']))




Epoch 1/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 162ms/step - accuracy: 0.3891 - loss: 1.1187 - val_accuracy: 0.5000 - val_loss: 1.0963
Epoch 2/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step - accuracy: 0.4547 - loss: 1.0427 - val_accuracy: 0.5000 - val_loss: 1.0934
Epoch 3/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step - accuracy: 0.5391 - loss: 0.9747 - val_accuracy: 0.5000 - val_loss: 1.0911
Epoch 4/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step - accuracy: 0.6828 - loss: 0.8894 - val_accuracy: 0.5000 - val_loss: 1.0885
Epoch 5/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step - accuracy: 0.7336 - loss: 0.8102 - val_accuracy: 0.5000 - val_loss: 1.0864
Epoch 6/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step - accuracy: 0.8461 - loss: 0.7287 - val_accuracy: 0.5000 - val_loss: 1.0841
Epoch 7/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


: 