In [1]:
import numpy
import ast 
import urllib
import scipy.optimize
import csv
import random
from sklearn import linear_model
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import gzip
from collections import defaultdict

In [2]:
# Load the raw user/recipe interaction rows (header skipped).
# Column order: user_id, recipe_id, date, rating, review.
#
# csv.reader is used instead of str.split(",") because the last column is
# free-text review content: a comma (or a line break inside a quoted field)
# would otherwise split one record into the wrong number of fields.
# newline="" is what the csv module asks for so that it can handle embedded
# line breaks itself.
dataset = []
with open("RAW_interactions.csv", encoding="utf8", newline="") as f:
    reader = csv.reader(f)
    next(reader, None)  # Skip the header row (no error if the file is empty)
    dataset = list(reader)

In [3]:
# Preview the first parsed interaction row.
# Field order within each row: user_id, recipe_id, date, rating, review
dataset[0]

['38094',
 '40893',
 '2003-02-17',
 '4',
 'Great with a salad. Cooked on top of stove for 15 minutes.Added a shake of cayenne and a pinch of salt.  Used low fat sour cream.  Thanks.']

In [4]:
# Collect every usable rating per recipe: recipe_id (str) -> list of int ratings.
recipe_ratings = {}

for row in dataset:
    # A row needs at least the first four columns to carry both an id and a rating.
    if len(row) < 4:
        continue

    recipe_id = row[1].strip()
    rating = row[3].strip()

    # Drop rows whose id is blank or whose rating is not made of digits only
    # (an empty rating string fails isdigit() as well).
    if not recipe_id or not rating.isdigit():
        continue

    recipe_ratings.setdefault(recipe_id, []).append(int(rating))

# Example output
#for recipe_id, ratings in recipe_ratings.items():
#    print(f"Recipe ID: {recipe_id}, Ratings: {ratings}")

In [5]:
# Load the raw recipe rows (header skipped).
# Column order: name, id, minutes, contributor_id, date, tags, nutrition,
# number of steps, steps, description, ingredients, n_ingredients
#
# Parsed with csv.reader rather than str.split(","): several of these columns
# hold list-like or free-text values containing commas, so a plain split
# would shift every later field out of position.
dataset_features = []
with open("RAW_recipes.csv", encoding="utf8", newline="") as f:
    reader = csv.reader(f)
    next(reader, None)  # Skip the header row (no error if the file is empty)
    dataset_features = list(reader)

In [6]:
# Per-recipe model inputs: recipe_id (str) -> {"n_ingredients": int}.
#
# NOTE: the name `recipe_nutrition` is historical. An earlier variant of this
# cell parsed the nutrition column (row[6]) with ast.literal_eval and stored
# "total_fat", "sugar" and "saturated_fat" (list positions 1, 2 and 5). It was
# parked inside a no-op triple-quoted string; that dead block has been removed
# so the cell shows only the code that actually runs.
recipe_nutrition = {}

with open("RAW_recipes.csv", encoding="utf8", newline="") as f:
    reader = csv.reader(f)
    next(reader, None)  # Skip the header row (no error if the file is empty)

    for row in reader:
        # Rows with fewer than 12 columns have no n_ingredients field to read.
        if len(row) < 12:
            continue

        recipe_id = row[1].strip()       # Second column: recipe id
        n_ingredients = row[11].strip()  # Twelfth column: ingredient count

        # Keep only rows whose ingredient count is made of digits only.
        if n_ingredients.isdigit():
            recipe_nutrition[recipe_id] = {
                "n_ingredients": int(n_ingredients)
            }

In [7]:
# Split the recipes 50/50 into a training and a test dictionary.

# Fixed seed so the split (and every number derived from it) is reproducible
# across kernel restarts. A dedicated Random instance is used so the global
# `random` state is left untouched.
SPLIT_SEED = 0

# Only recipes that also have at least one parsed rating can be used: the
# later cells look the target up in recipe_ratings and would raise KeyError
# for an id that is missing there.
all_ids = [recipe_id for recipe_id in recipe_nutrition if recipe_id in recipe_ratings]
random.Random(SPLIT_SEED).shuffle(all_ids)

# Calculate the split index (50/50); with an odd count the test set gets the extra id
split_index = len(all_ids) // 2

# Split keys into training and testing sets
train_ids = all_ids[:split_index]
test_ids = all_ids[split_index:]

# Create train and test dictionaries
train_set = {recipe_id: recipe_nutrition[recipe_id] for recipe_id in train_ids}
test_set = {recipe_id: recipe_nutrition[recipe_id] for recipe_id in test_ids}

print(f"Train set size: {len(train_set)}")
print(f"Test set size: {len(test_set)}")

Train set size: 115818
Test set size: 115819


In [8]:
def feature(datum):
    """Build the feature vector for a single recipe record.

    Args:
        datum: mapping that provides an 'n_ingredients' entry.

    Returns:
        A two-element list: a constant 1 (the bias term) followed by the
        record's 'n_ingredients' value.
    """
    bias = 1
    return [bias, datum['n_ingredients']]

# Fix the iteration order once so the rows of X_train and the entries of
# Y_train are guaranteed to refer to the same recipes.
train_recipe_ids = list(train_set.keys())

# Design matrix: one feature(...) row per training recipe.
X_train = numpy.asarray([feature(train_set[recipe_id]) for recipe_id in train_recipe_ids])

# NOTE(review): the scaler is fitted and X_train_scaled is computed, but the
# model below is trained on the unscaled X_train. Both names are kept because
# the evaluation cell calls scaler.transform. Standardizing the constant bias
# column would remove it, so switching the fit to the scaled matrix would also
# require fit_intercept=True -- confirm intent before changing.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Target: mean rating per recipe. recipe_ratings always maps an id to a list
# of ints, so no scalar fallback is needed.
Y_train = numpy.asarray([numpy.mean(recipe_ratings[recipe_id]) for recipe_id in train_recipe_ids])

# The bias is already a column of the feature vector, hence fit_intercept=False.
model = linear_model.LinearRegression(fit_intercept=False)
model.fit(X_train, Y_train)

print("Model coefficients:", model.coef_)

Model coefficients: [ 4.37044222e+00 -2.93794302e-03]


In [9]:
# Fix the iteration order once so the rows of X_test and the entries of
# Y_test are guaranteed to refer to the same recipes.
test_recipe_ids = list(test_set.keys())

# Design matrix: one feature(...) row per test recipe.
X_test = numpy.asarray([feature(test_set[recipe_id]) for recipe_id in test_recipe_ids])

# NOTE(review): computed for parity with the training cell, but predictions
# below use the unscaled X_test, matching how the model was fitted.
X_test_scaled = scaler.transform(X_test)

# Target: mean rating per recipe. recipe_ratings always maps an id to a list
# of ints, so no scalar fallback is needed.
Y_test = numpy.asarray([numpy.mean(recipe_ratings[recipe_id]) for recipe_id in test_recipe_ids])

# Predict using the model
Y_pred = model.predict(X_test)

# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(Y_test, Y_pred)
print(f"Mean Squared Error: {mse}")

def balanced_error_rate(y_true, y_pred):
    """Return the mean absolute relative error of y_pred with respect to y_true.

    Despite its name, this computes mean(|y_true - y_pred| / (|y_true| + epsilon)),
    not a classification balanced error rate.

    Args:
        y_true: array-like of reference values.
        y_pred: array-like of predictions, same shape as y_true.

    Returns:
        The mean relative error as a numpy float.
    """
    epsilon = 1e-10
    y_true = numpy.asarray(y_true, dtype=float)
    y_pred = numpy.asarray(y_pred, dtype=float)
    # epsilon belongs in the denominator: it guards against division by zero
    # when a reference value is exactly 0. Adding it to the quotient (as the
    # previous operator precedence did) gave no such protection.
    relative_errors = numpy.abs(y_true - y_pred) / (numpy.abs(y_true) + epsilon)
    return numpy.mean(relative_errors)

# Largest absolute gap between prediction and target that still counts as a hit.
tolerance = 0.25

# Boolean mask over the test set: True where the prediction is close enough.
within_tolerance = numpy.abs(Y_pred - Y_test) <= tolerance
correct_predictions = within_tolerance.sum()

# Express the hit count as a percentage of all test examples.
fraction_correct = correct_predictions / len(Y_test)
percent_correct = fraction_correct * 100
print(f"Percent Correct: {percent_correct:.2f}%")


Mean Squared Error: 0.9798872545545332
Percent Correct: 14.47%
