In [149]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

In [150]:
df = pd.read_csv('full_recipes.csv')
review = pd.read_csv('resource/reviews.csv')

KeyboardInterrupt: 

In [209]:
features = ['RecipeId', 'AggregatedRating', 'ReviewCount', 'Calories', 'FatContent', 'SaturatedFatContent',
            'CholesterolContent', 'SodiumContent', 'CarbohydrateContent', 'FiberContent', 'SugarContent',
            'ProteinContent', 'RecipeServings', 'RecipeCategory']

recipes = df[features]
merged_df = recipes.merge(review, left_on='RecipeId', right_on='ReviewId', how='inner')


In [210]:
review.columns

Index(['ReviewId', 'RecipeId', 'AuthorId', 'AuthorName', 'Rating', 'Review',
       'DateSubmitted', 'DateModified'],
      dtype='object')

In [211]:
merge_df = merged_df.dropna(subset=['RecipeCategory'])


In [212]:
merged_df.drop(columns=['AuthorName','AuthorId','RecipeId_y','DateSubmitted','DateModified','Review'], inplace=True)

In [213]:
# One-hot encode using pandas
encoded_df = pd.get_dummies(merged_df['RecipeCategory'], drop_first=False)

# The result will already have 1 and 0 instead of True and False
print(encoded_df.head())


   < 15 Mins  < 30 Mins  < 4 Hours  < 60 Mins  African  Apple  Apple Pie  \
0      False      False      False      False    False  False      False   
1      False      False      False      False    False  False      False   
2      False      False      False      False    False  False      False   
3      False      False      False      False    False  False      False   
4      False      False      False      False    False  False      False   

   Artichoke  Asian  Australian  ...  Wheat Bread  White Rice  Whitefish  \
0      False  False       False  ...        False       False      False   
1      False  False       False  ...        False       False      False   
2      False  False       False  ...        False       False      False   
3      False  False       False  ...        False       False      False   
4      False  False       False  ...        False       False      False   

   Whole Chicken  Whole Duck  Whole Turkey  Wild Game  Winter  \
0          False     

In [214]:
encoded_df.columns

Index(['< 15 Mins', '< 30 Mins', '< 4 Hours', '< 60 Mins', 'African', 'Apple',
       'Apple Pie', 'Artichoke', 'Asian', 'Australian',
       ...
       'Wheat Bread', 'White Rice', 'Whitefish', 'Whole Chicken', 'Whole Duck',
       'Whole Turkey', 'Wild Game', 'Winter', 'Yam/Sweet Potato',
       'Yeast Breads'],
      dtype='object', length=308)

In [215]:
imputer = SimpleImputer(strategy='mean')
# Apply SimpleImputer only on the selected columns
imputed_values = imputer.fit_transform(merged_df[['AggregatedRating', 'ReviewCount', 'RecipeServings']])

# Assign the imputed values back to the original columns in merged_df
merged_df[['AggregatedRating', 'ReviewCount', 'RecipeServings']] = imputed_values

In [None]:
temp_features = merged_df['RecipeCategory']

In [None]:
# merged_df.dropna(subset=['RecipeCategory'], inplace=True)


RecipeId_x               0
AggregatedRating         0
ReviewCount              0
Calories                 0
FatContent               0
SaturatedFatContent      0
CholesterolContent       0
SodiumContent            0
CarbohydrateContent      0
FiberContent             0
SugarContent             0
ProteinContent           0
RecipeServings           0
RecipeCategory         688
ReviewId                 0
Rating                   0
dtype: int64


In [218]:
# Drop the 'RecipeCategory' column from the original DataFrame
merged_df = merged_df.drop(columns=['RecipeCategory'])
merged_df = pd.concat([merged_df, encoded_df], axis=1)


In [224]:
fit, blindtest = train_test_split(merged_df, test_size=0.2, random_state=0)
fit_train, fit_test = train_test_split(fit, test_size=0.3, random_state=0)
features +=  df['RecipeCategory'].unique().tolist()
review_col = 'ReviewId'
recipe_col = 'RecipeId_x'
target_col = 'Rating'

fit_train = fit_train.sort_values('ReviewId').reset_index(drop=True)
fit_test = fit_test.sort_values('ReviewId').reset_index(drop=True)
blindtest = blindtest.sort_values('ReviewId').reset_index(drop=True)

fit_train_query = fit_train[review_col].value_counts().sort_index()
fit_test_query = fit_test[review_col].value_counts().sort_index()
blindtest_query = blindtest[review_col].value_counts().sort_index()



In [None]:
model = lgb.LGBMRanker(n_estimators=1000, random_state=0)
model.fit(
    fit_train,
    fit_train[target_col],
    group=fit_train_query,
    eval_set=[(fit_test, fit_test[target_col])],
    eval_group=[list(fit_test_query)],
    eval_at=[1, 3, 5, 10], 
)
# Make predictions on the test set
y_pred = model.predict(fit_test)

# Evaluate the model using the metrics provided by LightGBM
eval_results = model.best_score_  # Get the evaluation results



Evaluation Results:
defaultdict(<class 'collections.OrderedDict'>, {'valid_0': OrderedDict({'ndcg@1': 1.0, 'ndcg@3': 1.0, 'ndcg@5': 1.0, 'ndcg@10': 1.0})})
Best NDCG@1: 1.0
Best NDCG@3: 1.0
Best NDCG@5: 1.0
Best NDCG@10: 1.0


In [234]:
fit_test['PredictedRanking'] = y_pred
result_df = fit_test[['RecipeId_x', 'PredictedRanking']]

# Sort by predicted ranking
result_df_sorted = result_df.sort_values(by='PredictedRanking', ascending=True)

# Display the top 10 predictions
top_predictions = result_df_sorted.head(10)
print(top_predictions)


       RecipeId_x  PredictedRanking
0              42               0.0
77223      356048               0.0
77222      356045               0.0
77221      356044               0.0
77220      356043               0.0
77219      356032               0.0
77218      356030               0.0
77217      356026               0.0
77216      356014               0.0
77215      356012               0.0


In [238]:
test_subset = fit_test.sample(frac=0.5, random_state=0)
# Predict on the subset of the test set
test_features = fit.columns
y_pred = model.predict(test_subset[test_features])

# Add the predictions to the test subset
test_subset['PredictedRating'] = y_pred


In [240]:
# Sort by predicted ratings and display the top results
test_subset_sorted = test_subset.sort_values(by='PredictedRating', ascending=False)

# Display top 10 rows with 'RecipeId_x', 'PredictedRating', and actual 'Rating'
print(test_subset_sorted[['RecipeId_x', 'PredictedRating', target_col]].head(10))


       RecipeId_x  PredictedRating  Rating
82850      381468              0.0       4
8086        41429              0.0       5
26263      124146              0.0       4
15285       74590              0.0       5
51983      240830              0.0       4
58261      270029              0.0       5
70534      325871              0.0       5
96075      446199              0.0       5
41930      194965              0.0       4
64032      295741              0.0       5


In [244]:
# List of RecipeId_x values to filter
recipe_ids = [240830]

# Filter the DataFrame based on RecipeId_x
filtered_df = df[df['RecipeId'].isin(recipe_ids)]

# Print the filtered rows
print(filtered_df)


        RecipeId                    Name  AuthorId AuthorName CookTime  \
231192    240830  Scallop and Bacon Toss    137911   Pam-I-Am    PT15M   

       PrepTime TotalTime         DatePublished  \
231192    PT15M     PT30M  2007-07-16T22:30:00Z   

                                              Description RecipeCategory  ...  \
231192  Make and share this Scallop and Bacon Toss rec...  One Dish Meal  ...   

       SodiumContent CarbohydrateContent FiberContent  SugarContent  \
231192         222.7                32.2          2.0           1.9   

        ProteinContent  RecipeServings  RecipeYield  \
231192            16.8             6.0          NaN   

                                       RecipeInstructions  \
231192  c("Cook vermicelli as package directs drain.",...   

                                                   Images  \
231192  c("https://img.sndimg.com/food/image/upload/w_...   

                                                     text  
231192  Scallop and Bacon