In [87]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder


In [88]:
df = pd.read_csv('full_recipes.csv')

In [89]:
df.columns

Index(['RecipeId', 'Name', 'AuthorId', 'AuthorName', 'CookTime', 'PrepTime',
       'TotalTime', 'DatePublished', 'Description', 'RecipeCategory',
       'Keywords', 'RecipeIngredientQuantities', 'RecipeIngredientParts',
       'AggregatedRating', 'ReviewCount', 'Calories', 'FatContent',
       'SaturatedFatContent', 'CholesterolContent', 'SodiumContent',
       'CarbohydrateContent', 'FiberContent', 'SugarContent', 'ProteinContent',
       'RecipeServings', 'RecipeYield', 'RecipeInstructions', 'Images',
       'text'],
      dtype='object')

In [90]:
df.shape

(522517, 29)

In [106]:
features = ['RecipeId', 'AggregatedRating', 'ReviewCount', 'Calories', 'FatContent', 'SaturatedFatContent',
            'CholesterolContent', 'SodiumContent', 'CarbohydrateContent', 'FiberContent', 'SugarContent',
            'ProteinContent', 'RecipeServings', 'RecipeCategory']

recipes = df[features]

In [92]:
recipes.shape

(522517, 14)

In [93]:
merged_df = recipes.dropna(subset=['RecipeCategory'])


In [94]:
merged_df.shape

(521766, 14)

In [95]:
# One-hot encode using pandas
encoded_df = pd.get_dummies(merged_df['RecipeCategory'], drop_first=False)

# The result will already have 1 and 0 instead of True and False
print(encoded_df.head())


   < 15 Mins  < 30 Mins  < 4 Hours  < 60 Mins  African  Apple  Apple Pie  \
0      False      False      False      False    False  False      False   
1      False      False      False      False    False  False      False   
2      False      False      False      False    False  False      False   
3      False      False      False      False    False  False      False   
4      False      False      False      False    False  False      False   

   Artichoke  Asian  Australian  ...  Wheat Bread  White Rice  Whitefish  \
0      False  False       False  ...        False       False      False   
1      False  False       False  ...        False       False      False   
2      False  False       False  ...        False       False      False   
3      False  False       False  ...        False       False      False   
4      False  False       False  ...        False       False      False   

   Whole Chicken  Whole Duck  Whole Turkey  Wild Game  Winter  \
0          False     

In [96]:
encoded_df.columns

Index(['< 15 Mins', '< 30 Mins', '< 4 Hours', '< 60 Mins', 'African', 'Apple',
       'Apple Pie', 'Artichoke', 'Asian', 'Australian',
       ...
       'Wheat Bread', 'White Rice', 'Whitefish', 'Whole Chicken', 'Whole Duck',
       'Whole Turkey', 'Wild Game', 'Winter', 'Yam/Sweet Potato',
       'Yeast Breads'],
      dtype='object', length=311)

In [97]:
imputer = SimpleImputer(strategy='mean')
print(merged_df.shape)
imputed_values = imputer.fit_transform(merged_df[['AggregatedRating', 'ReviewCount', 'RecipeServings']])
print(merged_df.shape)
# Assign the imputed values back to the original columns in merged_df
merged_df[['AggregatedRating', 'ReviewCount', 'RecipeServings']] = imputed_values

(521766, 14)
(521766, 14)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df[['AggregatedRating', 'ReviewCount', 'RecipeServings']] = imputed_values


In [98]:
merged_df.shape

(521766, 14)

In [100]:
# Drop the 'RecipeCategory' column from the original DataFrame
merged_df = merged_df.drop(columns=['RecipeCategory'])
merged_df = pd.concat([merged_df, encoded_df], axis=1)


In [101]:
merged_df.shape

(521766, 324)

In [102]:
fit, blindtest = train_test_split(merged_df, test_size=0.2, random_state=0)
fit_train, fit_test = train_test_split(fit, test_size=0.3, random_state=0)
features +=  df['RecipeCategory'].unique().tolist()
recipe_col = 'RecipeId'
target_col = 'AggregatedRating'

fit_train = fit_train.sort_values('RecipeId').reset_index(drop=True)
fit_test = fit_test.sort_values('RecipeId').reset_index(drop=True)
blindtest = blindtest.sort_values('RecipeId').reset_index(drop=True)

fit_train_query = fit_train[recipe_col].value_counts().sort_index()
fit_test_query = fit_test[recipe_col].value_counts().sort_index()
blindtest_query = blindtest[recipe_col].value_counts().sort_index()



In [103]:
fit.shape

(417412, 324)

In [None]:
fit_train_query = list(map(int, fit_train_query))
fit_test_query = list(map(int, fit_test_query))

fit_train[target_col] = fit_train[target_col].astype(int)
fit_test[target_col] = fit_test[target_col].astype(int)

model = lgb.LGBMRanker(n_estimators=1000, random_state=0)
model.fit(
    fit_train,
    fit_train[target_col],
    group=fit_train_query,
    eval_set=[(fit_test, fit_test[target_col])],
    eval_group=[list(fit_test_query)],
    eval_at=[1, 3, 5, 10], 
)
# Make predictions on the test set
y_pred = model.predict(fit_test)

# Evaluate the model using the metrics provided by LightGBM
eval_results = model.best_score_  # Get the evaluation results



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002225 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3361
[LightGBM] [Info] Number of data points in the train set: 292188, number of used features: 239


In [None]:
fit_test.shape

(125224, 325)

In [None]:
fit_test['PredictedRanking'] = y_pred
result_df = fit_test[['RecipeId', 'PredictedRanking']]

# Sort by predicted ranking
result_df_sorted = result_df.sort_values(by='PredictedRanking', ascending=True)

# Display the top 10 predictions
top_predictions = result_df_sorted.head(10)
print(top_predictions)


       RecipeId  PredictedRanking
0            39               0.0
83489    361898               0.0
83488    361895               0.0
83487    361894               0.0
83486    361893               0.0
83485    361882               0.0
83484    361881               0.0
83483    361877               0.0
83482    361874               0.0
83481    361872               0.0


In [None]:
test_subset = fit_test.sample(frac=0.5, random_state=0)
# Predict on the subset of the test set
test_features = fit.columns
y_pred = model.predict(test_subset[test_features])

# Add the predictions to the test subset
test_subset['PredictedRating'] = y_pred


In [None]:
test_subset.shape

(62612, 326)

In [None]:
len(test_subset.columns)

326

In [None]:
# Sort by predicted ratings and display the top results
test_subset_sorted = test_subset.sort_values(by='PredictedRating', ascending=False)

# Display top 10 rows with 'RecipeId_x', 'PredictedRating', and actual 'Rating'
print(test_subset_sorted[['RecipeId', 'PredictedRating', target_col]].head(10))


        RecipeId  PredictedRating  AggregatedRating
39220     171501              0.0                 4
33432     146195              0.0                 5
55473     241281              0.0                 5
114087    493382              0.0                 5
89101     385826              0.0                 5
112434    486109              0.0                 4
115141    497785              0.0                 4
72472     314860              0.0                 5
110310    477012              0.0                 4
72083     313306              0.0                 4


In [None]:
# List of RecipeId_x values to filter
recipe_ids = [381468,41429,124146,240830]

# Filter the DataFrame based on RecipeId_x
filtered_df = df[df['RecipeId'].isin(recipe_ids)]

# Print the filtered rows
print(filtered_df['Name'])


37672     Zucchini, Red Pepper & Leek Frittata
117964                              Tofu Tacos
231192                  Scallop and Bacon Toss
368088    Southwest Corn &amp; Green Chili Dip
Name: Name, dtype: object


In [None]:
import pickle

# Save the model to a file
with open("lgbm_ranker.pkl", "wb") as f:
    pickle.dump(model, f)

print("Model saved successfully!")


Model saved successfully!


In [None]:
top_predictions.to_csv('top_predictions.csv', index=False)


In [None]:
suggestion = pd.read_csv('resource/top_predictions.csv')
print(suggestion['RecipeId'])

0        39
1    361898
2    361895
3    361894
4    361893
5    361882
6    361881
7    361877
8    361874
9    361872
Name: RecipeId, dtype: int64


In [104]:
testing_data = pd.read_csv('for_model.csv')
print(testing_data.shape)

(2, 28)


In [107]:
features_data = testing_data[features]
print(features_data.shape)

(2, 14)


In [58]:
with open('lgbm_ranker.pkl', 'rb') as file:
    model = pickle.load(file)

In [None]:
features_data = features_data.drop(columns='RecipeCategory')

In [110]:
encoded_testing_data.columns

Index(['< 15 Mins', '< 30 Mins', '< 4 Hours', '< 60 Mins', 'African', 'Apple',
       'Apple Pie', 'Artichoke', 'Asian', 'Australian',
       ...
       'Wheat Bread', 'White Rice', 'Whitefish', 'Whole Chicken', 'Whole Duck',
       'Whole Turkey', 'Wild Game', 'Winter', 'Yam/Sweet Potato',
       'Yeast Breads'],
      dtype='object', length=311)

In [65]:
encoded_testing_data = pd.get_dummies(testing_data['RecipeCategory'], drop_first=False)
encoded_df, encoded_testing_data = encoded_df.align(encoded_testing_data, join='left', axis=1, fill_value=0)
print(encoded_testing_data.head())


   < 15 Mins  < 30 Mins  < 4 Hours  < 60 Mins  African  Apple  Apple Pie  \
0          0          0          0          0        0      0          0   
1          0          0          0          0        0      0          0   

   Artichoke  Asian  Australian  ...  Wheat Bread  White Rice  Whitefish  \
0          0      0           0  ...            0           0          0   
1          0      0           0  ...            0           0          0   

   Whole Chicken  Whole Duck  Whole Turkey  Wild Game  Winter  \
0              0           0             0          0       0   
1              0           0             0          0       0   

   Yam/Sweet Potato  Yeast Breads  
0                 0             0  
1                 0             0  

[2 rows x 311 columns]


In [66]:
test_subset.shape

(62612, 326)

In [108]:
imputed_values = imputer.fit_transform(encoded_testing_data[['AggregatedRating', 'ReviewCount', 'RecipeServings']])
encoded_testing_data[['AggregatedRating', 'ReviewCount', 'RecipeServings']] = imputed_values

KeyError: "None of [Index(['AggregatedRating', 'ReviewCount', 'RecipeServings'], dtype='object')] are in the [columns]"