In [224]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
import pickle

In [None]:
# Read the full recipe table and the small external sample, in that order.
data, test_data = map(pd.read_csv, ('full_recipes.csv', 'for_model.csv'))

In [None]:
# Display the Python type of the loaded recipe table.
type(data)

pandas.core.frame.DataFrame

In [None]:
# Display the Python type of the loaded external sample.
type(test_data)

pandas.core.frame.DataFrame

In [None]:
# Print (rows, columns) for both frames.
print(f'data shape: {data.shape} , test_data { test_data.shape}')

data shape: (522517, 29) , test_data (2, 28)


In [None]:
# Columns kept for modelling: recipe id, rating, review count, the nutrition
# values, servings and the recipe category.
features = ['RecipeId', 'AggregatedRating', 'ReviewCount', 'Calories', 'FatContent', 'SaturatedFatContent',
            'CholesterolContent', 'SodiumContent', 'CarbohydrateContent', 'FiberContent', 'SugarContent',
            'ProteinContent', 'RecipeServings', 'RecipeCategory']

# Take explicit copies of the column subsets. Later cells assign into these
# frames (encoding, imputation); working on an owned copy keeps those
# assignments unambiguous and matches what preprocess_data() does.
data = data[features].copy()
test_data = test_data[features].copy()


In [None]:
# Learn the category -> integer mapping from the full recipe table only,
# then apply that same mapping to both frames.
encoder = LabelEncoder()
encoder.fit(data['RecipeCategory'])

data['RecipeCategory'] = encoder.transform(data['RecipeCategory'])
# NOTE(review): LabelEncoder.transform is documented to raise on labels it
# did not see during fit — confirm the external sample only uses known categories.
test_data['RecipeCategory'] = encoder.transform(test_data['RecipeCategory'])

In [None]:
# Serialize the fitted encoder to "encoder.pkl" (relative path, binary mode).
with open("encoder.pkl", "wb") as f:
    pickle.dump(encoder, f)

In [None]:
# Print (rows, columns) for both frames.
print(f'data shape: {data.shape} , test_data { test_data.shape}')

data shape: (522517, 14) , test_data (2, 14)


In [None]:
# Columns whose missing values are replaced by the column mean.
null_col = ['AggregatedRating', 'ReviewCount', 'RecipeServings']

# Learn the per-column means from the full recipe table only.
# NOTE(review): this happens before the train/test split and includes
# 'AggregatedRating', which is later used as the target — confirm intended.
imputer = SimpleImputer(strategy='mean')
imputer.fit(data[null_col])

# Apply the learned means to both frames, keeping each frame's own index so
# the assignment below aligns row for row.
data_imputed = pd.DataFrame(imputer.transform(data[null_col]), index=data.index, columns=null_col)
test_data_imputed = pd.DataFrame(imputer.transform(test_data[null_col]), index=test_data.index, columns=null_col)

# Write the filled columns back over the originals.
data[null_col] = data_imputed
test_data[null_col] = test_data_imputed

# 'RecipeCategory' is removed from both frames here.
data, test_data = (frame.drop(columns=['RecipeCategory']) for frame in (data, test_data))

In [None]:

# Serialize the fitted imputer to "imputer.pkl" (relative path, binary mode).
with open("imputer.pkl", "wb") as f:
    pickle.dump(imputer, f)

In [None]:
# Print (rows, columns) for both frames.
print(f'data shape: {data.shape} , test_data { test_data.shape}')

data shape: (522517, 13) , test_data (2, 13)


In [None]:
# Column that defines a query group, and the column used as the label.
recipe_col = 'RecipeId'
target_col = 'AggregatedRating'

# Hold out 20% as a blind test set, then split the remainder 70/30 into the
# part the model is fitted on and the part used for evaluation.
fit, blindtest = train_test_split(data, test_size=0.2, random_state=0)
fit_train, fit_test = train_test_split(fit, test_size=0.3, random_state=0)

# Order every split by recipe id so row order matches the per-recipe counts
# computed below (those are sorted by recipe id as well).
fit_train = fit_train.sort_values(recipe_col).reset_index(drop=True)
fit_test = fit_test.sort_values(recipe_col).reset_index(drop=True)
blindtest = blindtest.sort_values(recipe_col).reset_index(drop=True)

# Number of rows per recipe id, in ascending id order.
fit_train_query = [int(size) for size in fit_train[recipe_col].value_counts().sort_index()]
fit_test_query = [int(size) for size in fit_test[recipe_col].value_counts().sort_index()]
# blindtest_query is left as a pandas Series (it is not converted to a list).
blindtest_query = blindtest[recipe_col].value_counts().sort_index()

# Cast the label to int for the two splits passed to the ranker.
# astype(int) truncates, so e.g. 4.5 becomes 4.
for split_frame in (fit_train, fit_test):
    split_frame[target_col] = split_frame[target_col].astype(int)


In [None]:

# LightGBM ranker with n_estimators=1000 and a fixed random_state.
model = lgb.LGBMRanker(random_state=0, n_estimators=1000)

# NOTE(review): fit_train / fit_test are passed whole as the feature matrix,
# so target_col and recipe_col are also among the features — confirm intended.
# NOTE(review): groups are row counts per RecipeId; if every recipe id occurs
# once, each query has a single row — verify the grouping is meaningful.
model.fit(
    X=fit_train,
    y=fit_train[target_col],
    group=fit_train_query,
    eval_set=[(fit_test, fit_test[target_col])],
    eval_group=[list(fit_test_query)],
    eval_at=[1, 3, 5, 10],
)

# Scores for the evaluation split, one value per row of fit_test.
y_pred = model.predict(fit_test)

# Evaluation metrics recorded by LightGBM during fitting.
eval_results = model.best_score_



LightGBMError: label should be int type (met 4.500000) for ranking task,
for the gain of label, please set the label_gain parameter

In [None]:
# Score the evaluation split. Only the columns the model was fitted on
# (fit_train.columns) are passed: this cell and later ones add prediction
# columns to fit_test, and passing those extra columns on a re-run would no
# longer match the number of features the model was trained with.
predictions = model.predict(fit_test[fit_train.columns])

# Attach the scores to the evaluation split
fit_test['predicted_rating'] = predictions

# Highest score first
sorted_test = fit_test.sort_values('predicted_rating', ascending=False)

# RecipeId of the row with the highest predicted score
top_recipe_id = sorted_test['RecipeId'].iloc[0]

print(f"The RecipeId with the highest predicted rating is: {top_recipe_id}")


NotFittedError: Estimator not fitted, call fit before exploiting the model.

In [None]:
import pickle
# Serialize `model` to "new_lgbm_ranker.pkl" (relative path, binary mode).
# NOTE(review): nothing here checks that `model` has been fitted — confirm the
# training cell completed before relying on this file.
with open("new_lgbm_ranker.pkl", "wb") as f:
    pickle.dump(model, f)

print("Model saved successfully!")

Model saved successfully!


In [None]:
# Attach the ranker scores held in y_pred to the evaluation split and keep
# only the id and score columns for display.
fit_test['PredictedRanking'] = y_pred
result_df = fit_test[['RecipeId', 'PredictedRanking']]

# The ranker's output is a relevance score (higher = ranked higher), so the
# "top" predictions are the largest values: sort descending, consistent with
# the other ranking cells in this notebook.
result_df_sorted = result_df.sort_values(by='PredictedRanking', ascending=False)

# Display the top 10 predictions
top_predictions = result_df_sorted.head(10)
print(top_predictions)


       RecipeId  PredictedRanking
0            47               0.0
83610    361957               0.0
83609    361953               0.0
83608    361952               0.0
83607    361950               0.0
83606    361942               0.0
83605    361939               0.0
83604    361937               0.0
83603    361936               0.0
83602    361934               0.0


In [None]:
# Store the current y_pred on fit_test as 'PredictedRanking'
# (same assignment as at the start of the previous cell).
fit_test['PredictedRanking'] = y_pred

In [None]:
# Columns of the pre-split `fit` frame; used to pick the model inputs out of
# fit_test, which may carry extra prediction columns by now.
test_features = fit.columns

# Reproducible random half of the evaluation split.
test_subset = fit_test.sample(frac=0.5, random_state=0)

# Score the subset.
# NOTE(review): this rebinds y_pred to an array with one entry per subset row,
# replacing the predictions held for the full fit_test.
y_pred = model.predict(test_subset[test_features])

# Attach the scores and order the subset from highest to lowest score.
test_subset = test_subset.assign(PredictedRating=y_pred)
test_subset_sorted = test_subset.sort_values(by='PredictedRating', ascending=False)

# Show the 10 best-scored rows: 'RecipeId', 'PredictedRating' and the actual target.
display_cols = ['RecipeId', 'PredictedRating', target_col]
print(test_subset_sorted[display_cols].head(10))


        RecipeId  PredictedRating  AggregatedRating
120998    522378              0.0                 4
118148    510258              0.0                 4
107415    463991              0.0                 4
1178        7731              0.0                 4
48873     212042              0.0                 4
114242    493621              0.0                 4
96729     418111              0.0                 4
82709     357876              0.0                 4
46333     201087              0.0                 4
96304     416308              0.0                 5


In [None]:
import pandas as pd
import pickle
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

def load_pickle(file_name):
    """Return the object unpickled from ``file_name``, or None if no such path exists.

    Only use this on files written by this notebook: unpickling runs
    arbitrary code embedded in the file.
    """
    # Guard clause: a missing path is not an error here, the caller gets None.
    if not os.path.exists(file_name):
        return None

    with open(file_name, "rb") as handle:
        loaded = pickle.load(handle)
    return loaded

def save_pickle(obj, file_name):
    """Pickle ``obj`` into ``file_name``, replacing any existing file at that path."""
    # "wb" truncates an existing file before the new payload is written.
    with open(file_name, mode="wb") as handle:
        pickle.dump(obj, handle)

def preprocess_data(df, fit_encoder=True, fit_imputer=True):
    """Select the model columns, encode the category and mean-impute missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the columns listed in ``features`` below.
        It is not modified; all work happens on a copy.
    fit_encoder : bool, default True
        True: fit a new LabelEncoder on ``df['RecipeCategory']`` and save it to
        "encoder.pkl". False: load the encoder from "encoder.pkl".
    fit_imputer : bool, default True
        True: fit a new mean SimpleImputer on the ``null_col`` columns and save
        it to "imputer.pkl". False: load the imputer from "imputer.pkl".

    Returns
    -------
    pandas.DataFrame
        The ``features`` columns without 'RecipeCategory', with the
        ``null_col`` columns imputed.

    Raises
    ------
    FileNotFoundError
        If a saved encoder/imputer is requested (``fit_* = False``) but the
        corresponding pickle file does not exist.
    """
    features = ['RecipeId', 'AggregatedRating', 'ReviewCount', 'Calories', 'FatContent', 'SaturatedFatContent',
                'CholesterolContent', 'SodiumContent', 'CarbohydrateContent', 'FiberContent', 'SugarContent',
                'ProteinContent', 'RecipeServings', 'RecipeCategory']

    df = df[features].copy()  # Ensure a copy to avoid modifying the original DataFrame

    # Encode 'RecipeCategory'. The column is dropped again before returning,
    # so with fit_encoder=True the lasting effect is the saved encoder.
    if fit_encoder:
        encoder = LabelEncoder()
        df['RecipeCategory'] = encoder.fit_transform(df['RecipeCategory'])
        save_pickle(encoder, "encoder.pkl")  # Save the fitted encoder
    else:
        # NOTE: unpickling executes code from the file; only load artifacts
        # that were written by this notebook.
        encoder = load_pickle("encoder.pkl")
        if encoder is None:
            # load_pickle returns None for a missing file; fail with a clear
            # message instead of an AttributeError on None.transform.
            raise FileNotFoundError(
                "encoder.pkl not found; run preprocess_data with fit_encoder=True first"
            )
        df['RecipeCategory'] = encoder.transform(df['RecipeCategory'])

    # Select columns for imputation
    null_col = ['AggregatedRating', 'ReviewCount', 'RecipeServings']

    # Fit and transform for training, only transform for test data
    if fit_imputer:
        imputer = SimpleImputer(strategy='mean')
        imputed_values = imputer.fit_transform(df[null_col])
        save_pickle(imputer, "imputer.pkl")  # Save the fitted imputer
    else:
        imputer = load_pickle("imputer.pkl")
        if imputer is None:
            raise FileNotFoundError(
                "imputer.pkl not found; run preprocess_data with fit_imputer=True first"
            )
        imputed_values = imputer.transform(df[null_col])

    # Rebuild a frame on df's own index so the assignment aligns row for row.
    df[null_col] = pd.DataFrame(imputed_values, columns=null_col, index=df.index)

    # Drop 'RecipeCategory'
    df = df.drop(columns=['RecipeCategory'])

    return df

# Re-read the external sample and push it through preprocess_data, reusing the
# encoder and imputer saved on disk instead of fitting new ones.
test_data_2 = pd.read_csv('for_model.csv')
test_data = preprocess_data(df=test_data_2, fit_imputer=False, fit_encoder=False)


In [None]:
# Score the preprocessed external sample; this rebinds y_pred, replacing any
# predictions stored under that name by earlier cells.
y_pred = model.predict(test_data)

In [None]:
# Score the preprocessed external sample. The model is given the feature
# frame (test_data), not a previous prediction array.
predictions = model.predict(test_data)

# Attach the scores to a copy of the sample they were computed for (assign
# returns a new frame) and order it from highest to lowest score.
sorted_test = (
    test_data
    .assign(predicted_rating=predictions)
    .sort_values('predicted_rating', ascending=False)
)

# RecipeId of the row with the highest predicted score
top_recipe_id = sorted_test['RecipeId'].iloc[0]

print(f"The RecipeId with the highest predicted rating is: {top_recipe_id}")


NotFittedError: Estimator not fitted, call fit before exploiting the model.