In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
kaggle_input_files = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for file_path in kaggle_input_files:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Load basic libraries**

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score

# **Load the datasets**

In [None]:
# Directory holding the competition CSVs. Change this one constant to run the
# notebook against a different copy of the data.
DATA_DIR = "/kaggle/input/recipe-for-rating-predict-food-ratings-using-ml"

# Labelled rows (the 'Rating' column is the prediction target further down).
train_dataset = pd.read_csv(f"{DATA_DIR}/train.csv")
# Rows for which ratings are predicted and submitted.
test_dataset = pd.read_csv(f"{DATA_DIR}/test.csv")
# Presumably the sample submission file; only inspected in this notebook.
sample_dataset = pd.read_csv(f"{DATA_DIR}/sample.csv")

# **Data samples and statistics**

In [None]:
# Number of fully duplicated rows in the train and test frames, shown as a (train, test) tuple.
train_dataset.duplicated().sum(), test_dataset.duplicated().sum()

In [None]:
train_dataset.shape, test_dataset.shape, sample_dataset.shape

In [None]:
train_dataset.head()

In [None]:
test_dataset.head()

In [None]:
sample_dataset.head()

In [None]:
train_dataset.describe()

In [None]:
test_dataset.describe()

**Correlation graph between 'RecipeNumber' and 'ThumbsUpCount'**
* Here is a scatter plot between 'RecipeNumber' and 'ThumbsUpCount', with 'RecipeNumber' on the x-axis and 'ThumbsUpCount' on the y-axis. The scatter plot will also include histograms of the individual distributions along the axes.

In [None]:
import seaborn as sns
from scipy.stats import pearsonr

# Draw the scatter plot with marginal histograms and keep the returned grid,
# so the annotation below can target the central (joint) axes explicitly.
joint_grid = sns.jointplot(x='RecipeNumber', y='ThumbsUpCount', data=train_dataset, kind='scatter', color='skyblue')

# Pearson correlation coefficient (the p-value returned alongside it is not used)
corr_coefficient, _ = pearsonr(train_dataset['RecipeNumber'], train_dataset['ThumbsUpCount'])

# Annotate the joint axes directly rather than relying on whichever axes
# pyplot currently treats as active; the figure holds three axes.
joint_grid.ax_joint.annotate(f'Pearson Correlation: {corr_coefficient:.2f}', xy=(0.5, 0.9), xycoords='axes fraction', ha='center', fontsize=12)

# Show the plot
plt.show()


**Correlation graph between 'RecipeNumber' and 'ThumbsDownCount'**
* Here is a scatter plot between 'RecipeNumber' and 'ThumbsDownCount', with 'RecipeNumber' on the x-axis and 'ThumbsDownCount' on the y-axis. The scatter plot will also include histograms of the individual distributions along the axes.

In [None]:
import seaborn as sns
from scipy.stats import pearsonr

# Draw the scatter plot with marginal histograms and keep the returned grid,
# so the annotation below can target the central (joint) axes explicitly.
joint_grid = sns.jointplot(x='RecipeNumber', y='ThumbsDownCount', data=train_dataset, kind='scatter', color='skyblue')

# Pearson correlation coefficient (the p-value returned alongside it is not used)
corr_coefficient, _ = pearsonr(train_dataset['RecipeNumber'], train_dataset['ThumbsDownCount'])

# Annotate the joint axes directly rather than relying on whichever axes
# pyplot currently treats as active; the figure holds three axes.
joint_grid.ax_joint.annotate(f'Pearson Correlation: {corr_coefficient:.2f}', xy=(0.5, 0.9), xycoords='axes fraction', ha='center', fontsize=12)

# Show the plot
plt.show()


In [None]:
train_dataset.info()

In [None]:
test_dataset.info()

# **Feature engineering/extraction**

In [None]:
train_dataset.isna().any()

In [None]:
train_dataset.isna().sum()

Here we assume that there is a direct mapping between Rating and Recipe_Review, and therefore we fill the empty Recipe_Review fields with the Review sentiments mapped to ratings in the rating_mapping dictionary.

In [None]:
def fill_null_reviews(train_dataset):
    """Fill missing 'Recipe_Review' values with a placeholder word derived from 'Rating'.

    The frame is modified in place: only rows whose 'Recipe_Review' is null are
    written, and each receives the word mapped to that row's 'Rating'. Ratings
    that are not in the mapping leave the review null.

    Parameters
    ----------
    train_dataset : pandas.DataFrame
        Frame with 'Recipe_Review' and 'Rating' columns.

    Returns
    -------
    pandas.DataFrame
        The same frame object that was passed in.
    """
    sentiment_by_rating = {
        0: 'VERY BAD',
        1: 'BAD',
        2: 'BORING',
        3: 'OK',
        4: 'GOOD',
        5: 'GREAT',
    }
    review_is_missing = train_dataset['Recipe_Review'].isna()
    placeholder_reviews = train_dataset['Rating'].map(sentiment_by_rating)
    # Index alignment means only the masked rows pick up their placeholder.
    train_dataset.loc[review_is_missing, 'Recipe_Review'] = placeholder_reviews
    return train_dataset

# fill_null_reviews writes into train_dataset in place and returns the same frame,
# so this cell both applies the fill and displays the resulting frame.
fill_null_reviews(train_dataset)

In [None]:
train_dataset.isna().sum()

In [None]:
test_dataset.isna().sum()

In [None]:
# Per-column count of cells equal to the literal string '?'
# (presumably checking for '?' used as a missing-value placeholder).
train_dataset.isin(['?']).sum()

In [None]:
# Per-column count of cells equal to the literal string '?'
# (presumably checking for '?' used as a missing-value placeholder).
test_dataset.isin(['?']).sum()

In [None]:
train_dataset['UserReputation'].value_counts()

From the box plot we can see that most of the users have a UserReputation around 0-100.

In [None]:
# Horizontal box plot of the UserReputation column on an explicit figure/axes pair.
reputation_fig, reputation_ax = plt.subplots(figsize=(12, 5))
sns.boxplot(x='UserReputation', data=train_dataset, ax=reputation_ax)
reputation_ax.set_xlabel('User Reputation')
reputation_ax.set_ylabel('count')
reputation_ax.set_title('Box plot of User Reputation')
plt.show()

In [None]:
train_dataset['Rating'].value_counts()

From the bar graph we can see that most recipes (10371 recipes) have received a Rating of 5, followed by 0 and then 4.

In [None]:
# Bar chart of how many training rows carry each Rating value.
rating_fig, rating_ax = plt.subplots(figsize=(12, 5))
sns.countplot(x='Rating', data=train_dataset, palette='coolwarm', ax=rating_ax)
rating_ax.set(xlabel='Rating', ylabel='Count', title='Distribution of Ratings')
plt.show()

In [None]:
train_dataset['BestScore'].value_counts()

From the histogram we can see that most recipes (10506 recipes) have received the BestScore of 100, followed by a BestScore of 193. 

In [None]:
# 20-bin histogram of the BestScore column with a background grid.
score_fig, score_ax = plt.subplots(figsize=(10, 6))
score_ax.hist(train_dataset['BestScore'], bins=20, color='skyblue', edgecolor='black')
score_ax.set_xlabel('Best Score')
score_ax.set_ylabel('Frequency')
score_ax.set_title('Histogram of Best Score')
score_ax.grid(True)
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

Calculating the correlation matrix between the selected columns of the DataFrame train_dataset and then visualizing the correlations using a heatmap. 

The highest correlation is between BestScore and ThumbsUpCount with a correlation coefficient of 0.69.

Whereas the strongest negative correlation is between ID and RecipeNumber, with a correlation coefficient of -0.49.

In [None]:
# Columns whose pairwise correlations are shown in the heatmap.
corr_columns = [
    'ID', 'RecipeNumber', 'RecipeCode', 'UserReputation', 'CreationTimestamp',
    'ReplyCount', 'ThumbsUpCount', 'ThumbsDownCount', 'Rating', 'BestScore',
]
correlation = train_dataset[corr_columns].corr()

# Annotated heatmap of the correlation matrix.
heatmap_fig, heatmap_ax = plt.subplots(figsize=(12, 7))
sns.heatmap(correlation, annot=True, ax=heatmap_ax)

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [None]:
# Split the column names by dtype: object columns are treated as categorical,
# int/float columns as numeric.
object_columns = train_dataset.select_dtypes(include=['object'])
numeric_columns = train_dataset.select_dtypes(include=['int', 'float'])
cat_features = object_columns.columns
num_features = numeric_columns.columns

In [None]:
cat_features

In [None]:
num_features

**Plotting all numerical attributes with histogram plot for quick examination.**
* Features are at different scales.
* Features have different distributions -
 * A few are tail heavy. e.g. ID, BestScore
 * A few have a single mode. e.g. UserReputation, CreationTimestamp, ReplyCount, ThumbsUpCount, ThumbsDownCount

In [None]:
# Keep only the columns to be plotted: drop the text/identifier columns and the target.
plot_data = train_dataset.drop(columns=['RecipeName', 'CommentID', 'UserID','UserName', 'Recipe_Review','Rating'])

# Size the grid from the number of columns actually present, so the cell does
# not fail with more than 9 columns or leave empty panels with fewer.
n_grid_cols = 3
n_features = len(plot_data.columns)
n_grid_rows = max(1, -(-n_features // n_grid_cols))  # ceiling division
fig, axes = plt.subplots(
    n_grid_rows,
    n_grid_cols,
    figsize=(16, 16 * n_grid_rows / 3),  # 16 x 16 for the usual 3 rows
    squeeze=False,  # always a 2-D array of axes, even for a single row
)
flat_axes = axes.ravel()

# One histogram per column, filled row by row
for ax, column in zip(flat_axes, plot_data.columns):
    ax.hist(plot_data[column], bins=20, color='skyblue', alpha=0.7)
    ax.set_title(column)
    ax.set_xlabel(column)
    ax.set_ylabel("Frequency")

# Hide any panels left over when the column count is not a multiple of 3
for unused_ax in flat_axes[n_features:]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# Preprocessing building blocks:
#   numeric columns     -> rescaled with MinMaxScaler
#   categorical columns -> one-hot encoded, with handle_unknown='ignore' so
#                          categories unseen at fit time do not raise
numeric_transformer = Pipeline([('scaler', MinMaxScaler())])
categorical_transformer = Pipeline([('onehot', OneHotEncoder(handle_unknown='ignore'))])

In [None]:
# Column-wise preprocessor: route the numeric and categorical column groups
# to their respective pipelines.
column_pipelines = [
    ('num', numeric_transformer, num_features),
    ('cat', categorical_transformer, cat_features),
]
CT = ColumnTransformer(transformers=column_pipelines)

In [None]:
# Fit the preprocessor on the training frame and wrap its output in a DataFrame.
# One-hot encoding the text columns typically makes ColumnTransformer return a
# SciPy sparse matrix. Passing that straight to pd.DataFrame() does not give
# one column per feature, so the sparse case goes through
# DataFrame.sparse.from_spmatrix instead.
# NOTE(review): num_features is built from every int/float column of
# train_dataset, which appears to include the 'Rating' target. Confirm whether
# it should be excluded before this output is used for modelling.
from scipy import sparse

transformed_matrix = CT.fit_transform(train_dataset)
if sparse.issparse(transformed_matrix):
    train_dataset_transformed = pd.DataFrame.sparse.from_spmatrix(transformed_matrix)
else:
    train_dataset_transformed = pd.DataFrame(transformed_matrix)

In [None]:
train_dataset_transformed.head()

In [None]:
# import re
# from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
# from sklearn.feature_extraction.text import TfidfVectorizer

# # Define function to clean and preprocess text
# def preprocess_text(text):
#     # Convert text to lowercase
#     text = text.lower()
    
#     # Remove HTML tags
#     text = re.sub('<[^<]+?>', '', text)
    
#     # Remove non-alphanumeric characters and extra whitespaces
#     text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    
#     # Remove stopwords
#     stop_words = set(ENGLISH_STOP_WORDS)
#     tokens = text.split()
#     tokens = [token for token in tokens if token not in stop_words]
#     text = ' '.join(tokens)
    
#     return text

# # Apply preprocessing to the Recipe_Review column
# train_dataset['Recipe_Review'] = train_dataset['Recipe_Review'].apply(preprocess_text)


In [None]:
# Separate the target column from the remaining columns, then preview the latter.
y = train_dataset['Rating']
X = train_dataset.drop(columns=['Rating'])
X.head()

In [None]:
y

TF-IDF(Term Frequency-Inverse Document Frequency) Vectorizer and Count Vectorizer are popular techniques used in natural language processing (NLP) to convert text data into numerical representations suitable for machine learning algorithms.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

Here, we initialize a CountVectorizer object named vec_username to convert the 'UserName' column into a matrix of token counts. It analyzes the text data to determine the unique words (tokens) present in the 'UserName' column and assigns an index to each unique word.

In [None]:
# Token-count encoding of 'UserName': the vocabulary is learned on the training
# rows only and then applied unchanged to the test rows.
vec_username = CountVectorizer()
X_train_username = vec_username.fit_transform(train_dataset['UserName'].values)
X_test_username = vec_username.transform(test_dataset['UserName'].values)

# Sanity check: the train matrix and the target must have the same number of rows.
print(X_train_username.shape, y.shape)
print(X_test_username.shape)

This code is similar to the previous one but applied to the 'RecipeName' column instead of the 'UserName' column. It analyzes the text data to determine the unique words (tokens) present in the 'RecipeName' column and assigns an index to each unique word.

In [None]:
# Token-count encoding of 'RecipeName': the vocabulary is learned on the training
# rows only and then applied unchanged to the test rows.
vec = CountVectorizer()
X_train_recipe = vec.fit_transform(train_dataset['RecipeName'].values)
X_test_recipe = vec.transform(test_dataset['RecipeName'].values)

# Sanity check: the train matrix and the target must have the same number of rows.
print(X_train_recipe.shape, y.shape)
print(X_test_recipe.shape)

Here we transform the 'Recipe_Review' column of the training and test datasets into a matrix of TF-IDF features, i.e. weights that are high for terms (here 1- to 4-word n-grams) that occur often in a given review but rarely across the other reviews.

This code prepares the text data in the 'Recipe_Review' column for machine learning models by converting it into a numerical representation using TF-IDF features, which can then be used for training and testing classification models.

In [None]:
# TF-IDF encoding of the review text.
#   min_df=5            -> ignore terms that occur in fewer than 5 reviews
#   ngram_range=(1, 4)  -> use 1- to 4-word n-grams
#   max_features=10000  -> cap the vocabulary size
vec_rr = TfidfVectorizer(min_df=5,ngram_range=(1,4), max_features=10000 )

# The vectorizer raises on NaN documents, and this column is known to contain
# nulls in the training data. Replace any remaining nulls with an empty string
# so a missing review becomes an all-zero row instead of stopping the run.
train_reviews = train_dataset['Recipe_Review'].fillna('').values
test_reviews = test_dataset['Recipe_Review'].fillna('').values

# The vocabulary and IDF weights are learned on the training reviews only.
vec_rr.fit(train_reviews)

X_train_rr = vec_rr.transform(train_reviews)
X_test_rr = vec_rr.transform(test_reviews)

print(X_train_rr.shape, y.shape)
print(X_test_rr.shape)

In [None]:
# X_train_rr.columns

In [None]:
# from sklearn.feature_selection import SelectKBest, chi2

# # Create SelectKBest object
# selector = SelectKBest(score_func=chi2, k=800)

# # Fit selector to training data
# selector.fit(X_train_rr, y)

# # Transform training and testing data
# X_train_rr = selector.transform(X_train_rr)
# X_test_rr = selector.transform(X_test_rr)

# print(X_train_rr.shape, y.shape)
# print(X_test_rr.shape)

Horizontally stacking sparse matrices X_train_username, X_train_recipe, and X_train_rr into a single sparse matrix X_tr for the training data.

Similarly stacking X_test_username, X_test_recipe, and X_test_rr into X_te.

In [None]:
from scipy.sparse import hstack

# Concatenate the three text encodings column-wise into one feature matrix per split.
# hstack returns COO format by default, which does not support row indexing;
# requesting CSR gives the row-sliceable layout that cross-validation
# splitting and the estimators work with.
X_tr = hstack((X_train_username, X_train_recipe, X_train_rr), format='csr')
X_te = hstack((X_test_username, X_test_recipe, X_test_rr), format='csr')

# Train and test must end up with the same number of columns.
print(X_tr.shape, y.shape)
print(X_te.shape)

In [None]:
# Display the reprs of the stacked train and test feature matrices.
X_tr, X_te

# **Dummy Classifier**
**Score: 0.76066**

In [None]:
# df = pd.read_csv('/kaggle/input/recipe-for-rating-predict-food-ratings-using-ml/train.csv')
# X = df.drop("Rating", axis = 1)
# y = df['Rating']

# from sklearn.dummy import DummyClassifier
# model = DummyClassifier(strategy = "most_frequent").fit(X,y)

# X_test = pd.read_csv('/kaggle/input/recipe-for-rating-predict-food-ratings-using-ml/test.csv')
# y_pred = model.predict(X_test)

# submission = pd.DataFrame({ 'ID': range(1,4547),
#                             'total_amount': y_pred})

# submission.to_csv('submission.csv', index = False) # converting it to csv file

# **Random Forest Classifier**
**Score: 0.77298**

In [None]:
# from sklearn.ensemble import RandomForestClassifier
# r=RandomForestClassifier()
# r.fit(X_tr, y)

In [None]:
# y_pred = r.predict(X_tr)
# y_test_pred=r.predict(X_te)

# **CART: Decision Tree Classifier**
**Score: 0.76022**

In [None]:
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.model_selection import GridSearchCV

# # Create Decision Tree classifier
# cart = DecisionTreeClassifier()

# # Define hyperparameters and their values to tune
# param_grid = {
#     'criterion': ['gini', 'entropy'],    # Splitting criterion
#     'max_depth': [None, 10, 20, 30],     # Maximum depth of the tree
#     'min_samples_split': [2, 5, 10],     # Minimum number of samples required to split an internal node
#     'min_samples_leaf': [1, 2, 4],       # Minimum number of samples required to be at a leaf node
#     'max_features': ['auto', 'sqrt', 'log2']  # Number of features to consider when looking for the best split
# }

# # Perform grid search cross-validation
# grid_search = GridSearchCV(estimator=cart, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# # Fit the grid search to the training data
# grid_search.fit(X_tr, y)

# # Print the best hyperparameters found
# print("Best hyperparameters:", grid_search.best_params_)

# # Use the best model found by GridSearchCV
# best_cart = grid_search.best_estimator_

# # Make predictions on the training and testing sets using the best model
# y_pred_train = best_cart.predict(X_tr)
# y_pred_test = best_cart.predict(X_te)


# **Bagging**
**Score: 0.77562**

In [None]:
# from sklearn.ensemble import BaggingClassifier
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.model_selection import RandomizedSearchCV
# from scipy.stats import uniform, randint

# # Create base classifier (Decision Tree in this case)
# base_classifier = DecisionTreeClassifier()

# # Create Bagging Classifier
# bagging_classifier = BaggingClassifier(base_estimator=base_classifier)

# # Define hyperparameter distributions
# param_dist = {
#     'n_estimators': randint(10, 100),        # Randomly sample number of estimators
#     'max_samples': uniform(0.5, 0.5),        # Uniform distribution for max samples
#     'max_features': uniform(0.5, 0.5),       # Uniform distribution for max features
#     'bootstrap': [True, False]
# }

# # Perform randomized search cross-validation
# random_search_bagging = RandomizedSearchCV(estimator=bagging_classifier, param_distributions=param_dist, n_iter=10, cv=3, scoring='accuracy', n_jobs=-1, random_state=42)

# # Fit the randomized search to the training data
# random_search_bagging.fit(X_tr, y)

# # Print the best hyperparameters found
# print("Best hyperparameters for Bagging:", random_search_bagging.best_params_)
# # Best hyperparameters for Bagging: {'bootstrap': True, 'max_features': 0.5102922471479012, 'max_samples': 0.9849549260809971, 'n_estimators': 39}

# # Use the best model found by RandomizedSearchCV
# best_bagging = random_search_bagging.best_estimator_

# # Make predictions on the training and testing sets using the best model
# y_pred_train = best_bagging.predict(X_tr)
# y_pred_test = best_bagging.predict(X_te)

# **Boosting**
**Score: 0.68741**

In [None]:
# from sklearn.ensemble import AdaBoostClassifier
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.model_selection import GridSearchCV

# # Create base classifier (Decision Tree in this case)
# base_classifier = DecisionTreeClassifier()

# # Create AdaBoost Classifier
# adaboost_classifier = AdaBoostClassifier(base_classifier)

# # Define hyperparameters and their values to tune
# param_grid = {
#     'n_estimators': [50, 100, 200],  # Number of weak learners
#     'learning_rate': [0.1, 0.5, 1.0],  # Weight applied to each weak learner
#     'algorithm': ['SAMME', 'SAMME.R']  # Algorithm used for boosting
# }

# # Perform grid search cross-validation
# grid_search_adaboost = GridSearchCV(estimator=adaboost_classifier, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# # Fit the grid search to the training data
# grid_search_adaboost.fit(X_tr, y)

# # Print the best hyperparameters found
# print("Best hyperparameters for AdaBoost:", grid_search_adaboost.best_params_)
# # Best hyperparameters for AdaBoost: {'algorithm': 'SAMME', 'learning_rate': 1.0, 'n_estimators': 200}


# # Use the best model found by GridSearchCV for AdaBoost
# best_adaboost = grid_search_adaboost.best_estimator_

# # Make predictions on the training and testing sets using the best AdaBoost model
# y_pred_train = best_adaboost.predict(X_tr)
# y_pred_test = best_adaboost.predict(X_te)


# **2nd Highest Score: MLP Classifier with Random Search**
**Score: 0.78970**

MLP **(Multi-Layer Perceptron)** Classifier is well suited to datasets that combine **numerical and categorical features**. In this project, the predicted Rating depends heavily on the text feature Recipe_Review, which reflects the **user's sentiments**. Being a **neural network**, an MLP classifier is also very capable of modelling **non-linear relationships**; here, UserReputation, ThumbsUpCount, ThumbsDownCount and Recipe_Review could jointly affect the Rating in a non-linear way. Note that the feature matrix used below (X_tr) is built only from the vectorized UserName, RecipeName and Recipe_Review columns.

In [None]:
# from sklearn.neural_network import MLPClassifier
# from sklearn.model_selection import RandomizedSearchCV
# from scipy.stats import randint

# # Create MLP classifier
# mlp = MLPClassifier()

# # Define hyperparameters and their values to tune
# param_dist = {
#     'hidden_layer_sizes': [(50,), (100,), (150,)],  # The number of neurons in each hidden layer.
#     'activation': ['relu', 'tanh'],  # The function applied to the output of each neuron in the hidden layers.
#     'solver': ['adam'],  # The optimization algorithm used to update the weights of the connections.
#     'alpha': [0.0001, 0.001],  # Regularization Parameter: alpha(L2 regularization) to prevent overfitting.
#     'learning_rate': ['constant']  # Determines the step size during weight updates.
# }
# # Neurons in hidden layers usually apply non-linear activation functions such as -
# # ReLU (Rectified Linear Unit), tanh (Hyperbolic Tangent), or sigmoid. They introduce non-linearity to the output
# # of individual neurons to learn complex relationships and patterns in the data.

# n_iter_search = 20

# # Perform randomized search cross-validation
# random_search_mlp = RandomizedSearchCV(estimator=mlp, param_distributions=param_dist, n_iter=n_iter_search, cv=5, scoring='accuracy', n_jobs=-1)

# # Fit the randomized search to the training data
# random_search_mlp.fit(X_tr, y)

# # Print the best hyperparameters found
# print("Best hyperparameters for MLP:", random_search_mlp.best_params_)
# # Best hyperparameters for MLP: {'solver': 'adam', 'learning_rate': 'constant', 'hidden_layer_sizes': (150,), 'alpha': 0.001, 'activation': 'relu'}

# # Use the best model found by RandomizedSearchCV for MLP
# best_mlp = random_search_mlp.best_estimator_

# # Make predictions on the training and testing sets using the best MLP model
# y_pred_train = best_mlp.predict(X_tr)
# y_pred_test = best_mlp.predict(X_te)


# **1st Highest Score: MLP Classifier with Grid Search**
**Score: 0.79014**

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from scipy.stats import randint

# Fixed seed so the MLP's weight initialisation and batch shuffling are
# repeatable. That makes the CV score and the submitted predictions the same
# on every "Restart & Run All".
RANDOM_STATE = 42

# Define hyperparameters and their values to tune.
# Each list holds a single value (the best setting found by the earlier
# randomized search), so the grid search here acts as a 5-fold evaluation of
# that one configuration followed by a refit on all training rows.
param_grid = {
    'hidden_layer_sizes': [(150,)],  # The number of neurons in each hidden layer.
    'activation': ['relu'],  # The function applied to the output of each neuron in the hidden layers.
    'solver': ['adam'],  # The optimization algorithm used to update the weights of the connections.
    'alpha': [0.001],  # Regularization Parameter: alpha(L2 regularization) to prevent overfitting.
    'learning_rate': ['constant']  # Determines the step size during weight updates.
}

# Create MLP classifier
mlp = MLPClassifier(random_state=RANDOM_STATE)

# Perform grid search cross-validation
grid_search_mlp = GridSearchCV(estimator=mlp, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit the grid search to the training data
grid_search_mlp.fit(X_tr, y)

# Print the best hyperparameters found and the cross-validated accuracy they achieved
print("Best hyperparameters for MLP:", grid_search_mlp.best_params_)
# Best hyperparameters for MLP: {'activation': 'relu', 'alpha': 0.001, 'hidden_layer_sizes': (150,), 'learning_rate': 'constant', 'solver': 'adam'}
print(f"Mean cross-validated accuracy: {grid_search_mlp.best_score_:.5f}")

# Use the best model found by GridSearchCV for MLP (refit on the full training set)
best_mlp = grid_search_mlp.best_estimator_

# Make predictions on the training and testing sets using the best MLP model
y_pred_train = best_mlp.predict(X_tr)
y_pred_test = best_mlp.predict(X_te)

# Training accuracy, for comparison with the cross-validated figure above;
# a large gap between the two indicates overfitting.
print(f"Training accuracy: {accuracy_score(y, y_pred_train):.5f}")

# **KNN**
**Score: 0.76572**

In [None]:
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.model_selection import GridSearchCV

# # Create KNN classifier
# knn = KNeighborsClassifier()

# # Define hyperparameters and their values to tune
# param_grid = {
#     'n_neighbors': [3, 5, 7, 10],           # Number of neighbors
#     'weights': ['uniform', 'distance'],     # Weight function used in prediction
#     'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],  # Algorithm used to compute the nearest neighbors
#     'p': [1, 2]                              # Power parameter for the Minkowski metric
# }

# # Perform grid search cross-validation
# grid_search = GridSearchCV(estimator=knn, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# # Fit the grid search to the training data
# grid_search.fit(X_tr, y)

# # Print the best hyperparameters found
# print("Best hyperparameters:", grid_search.best_params_)

# # Use the best model found by GridSearchCV
# best_knn = grid_search.best_estimator_

# # Make predictions on the training and testing sets using the best model
# y_pred_train = best_knn.predict(X_tr)
# y_pred_test = best_knn.predict(X_te)


# **4th Highest Score: SVM**
**Score: 0.7853**

In [None]:
# from sklearn.svm import SVC
# from sklearn.model_selection import GridSearchCV

# # Create SVC classifier
# svm = SVC()

# # Define hyperparameters and their values to tune
# param_grid = {
#     'C': [10],               # Regularization parameter (inverse of the regularization strength)
# #                                             High value of C gives a small margin and few training misclassifications (risk of overfitting)
# #                                             Small value of C gives a large margin and tolerates more training misclassifications.
#     'kernel': ['rbf'],  # RBF (Gaussian) kernel: similarity decays with the squared distance between samples,
# #                                             which gives a non-linear decision boundary.
#     'gamma': ['scale'],            # Kernel coefficient for 'rbf', 'poly', 'sigmoid'
#     'degree': [2]
# }
# # The linear kernel computes the dot product between the input features and is suitable for linearly separable datasets.
# # The polynomial kernel: polynomial of the dot product of their feature vectors.It introduces nonlinearity
# #                    into the decision boundary, allowing SVMs to capture more complex relationships in the data.
# # The RBF kernel, also known as the Gaussian kernel, computes the similarity between samples based on the 
# #                      Gaussian (radial basis) function.
    
    
# # Perform grid search cross-validation
# grid_search = GridSearchCV(estimator=svm, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# # Fit the grid search to the training data
# grid_search.fit(X_tr, y)

# # Print the best hyperparameters found
# print("Best hyperparameters:", grid_search.best_params_)
# Best hyperparameters: {'C': 10, 'degree': 2, 'gamma': 'scale', 'kernel': 'rbf'}

# # Use the best model found by GridSearchCV
# best_svm = grid_search.best_estimator_

# # Make predictions on the training and testing sets using the best model
# y_pred_train = best_svm.predict(X_tr)
# y_pred_test = best_svm.predict(X_te)


# **XGBClassifier**
**Score: 0.7809**

In [None]:
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.model_selection import train_test_split, GridSearchCV
# from xgboost import XGBClassifier

# # Fill missing values
# train_dataset = train_dataset.fillna('')

# # Split the dataset
# X = train_dataset["Recipe_Review"]
# y = train_dataset["Rating"]
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Vectorize the text data
# vectorizer = TfidfVectorizer()

# X_train_vectorized = vectorizer.fit_transform(X_train)
# X_test_vectorized = vectorizer.transform(X_test)

# # Define the parameter grid for hyperparameter tuning
# param_grid = {
#     'n_estimators': [100, 200, 300],
#     'learning_rate': [0.01, 0.1, 0.2],
#     'max_depth': [3, 4, 5]
# }

# # Initialize the XGBoost classifier
# xgb_model = XGBClassifier()

# # Perform grid search cross-validation
# grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=5, scoring='accuracy')

# # Fit the grid search to the training data
# # grid_search.fit(X_train_vectorized, y_train)
# grid_search.fit(X_tr, y)

# # Print the best hyperparameters found
# print("Best hyperparameters:", grid_search.best_params_)

# # Use the best model found by GridSearchCV
# best_xgb_model = grid_search.best_estimator_

# # Make predictions on the training and testing sets using the best model
# # y_pred_train = best_xgb_model.predict(X_train_vectorized)
# # y_pred_test = best_xgb_model.predict(X_test_vectorized)
# y_pred_train = best_xgb_model.predict(X_tr)
# y_pred_test = best_xgb_model.predict(X_te)

# # xgb_model.fit(X_train, y_train)
# # y_pred = xgb_model.predict(X_test)

# **3rd Highest Score : Logistic Regression**
**Score: 0.78860**

In [None]:
# from sklearn.linear_model import LogisticRegression
# from sklearn.model_selection import GridSearchCV

# LR = LogisticRegression(max_iter=1000)

# # Define hyperparameters and their values to tune
# param_grid = {
#     'penalty': ['l1', 'l2'],  # Regularization penalty
#     'C': [0.001, 0.01, 0.1, 1, 10, 100],  # Inverse of regularization strength
#     'solver': ['liblinear', 'saga']  # Optimization algorithm
# }

# # Perform grid search cross-validation
# grid_search = GridSearchCV(estimator=LR, param_grid=param_grid, cv=5, scoring='accuracy')

# # Fit the grid search to the training data
# grid_search.fit(X_tr, y)

# # Print the best hyperparameters found

# print("Best hyperparameters:", grid_search.best_params_)

# # Use the best model found by GridSearchCV
# best_LR = grid_search.best_estimator_

# # Make predictions on the training and testing sets using the best model
# y_pred_train = best_LR.predict(X_tr)
# y_pred_test = best_LR.predict(X_te)

# # LR.fit(X_tr, y)


# **LGBM Classifier**
**Score: 0.78332**

In [None]:
# from lightgbm import LGBMClassifier

# # Initialize the LightGBM classifier
# lgbm = LGBMClassifier()

# # Fit the classifier to the training data
# lgbm.fit(X_tr, y)

# # Make predictions on the training and testing sets
# y_pred_train = lgbm.predict(X_tr)
# y_pred_test = lgbm.predict(X_te)

In [None]:
# from lightgbm import LGBMClassifier
# from sklearn.model_selection import GridSearchCV

# lgbm = LGBMClassifier()

# # Define hyperparameters and their values to tune
# param_grid = {
#     'learning_rate': np.arange(0.01,0.2),
#     'n_estimators': range(50,250),
#     'max_depth': [3, 5, 10, 20, 30, 50],
#     'min_child_samples': [3, 5, 10, 20, 30, 50],
#     'subsample': [0.2, 0.4, 0.6, 0.8, 0.9],
#     'colsample_bytree': [0.2, 0.4, 0.6, 0.8, 0.9],
#     'reg_alpha': [0.0, 0.2, 0.5, 0.7, 1.0],
#     'reg_lambda': [0.0, 0.2, 0.5, 0.7, 1.0],
#     'n_jobs': [-1]
# }

# # param_grid = {
# #         'learning_rate': np.arange( 0.01, 0.2),
# #         'n_estimators': range(50, 250),
# #         'max_depth': range(3, 50),
# #         'min_child_samples': range(3, 50),
# #         'subsample': np.arange(0.2, 0.9),
# #         'colsample_bytree': np.arange(0.2, 0.9),
# #         'reg_alpha': np.arange(0.0, 1.0),
# #         'reg_lambda': np.arange(0.0, 1.0),
# #         'n_jobs': [-1]
# # }

# # Perform grid search cross-validation
# grid_search = GridSearchCV(estimator=lgbm, param_grid=param_grid, cv=5, scoring='accuracy')

# # Fit the grid search to the training data
# grid_search.fit(X_tr, y)

# # Print the best hyperparameters found

# print("Best hyperparameters:", grid_search.best_params_)

# # Use the best model found by GridSearchCV
# best_LGBM = grid_search.best_estimator_

# # Make predictions on the training and testing sets using the best model
# y_pred_train = best_LGBM.predict(X_tr)
# y_pred_test = best_LGBM.predict(X_te)

# # LR.fit(X_tr, y)

In [None]:
# Modified LGBM

# from lightgbm import LGBMClassifier
# from sklearn.model_selection import GridSearchCV

# lgbm = LGBMClassifier()

# # Define hyperparameters and their values to tune
# param_grid = {
#     'learning_rate': [0.01, 0.1, 0.2],  # Narrowed down the range
#     'n_estimators': [50, 100, 150],       # Narrowed down the range
#     'max_depth': [3, 5, 10],              # Limited the values
#     'min_child_samples': [3, 5, 10],       # Limited the values
#     'subsample': [0.2, 0.6, 0.9],         # Limited the values
#     'colsample_bytree': [0.2, 0.6, 0.9],  # Limited the values
#     'reg_alpha': [0.0, 0.5, 1.0],          # Limited the values
#     'reg_lambda': [0.0, 0.5, 1.0],         # Limited the values
#     'n_jobs': [-1]
# }

# # Perform grid search cross-validation
# grid_search = GridSearchCV(estimator=lgbm, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# # Fit the grid search to the training data
# grid_search.fit(X_tr, y)

# # Print the best hyperparameters found
# print("Best hyperparameters:", grid_search.best_params_)

# # Use the best model found by GridSearchCV
# best_LGBM = grid_search.best_estimator_

# # Make predictions on the training and testing sets using the best model
# y_pred_train = best_LGBM.predict(X_tr)
# y_pred_test = best_LGBM.predict(X_te)


In [None]:
# #Alternate Code:
# #Score: 0.78332

# from lightgbm import LGBMClassifier
# from sklearn.model_selection import RandomizedSearchCV
# from scipy.stats import randint, uniform

# lgbm = LGBMClassifier()

# # Define hyperparameters and their values to tune
# param_dist = {
#     'learning_rate': uniform(0.01, 0.19),  # Uniform distribution between 0.01 and 0.2
#     'n_estimators': randint(50, 250),       # Discrete uniform distribution between 50 and 250
#     'max_depth': randint(3, 51),            # Discrete uniform distribution between 3 and 50
#     'min_child_samples': randint(3, 51),     # Discrete uniform distribution between 3 and 50
#     'subsample': uniform(0.2, 0.7),         # Uniform distribution between 0.2 and 0.9
#     'colsample_bytree': uniform(0.2, 0.7),  # Uniform distribution between 0.2 and 0.9
#     'reg_alpha': uniform(0.0, 1.0),          # Uniform distribution between 0.0 and 1.0
#     'reg_lambda': uniform(0.0, 1.0),         # Uniform distribution between 0.0 and 1.0
#     'n_jobs': [-1]
# }

# # Perform randomized search cross-validation
# random_search = RandomizedSearchCV(estimator=lgbm, param_distributions=param_dist, n_iter=100, cv=5, scoring='accuracy', random_state=42)

# # Fit the randomized search to the training data
# random_search.fit(X_tr, y)

# # Print the best hyperparameters found
# print("Best hyperparameters:", random_search.best_params_)

# # Use the best model found by RandomizedSearchCV
# best_LGBM = random_search.best_estimator_

# # Make predictions on the training and testing sets using the best model
# y_pred_train = best_LGBM.predict(X_tr)
# y_pred_test = best_LGBM.predict(X_te)


# **Stacking Classifier**

In [None]:
# from sklearn.ensemble import StackingClassifier
# from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
# from sklearn.neural_network import MLPClassifier
# from sklearn.linear_model import LogisticRegression
# from sklearn.svm import SVC
# from scipy.stats import randint

# # Define base classifiers
# mlp = MLPClassifier()
# lr = LogisticRegression(max_iter=1000)
# svm = SVC()

# # Define hyperparameters and their values to tune for each base classifier
# param_dist_mlp = {
#     'hidden_layer_sizes': [(150,)],
#     'activation': ['relu'],
#     'solver': ['adam'],
#     'alpha': [0.001],
#     'learning_rate': ['constant']
# }

# param_dist_lr = {
#     'penalty': ['l1', 'l2'],
#     'C': [0.001, 0.01, 0.1, 1, 10, 100],
#     'solver': ['liblinear', 'saga']
# }

# param_dist_svm = {
#     'C': [10],
#     'degree': [2],
#     'kernel': ['rbf'],
#     'gamma': ['scale']
# }

# # Define the Stacking Classifier
# estimators = [
#     ('mlp', MLPClassifier()),
#     ('lr', LogisticRegression(max_iter=1000)),
#     ('svm', SVC())
# ]
# stacking_classifier = StackingClassifier(estimators=estimators, final_estimator=MLPClassifier())

# # Define hyperparameters and their values to tune for the stacking classifier
# param_dist_stacking = {
#     'final_estimator__hidden_layer_sizes': [(100,)],
#     'final_estimator__activation': ['relu'],
#     'final_estimator__solver': ['adam'],
#     'final_estimator__alpha': [0.0001],
#     'final_estimator__learning_rate': ['constant']
# }

# # Define StratifiedKFold for cross-validation
# cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# # Perform Randomized Search CV for Stacking Classifier
# random_search_stacking = RandomizedSearchCV(estimator=stacking_classifier, param_distributions=param_dist_stacking, n_iter=10, cv=cv, scoring='accuracy', n_jobs=-1)

# # Fit the randomized search to the training data
# random_search_stacking.fit(X_tr, y)

# # Print the best hyperparameters found
# print("Best hyperparameters for Stacking Classifier:", random_search_stacking.best_params_)
# # Best hyperparameters for Stacking Classifier: {'final_estimator__solver': 'adam', 'final_estimator__learning_rate': 'constant', 'final_estimator__hidden_layer_sizes': (100,), 'final_estimator__alpha': 0.0001, 'final_estimator__activation': 'relu'}

# # Use the best model found by RandomizedSearchCV for Stacking Classifier
# best_stacking = random_search_stacking.best_estimator_

# # Make predictions on the training and testing sets using the best Stacking Classifier model
# y_pred_train_stacking = best_stacking.predict(X_tr)
# y_pred_test_stacking = best_stacking.predict(X_te)


# **Submission Code**

In [None]:
# Build the submission file: one row per test prediction, with IDs numbered
# 1..N in the order the predictions were produced.
# NOTE(review): the prediction column is named "total_amount" although the
# values are predicted ratings. Confirm the expected header against sample.csv
# before changing it; the name is left as-is here.
submission_ids = range(1, len(y_pred_test) + 1)
final_pred = pd.DataFrame({"ID": submission_ids, "total_amount": y_pred_test})
final_pred.to_csv("submission.csv", index=False)