In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
import pickle


In [119]:

df = pd.read_csv("indian_food.csv")


In [120]:
df = df.replace('-1', 'unknown')

In [121]:
df['recipe_category'] = df['course'] + ' ' + df['flavor_profile'] + ' ' + df['diet']


In [122]:
df.head()

Unnamed: 0,name,ingredients,diet,prep_time,cook_time,flavor_profile,course,state,region,recipe_category
0,Balu shahi,"Maida flour, yogurt, oil, sugar",vegetarian,45,25,sweet,dessert,West Bengal,East,dessert sweet vegetarian
1,Boondi,"Gram flour, ghee, sugar",vegetarian,80,30,sweet,dessert,Rajasthan,West,dessert sweet vegetarian
2,Gajar ka halwa,"Carrots, milk, sugar, ghee, cashews, raisins",vegetarian,15,60,sweet,dessert,Punjab,North,dessert sweet vegetarian
3,Ghevar,"Flour, ghee, kewra, milk, clarified butter, su...",vegetarian,15,30,sweet,dessert,Rajasthan,West,dessert sweet vegetarian
4,Gulab jamun,"Milk powder, plain flour, baking powder, ghee,...",vegetarian,15,40,sweet,dessert,West Bengal,East,dessert sweet vegetarian


In [123]:
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(df['ingredients'])

In [124]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['recipe_category'])

In [125]:
X_train, X_test, y_train, y_test = train_test_split(X, df['recipe_category'], test_size=0.2, random_state=42)

clf = RandomForestClassifier()
clf.fit(X_train, y_train)


In [126]:
y_pred=clf.predict(X_test)

In [127]:
score=accuracy_score(y_test,y_pred)

In [128]:
score

0.6078431372549019

In [129]:
df['recipe_category'].value_counts()


recipe_category
dessert sweet vegetarian              85
main course spicy vegetarian          72
snack spicy vegetarian                35
main course spicy non vegetarian      24
main course unknown vegetarian        23
main course sweet vegetarian           3
main course bitter vegetarian          3
snack unknown vegetarian               3
main course unknown non vegetarian     3
starter spicy non vegetarian           2
main course sour vegetarian            1
snack bitter vegetarian                1
Name: count, dtype: int64

In [130]:
best_accuracy = 0
best_k = 0
for k in range(1, 20):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_k = k

print(f'Best K: {best_k} with Accuracy: {best_accuracy}')

Best K: 13 with Accuracy: 0.6666666666666666


In [131]:
knn = KNeighborsClassifier(n_neighbors=best_k)
knn.fit(X_train, y_train)

In [132]:
y_pred = knn.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

In [133]:
accuracy

0.6666666666666666

In [134]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)
accuracy_lr = accuracy_score(y_test, y_pred_lr)
print("Logistic Regression Accuracy:", accuracy_lr)


Logistic Regression Accuracy: 0.6274509803921569


In [135]:
from sklearn.svm import SVC

svm = SVC(kernel='rbf') 
svm.fit(X_train, y_train)
y_pred_svm = svm.predict(X_test)
accuracy_svm = accuracy_score(y_test, y_pred_svm)
print("SVM Accuracy:", accuracy_svm)


SVM Accuracy: 0.6078431372549019


In [136]:
from sklearn.ensemble import GradientBoostingClassifier

gb = GradientBoostingClassifier(n_estimators=100, random_state=42)
gb.fit(X_train, y_train)
y_pred_gb = gb.predict(X_test)
accuracy_gb = accuracy_score(y_test, y_pred_gb)
print("Gradient Boosting Accuracy:", accuracy_gb)

Gradient Boosting Accuracy: 0.5882352941176471


In [137]:
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000, random_state=42)
mlp.fit(X_train, y_train)
y_pred_mlp = mlp.predict(X_test)
accuracy_mlp = accuracy_score(y_test, y_pred_mlp)
print("Neural Network Accuracy:", accuracy_mlp)


Neural Network Accuracy: 0.6078431372549019


In [138]:
from sklearn.model_selection import cross_val_score


rf = RandomForestClassifier(n_estimators=100, random_state=42)
cv_scores_rf = cross_val_score(rf, X_train, y_train, cv=5, scoring='accuracy')

print("Random Forest Cross-Validation Scores:", cv_scores_rf)
print("Mean Cross-Validation Accuracy:", cv_scores_rf.mean())




Random Forest Cross-Validation Scores: [0.58536585 0.56097561 0.65853659 0.51219512 0.625     ]
Mean Cross-Validation Accuracy: 0.5884146341463415


In [139]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(max_iter=1000)

cv_scores_lr = cross_val_score(lr, X_train, y_train, cv=5, scoring='accuracy')

print("Logistic Regression Cross-Validation Scores:", cv_scores_lr)
print("Mean Cross-Validation Accuracy:", cv_scores_lr.mean())


Logistic Regression Cross-Validation Scores: [0.51219512 0.58536585 0.56097561 0.53658537 0.525     ]
Mean Cross-Validation Accuracy: 0.5440243902439025




In [140]:
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(df['ingredients'])
print(len(vectorizer.get_feature_names_out())) 

331


In [141]:
knn = KNeighborsClassifier(n_neighbors=best_k)
knn.fit(X_train, y_train)

In [142]:
y_pred = knn.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

In [143]:
accuracy

0.6666666666666666

In [144]:
with open('model/knn_model.pkl', 'wb') as f:
    pickle.dump(knn, f)


In [145]:
with open('model/tfidf_vectorizer.pkl','wb') as f:
    pickle.dump(vectorizer,f)


In [146]:
df = df.replace('-1', 'unknown')

In [147]:
df.to_csv('final_indian_food.csv')