### Rui Alexandre Tapadinhas

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from imblearn.combine import SMOTETomek
from sklearn.decomposition import PCA


In [2]:

# Load the dataset from the JSON file
df_train = pd.read_json('dataset/train.json')

# Create a column with all ingredients as a single string
df_train['all_ingredients'] = df_train['ingredients'].apply(lambda x: ' '.join(x))

df_train


Unnamed: 0,id,cuisine,ingredients,all_ingredients
0,10259,greek,"[romaine lettuce, black olives, grape tomatoes...",romaine lettuce black olives grape tomatoes ga...
1,25693,southern_us,"[plain flour, ground pepper, salt, tomatoes, g...",plain flour ground pepper salt tomatoes ground...
2,20130,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g...",eggs pepper salt mayonaise cooking oil green c...
3,22213,indian,"[water, vegetable oil, wheat, salt]",water vegetable oil wheat salt
4,13162,indian,"[black pepper, shallots, cornflour, cayenne pe...",black pepper shallots cornflour cayenne pepper...
...,...,...,...,...
39769,29109,irish,"[light brown sugar, granulated sugar, butter, ...",light brown sugar granulated sugar butter warm...
39770,11462,italian,"[KRAFT Zesty Italian Dressing, purple onion, b...",KRAFT Zesty Italian Dressing purple onion broc...
39771,2238,irish,"[eggs, citrus fruit, raisins, sourdough starte...",eggs citrus fruit raisins sourdough starter fl...
39772,41882,chinese,"[boneless chicken skinless thigh, minced garli...",boneless chicken skinless thigh minced garlic ...


In [3]:
class_counts = df_train['cuisine'].value_counts()
print("Class Counts:")
print(class_counts)

Class Counts:
cuisine
italian         7838
mexican         6438
southern_us     4320
indian          3003
chinese         2673
french          2646
cajun_creole    1546
thai            1539
japanese        1423
greek           1175
spanish          989
korean           830
vietnamese       825
moroccan         821
british          804
filipino         755
irish            667
jamaican         526
russian          489
brazilian        467
Name: count, dtype: int64


### Vectorize with TFIDF

In [4]:
# Vectorize the ingredients
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df_train['all_ingredients'])

# Define the target variable
y = df_train['cuisine']



### PCA to reduce dimensions

In [5]:
pca = PCA(n_components=0.8)  # Keep 95% of the variance
X_reduced = pca.fit_transform(X.toarray())

In [6]:
X_reduced.shape

(39774, 433)

### Train test split

In [17]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_reduced, y, test_size=0.2, random_state=42)


In [18]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((31819, 433), (7955, 433), (31819,), (7955,))

### Balance Dataset using SMOTE to oversample minority classes

In [9]:
smt = SMOTETomek(random_state=42)
X_resampled, y_resampled = smt.fit_resample(X_train, y_train)

In [10]:
X_train, y_train = X_resampled, y_resampled

### Train the Random Forest Classifier

In [11]:
# Train the model
model = RandomForestClassifier()
model.fit(X_train, y_train)


In [16]:
# Predict and evaluate the model
X_test = pca.transform(X_test.toarray())
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Model Accuracy: {accuracy * 100:.2f}%')

ValueError: Found input variables with inconsistent numbers of samples: [7955, 9944]

## Test with test.json data

In [13]:
# Load the dataset from the JSON file
df_test = pd.read_json('dataset/test.json')

df_test.to_csv('dataset/test.csv', index=False)

# Create a column with all ingredients as a single string
df_test['all_ingredients'] = df_test['ingredients'].apply(lambda x: ' '.join(x))

X_test = vectorizer.transform(df_test['all_ingredients'])

X_test_reduced = pca.transform(X_test.toarray())

df_test['prediction'] = model.predict(X_test_reduced)

output_df = pd.DataFrame()
output_df['id'] = df_test['id']
output_df['cuisine'] = df_test['prediction']

output_df.to_csv('predictions_RF.csv', index=False)

df_test


Unnamed: 0,id,ingredients,all_ingredients,prediction
0,18009,"[baking powder, eggs, all-purpose flour, raisi...",baking powder eggs all-purpose flour raisins m...,russian
1,28583,"[sugar, egg yolks, corn starch, cream of tarta...",sugar egg yolks corn starch cream of tartar ba...,southern_us
2,41580,"[sausage links, fennel bulb, fronds, olive oil...",sausage links fennel bulb fronds olive oil cub...,italian
3,29752,"[meat cuts, file powder, smoked sausage, okra,...",meat cuts file powder smoked sausage okra shri...,cajun_creole
4,35687,"[ground black pepper, salt, sausage casings, l...",ground black pepper salt sausage casings leeks...,italian
...,...,...,...,...
9939,30246,"[large egg yolks, fresh lemon juice, sugar, bo...",large egg yolks fresh lemon juice sugar bourbo...,french
9940,36028,"[hot sauce, butter, sweet potatoes, adobo sauc...",hot sauce butter sweet potatoes adobo sauce salt,southern_us
9941,22339,"[black pepper, salt, parmigiano reggiano chees...",black pepper salt parmigiano reggiano cheese r...,italian
9942,42525,"[cheddar cheese, cayenne, paprika, plum tomato...",cheddar cheese cayenne paprika plum tomatoes g...,southern_us


In [14]:

# Predict on new data (example)
test_data = [
    {
        "id": 1,
        "ingredients": [
            "romaine lettuce",
            "black olives",
            "grape tomatoes",
            "feta cheese crumbles"
        ]
    }
]

# Preprocess the test data
test_df = pd.DataFrame(test_data)
test_df['all_ingredients'] = test_df['ingredients'].apply(lambda x: ' '.join(x))
X_new = vectorizer.transform(test_df['all_ingredients'])

# Predict the cuisine
predictions = model.predict(X_new)
print(predictions)


ValueError: X has 3010 features, but RandomForestClassifier is expecting 433 features as input.