### load the dataset

In [1]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder

In [2]:
train = pd.read_json('train.json')
test = pd.read_json('test.json')

In [3]:
train.head()

Unnamed: 0,id,cuisine,ingredients
0,10259,greek,"[romaine lettuce, black olives, grape tomatoes..."
1,25693,southern_us,"[plain flour, ground pepper, salt, tomatoes, g..."
2,20130,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,22213,indian,"[water, vegetable oil, wheat, salt]"
4,13162,indian,"[black pepper, shallots, cornflour, cayenne pe..."


In [4]:
print(f"There are {train.shape[0]} data points in the train dataset")

There are 39774 data points in the train dataset


In [5]:
test.head()

Unnamed: 0,id,ingredients
0,18009,"[baking powder, eggs, all-purpose flour, raisi..."
1,28583,"[sugar, egg yolks, corn starch, cream of tarta..."
2,41580,"[sausage links, fennel bulb, fronds, olive oil..."
3,29752,"[meat cuts, file powder, smoked sausage, okra,..."
4,35687,"[ground black pepper, salt, sausage casings, l..."


In [6]:
print(f"There are {test.shape[0]} data points in the test dataset")

There are 9944 data points in the test dataset


### feature engineering

In [7]:
processed_feature = []
for i in train['ingredients']:
    intermediate = []
    for x in i:
        if x not in stopwords.words('english'):# remove stopwords, which the dataset might not have
            x.lower()
            x = re.sub('[^a-zA-Z]', ' ', x) # remove punctuation
            x = re.sub((r'\b(oz|ounc|ounce|pound|lb|inch|inches|kg|to)\b'), ' ', x) # remove units
        intermediate.append(x)
    processed_feature.append(intermediate)

In [8]:
processed_test_feature = []
for i in test['ingredients']:
    intermediate = []
    for x in i:
        if x not in stopwords.words('english'):
            x.lower()
            x = re.sub('[^a-zA-Z]', ' ', x)
            x = re.sub((r'\b(oz|ounc|ounce|pound|lb|inch|inches|kg|to)\b'), ' ', x)
        intermediate.append(x)
    processed_test_feature.append(intermediate)

In [9]:
len(processed_feature)

39774

In [10]:
len(processed_test_feature)

9944

### preprocess the target column

In [11]:
le = LabelEncoder()

In [12]:
ytrain = le.fit_transform(train['cuisine'])

In [13]:
le.classes_

array(['brazilian', 'british', 'cajun_creole', 'chinese', 'filipino',
       'french', 'greek', 'indian', 'irish', 'italian', 'jamaican',
       'japanese', 'korean', 'mexican', 'moroccan', 'russian',
       'southern_us', 'spanish', 'thai', 'vietnamese'], dtype=object)

### preprocess the training column

In [14]:
vectorizer = CountVectorizer(analyzer='word', ngram_range=(1,1), binary=True, max_df=0.99)

In [15]:
Xtrain = vectorizer.fit_transform([str(i) for i in processed_feature])
Xtest = vectorizer.transform([str(i) for i in processed_test_feature])

In [16]:
Xtrain.toarray().shape # there are 2996 words got vectorized

(39774, 2996)

In [17]:
Xtest.toarray().shape

(9944, 2996)

### build a model 

In [18]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [19]:
rf = RandomForestClassifier(n_estimators=600)

In [20]:
rf.fit(Xtrain, ytrain)

RandomForestClassifier(n_estimators=600)

In [21]:
kfold = KFold(n_splits=5)

In [22]:
scores = cross_val_score(rf, Xtrain, ytrain, cv=kfold)

In [23]:
mean_score = np.mean(scores)

In [26]:
mean_score

0.761226047257589

In [24]:
prediction = rf.predict(Xtest)

In [28]:
prediction = le.inverse_transform(prediction)

In [29]:
prediction_df = pd.DataFrame({'id':test.id, 'cuisine':prediction}, columns=['id', 'cuisine'])

In [30]:
prediction_df

Unnamed: 0,id,cuisine
0,18009,irish
1,28583,southern_us
2,41580,italian
3,29752,cajun_creole
4,35687,italian
...,...,...
9939,30246,french
9940,36028,southern_us
9941,22339,italian
9942,42525,southern_us
