# In [None]:
# Imports
import ast
from collections import Counter

import matplotlib as mpl
import matplotlib.pylab as plt
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.metrics import accuracy_score

# Read the data
df = pd.read_csv('/content/Dataset .csv')


def _parse_ingredients(raw):
    """Convert one raw ``ingredients`` cell into a list of ingredient names.

    ``pd.read_csv`` hands text cells back as plain strings, so a cell such as
    "['salt', 'pepper']" has to be decoded before it can be iterated one
    ingredient at a time; iterating the raw string would yield single
    characters instead of ingredients.
    """
    if isinstance(raw, (list, tuple)):
        return list(raw)
    if not isinstance(raw, str):
        # Empty cells are read as NaN (a float): treat them as "no ingredients".
        return []
    text = raw.strip()
    try:
        parsed = ast.literal_eval(text)
    except (ValueError, SyntaxError):
        parsed = None
    if isinstance(parsed, (list, tuple)):
        return [str(item).strip() for item in parsed]
    # NOTE(review): fallback assumes a plain comma-separated cell -- confirm
    # against the actual layout of the CSV.
    return [part.strip() for part in text.strip('[]').split(',') if part.strip()]


# Decode once here so every later step can iterate real ingredient lists
df['ingredients'] = df['ingredients'].apply(_parse_ingredients)
df.head()

# Flatten the ingredients of every recipe into one global list
# (duplicates are still present at this point)
features_all_list = [item for recipe in df.ingredients for item in recipe]

# Keep one entry per distinct ingredient; set ordering is arbitrary
unique_ingredients = set(features_all_list)
features = list(unique_ingredients)

len(features)

# One row per recipe, one column per distinct ingredient
onehot_ingredients = np.zeros((df.shape[0], len(features)))

# Index the features (ingredients) alphabetically
feature_lookup = sorted(features)

# Resolve ingredient -> column once, instead of scanning the sorted list with
# list.index() for every single ingredient of every recipe
column_of = {name: col for col, name in enumerate(feature_lookup)}

# For each recipe set the column of every ingredient it contains to 1.
# enumerate() supplies the positional row, so the result does not depend on
# the DataFrame's index labels being 0..n-1.
for row_pos, recipe in enumerate(df['ingredients']):
    for ingredient in recipe:
        onehot_ingredients[row_pos, column_of[ingredient]] = 1

# Target labels: one cuisine per recipe, shaped as a column vector
y = df.cuisine.values.reshape(-1,1)
# Using the indices of the ingredients, we can reduce the amount of string
# matching required to one-hot encode the ingredients into binary features.

# Create a dataframe
df_features = pd.DataFrame(onehot_ingredients)

# Map each positional column label to its ingredient name. The matrix columns
# were filled in feature_lookup (sorted) order, so the names must come from
# feature_lookup as well. The mapping is stored in `d` so that the recipe
# DataFrame `df` is left untouched for the later steps.
d = dict(zip(df_features.columns, feature_lookup))

# Rename the features (stop using the index # and use the actual text)
df_features = df_features.rename(columns=d)
df_features.shape
# The reported shape of df_features is (39774, 6714), meaning 39774 recipes
# and 6714 unique ingredients.


#CUISINE CLASSIFICATION
#In order to classify with best practices in mind, we need to split the data into train and test sets. Evaluating on held-out test data lets us detect overfitting. Completing this step prior to training all of the models allows us to use the same train and test data across models. Note that we are using the shuffle feature to rearrange the recipes (in case the order was not originally random) and test_size=0.2, indicating that we want 80% of the data reserved for training and 20% for testing.

# Import train_test_split
from sklearn.model_selection import train_test_split

# Shuffle the recipes and hold out 20% of them for testing; the fixed seed
# keeps the partition identical from run to run
(X_train, X_test,
 y_train, y_test) = train_test_split(
    df_features,
    y,
    random_state=42,
    shuffle=True,
    test_size=0.2,
)
#DECISION TREE
#The first model that we fit is a basic, unpruned decision tree. We use this model as a baseline for performance in the classification task.

# Import decision tree from sklearn
from sklearn.tree import DecisionTreeClassifier

# Set up the decision tree. At most 5000 features are considered per split;
# the cap is clamped to the number of available columns so that a smaller
# feature matrix does not make the constructor argument invalid.
clf = DecisionTreeClassifier(max_features=min(5000, X_train.shape[1]))

# Fit the decision tree to the training data
# (ravel turns the column-vector labels into the 1d array used by the other models)
clf.fit(X_train, y_train.ravel())

# Use the decision tree to predict values for the test data
y_pred = clf.predict(X_test)

# Calculate the accuracy score and print the results
a = accuracy_score(y_test, y_pred)
print("Accuracy Score in % : ")
print(a * 100)
#For this first decision tree, the test accuracy (a potentially unbiased estimate, since the test set was held out from training) is 60.68%. For context, a human typically can classify recipes into the correct cuisine in 45-50% of attempts. The depth of this decision tree was 403, which could indicate overfitting. Ideally, we would tune the max_depth hyperparameter, but since we only need a baseline, this number will suffice.

#RANDOM FOREST
#The second model chosen is an ensemble method known as ‘random forest’. You can read more about it on Wikipedia. While the decision tree serves only as a baseline classifier, with the Random Forest we want to tune the model’s hyper-parameters. For example, we tuned each of the following independently and then also used them as a basis for tuning a combination: maximum tree depth, number of trees in the forest, maximum number of features considered at each split, and minimum number of samples per split. The hyperparameter tuning code is not shown below but can be provided on request.

# Import random forest classifier from sklearn
from sklearn.ensemble import RandomForestClassifier

# Set up random forest classifier (library defaults, i.e. the untuned model)
clf = RandomForestClassifier()

# Train the random forest (use ravel to coerce to 1d array)
clf.fit(X_train, y_train.ravel())

# Get test predictions
y_pred = clf.predict(X_test)

# Get accuracy for the random forest classifier
a = accuracy_score(y_test, y_pred)
print("Accuracy Score in % : ")
print(a * 100)
#By tuning the random forest, we were able to increase test accuracy from 67.11% to 71.64% by setting max_depth=200, n_estimators=250, max_features=‘sqrt’, and min_samples_split=7.

# Setting up the tuned random forest
clf = RandomForestClassifier(max_depth=200, n_estimators=250, max_features='sqrt', min_samples_split=7)

# Train and score the tuned forest; without these steps the tuned model would
# be replaced by the next classifier before it was ever fitted
clf.fit(X_train, y_train.ravel())
y_pred = clf.predict(X_test)

a = accuracy_score(y_test, y_pred)
print("Accuracy Score in % : ")
print(a * 100)
#After training the random forest model, we can extract information about the relative importance of each feature (ingredient) in determining the class (cuisine) of a given recipe. The variable importance plot below shows how acai juice and nori furikake are considered distinguishing ingredients.



#MULTINOMIAL LOGISTIC REGRESSION
#To compete with the random forest, we trained a multinomial logistic regression.

# Import logistic regression
from sklearn.linear_model import LogisticRegression

# Set up and fit the logistic regression (ravel coerces the labels to 1d).
# The lbfgs solver already fits a multinomial model for multi-class targets,
# so the deprecated `multi_class` argument is not passed.
clf = LogisticRegression(random_state=0, solver='lbfgs').fit(X_train, y_train.ravel())

# Get predictions on test data
y_pred = clf.predict(X_test)

# Get accuracy (same label as the other models so the outputs are comparable)
a = accuracy_score(y_test, y_pred)
print("Accuracy Score in % : ")
print(a * 100)
#We were surprised by the performance of the logistic regression because it scored 78.14% test accuracy (6.5 percentage points better than the tuned random forest). We believe that the number of features and the sparseness of the data are problematic for the random forest algorithm.


#CUISINE CLUSTERING
#In order to perform clustering at the cuisine level, we must aggregate the recipes to cuisine levels.

# Group by cuisine and pool the ingredients of all its recipes into one list
# per cuisine (duplicates are kept so that frequent ingredients weigh more).
# Only the ingredients column is aggregated, so no other column has to exist
# or be dropped afterwards.
data_agg = (
    df.groupby('cuisine')['ingredients']
    .apply(lambda recipes: [item for recipe in recipes for item in recipe])
    .reset_index()
)

## Get all of the unique ingredients as features
features_all_list = [item for recipe in df.ingredients for item in recipe]

# De-duplicate once, after the full list has been collected
features = list(set(features_all_list))
len(features)

# Alphabetical column order plus a one-off ingredient -> column mapping
feature_lookup = sorted(features)
column_of = {name: col for col, name in enumerate(feature_lookup)}

# One row per cuisine, one column per ingredient. Each cell holds how many
# times the ingredient occurs across the cuisine's recipes (the variable keeps
# its earlier name although it now stores counts rather than 0/1 flags).
onehot_ingredients = np.zeros((data_agg.shape[0], len(features)))
for row_pos, pooled in enumerate(data_agg['ingredients']):
    for ingredient, count in Counter(pooled).items():
        onehot_ingredients[row_pos, column_of[ingredient]] = count

#We apply tf-idf weighting to standardize the data and principal component analysis (PCA) to reduce its dimensionality, and then apply a clustering algorithm. For simplicity and since we have labeled data, we chose K-Means. Other options such as Gaussian mixture models and hierarchical clustering could improve the clusters but were determined not to be necessary for this clustering task.
# NOTE(review): the tf-idf / PCA code was not part of the original script; the
# two steps below are reconstructed from the description above. Two components
# are kept because the plot uses columns 0 and 1 of reduced_data -- confirm
# against the original analysis.
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfTransformer

tfidf_weights = TfidfTransformer().fit_transform(onehot_ingredients)
reduced_data = PCA(n_components=2, random_state=42).fit_transform(tfidf_weights.toarray())

# Import Kmeans clustering
from sklearn.cluster import KMeans

# Set # of clusters
## We tried 3, 4, 5, 6, 7, 8, and 10 with 5 being the best
numOfClusters = 5

# Set up KMeans
kmeans = KMeans(init='k-means++', n_clusters=numOfClusters, n_init=10)

# Fit kmeans once; the predictions below come from this same fitted model
kmeans.fit(reduced_data)

# Predict kmeans (shifted by one so that cluster ids start at 1)
kmeans_pred = kmeans.predict(reduced_data)
kmeans_pred = kmeans_pred + 1

#Generate plot

# Generate plot of the resultant clusters.
# The first two columns of reduced_data are the plotting coordinates; they get
# their own names so the label vector `y` used for classification is not
# overwritten.
x_coords = reduced_data[:, 0]
y_coords = reduced_data[:, 1]

# Set font size
plt.rcParams.update({'font.size': 15})

# Get fig, ax, and set figure size
fig, ax = plt.subplots(figsize=(10, 10))

# Scatter the cuisines, coloured by their cluster id
ax.scatter(x_coords, y_coords, s=5000, c=kmeans_pred, cmap='Set3')

# Add labels to each cuisine (data_agg rows are in the same order as the points)
for txt, px, py in zip(data_agg.cuisine, x_coords, y_coords):
    ax.annotate(txt, (px, py))

# Needed for the figure to appear when this runs as a plain script
plt.show()