In [2]:
!pip install scikit-learn
!pip install pandas

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0[0m[39;49m -> [0m[32;49m23.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [1]:
import json

def read_preferences(file_path):
    """Load a JSON file and return its ``["user"]["preferences"]`` entry.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        Whatever object is stored under ``data["user"]["preferences"]``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If the ``"user"`` or ``"preferences"`` key is missing.
    """
    # JSON is UTF-8 by specification; do not depend on the platform's
    # locale-default encoding, which can mis-decode non-ASCII names.
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    return data["user"]["preferences"]

In [2]:
# Read the user's preferences from the JSON file next to the notebook.
# NOTE(review): generate_input() re-reads the same file into its own local
# variable instead of using this one - confirm whether this cell is still needed.
preferences = read_preferences('./user_data.json')

In [3]:
from sklearn import tree
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

In [4]:
#Generate input based on user preference
def generate_input():
    """Build the training rows and their labels from the user's preferences.

    Reads ``./user_data.json`` through ``read_preferences`` and, for each of
    the ``'favorite'`` and ``'not_favorite'`` groups, zips the ``'family'``,
    ``'location'`` and ``'dominated_colors_name'`` lists into
    ``[family, location, color]`` rows.

    Returns:
        tuple: ``(data, result)`` where ``data`` is the list of rows
        (favorites first, then non-favorites) and ``result`` is the list of
        label strings, one per row, in the same order.
    """
    preferences = read_preferences('./user_data.json')
    data = []
    result = []

    for label in ('favorite', 'not_favorite'):
        group = preferences[label]
        rows = [
            [family, location, color]
            for family, location, color in zip(
                group['family'], group['location'], group['dominated_colors_name']
            )
        ]
        data.extend(rows)
        # One label per row actually produced, so data and result stay the
        # same length even when a group does not contain exactly ten entries.
        result.extend([label] * len(rows))

    return data, result


In [44]:
import random
def predict_favorite(family, location, dominated_colors_name):
    """Predict the preference label for one item described by three categories.

    A random forest is trained from scratch on every call, using the rows and
    labels returned by ``generate_input()``, and is then asked to classify the
    single item described by the arguments.

    Args:
        family: Family name of the item.
        location: Location name of the item.
        dominated_colors_name: Dominant colour name of the item.

    Returns:
        The predicted label, decoded back to one of the label strings present
        in the training labels.

    Note:
        An argument value that does not occur in the training rows is replaced
        by a randomly chosen value that does, so calls with unseen values are
        not deterministic unless the ``random`` module is seeded.
    """
    data, result = generate_input()
    feature_columns = ['family', 'location', 'dominated_colors_name']

    # Sorted so the fallback choice depends only on the random module's state,
    # not on the per-process iteration order of a set of strings.
    families = sorted(set(x[0] for x in data))
    locations = sorted(set(x[1] for x in data))
    colors = sorted(set(x[2] for x in data))

    # If a user-supplied value was never seen in the dataset, fall back to a
    # random value that was (the encoders below cannot transform unseen labels).
    if family not in families:
        family = random.choice(families)
    if location not in locations:
        location = random.choice(locations)
    if dominated_colors_name not in colors:
        dominated_colors_name = random.choice(colors)

    dataframe = pd.DataFrame(data, columns=feature_columns)

    # Turn every categorical column into integer codes, keeping one encoder
    # per column so the query can be encoded the same way.
    query = dict(zip(feature_columns, (family, location, dominated_colors_name)))
    encoded_query = {}
    for column in feature_columns:
        encoder = LabelEncoder()
        dataframe[column] = encoder.fit_transform(dataframe[column])
        encoded_query[column] = encoder.transform([query[column]])[0]

    label_encoder = LabelEncoder()
    labels = label_encoder.fit_transform(result)

    rfc = RandomForestClassifier(n_estimators=10, max_depth=2,
                      random_state=0)
    rfc = rfc.fit(dataframe, labels)

    # Query with a DataFrame carrying the training column names; passing a
    # bare list makes scikit-learn warn that X has no valid feature names.
    query_frame = pd.DataFrame([encoded_query], columns=feature_columns)
    prediction = rfc.predict(query_frame)

    return label_encoder.inverse_transform(prediction)[0]

In [45]:
# Print the predicted label for two sample items, one per line.
sample_items = (
    ('Onoserideae', 'South America', 'snow'),
    ('Asteraceae', 'Greece', 'darkslategrey'),
)
for sample_item in sample_items:
    print(predict_favorite(*sample_item))

[['Onoserideae', 'South America', 'snow'], ['Notocacteae', 'South America', 'dimgrey'], ['Plantaginaceae', 'South America', 'darkslategrey'], ['Bombacoideae', 'South America', 'darkkhaki'], ['Gladiolus', 'Switzerland', 'darkolivegreen'], ['Armeria', 'South America', 'darkkhaki'], ['Sapindaceae', 'Africa', 'silver'], ['Canis', 'United States of America', 'darkolivegreen'], ['Vochysiaceae', 'South America', 'darkolivegreen'], ['Browningia', 'South America', 'grey'], ['Notocacteae', 'South America', 'black'], ['Nardeae', 'Europe', 'darkkhaki'], ['Anaxeton', 'Africa', 'darkgrey'], ['Bunium', 'Switzerland', 'darkolivegreen'], ['Proteaceae', 'South America', 'lightgrey'], ['Cervus', 'United States of America', 'darkolivegreen'], ['Rusa', 'United States of America', 'black'], ['Alopecurus', 'South America', 'darkseagreen'], ['Malveae', 'South America', 'gainsboro'], ['Poaceae', 'South America', 'dimgrey']]
['favorite', 'favorite', 'favorite', 'favorite', 'favorite', 'favorite', 'favorite', 'f



In [52]:
# Test with fixed, known data (nothing read from the user file) so that the outputs can be asserted
def predict_favorite(family, location, dominated_colors_name):
    """Predict the preference label for one item, using a fixed training set.

    Test variant of ``predict_favorite``: the training rows and labels are
    hard-coded below instead of being read from the user's data file, so the
    predictions can be asserted. Defining it replaces any earlier
    ``predict_favorite`` in the notebook namespace.

    Args:
        family: Family name of the item.
        location: Location name of the item.
        dominated_colors_name: Dominant colour name of the item.

    Returns:
        The predicted label string, ``'favorite'`` or ``'not_favorite'``.

    Note:
        An argument value that does not occur in the training rows is replaced
        by a randomly chosen value that does, so only calls whose three values
        all appear in the fixed data are deterministic.
    """
    data = [['Onoserideae', 'South America', 'snow'], ['Notocacteae', 'South America', 'dimgrey'], ['Plantaginaceae', 'South America', 'darkslategrey'], ['Bombacoideae', 'South America', 'darkkhaki'], ['Gladiolus', 'Switzerland', 'darkolivegreen'], ['Armeria', 'South America', 'darkkhaki'], ['Sapindaceae', 'Africa', 'silver'], ['Canis', 'United States of America', 'darkolivegreen'], ['Vochysiaceae', 'South America', 'darkolivegreen'], ['Browningia', 'South America', 'grey'], ['Notocacteae', 'South America', 'black'], ['Nardeae', 'Europe', 'darkkhaki'], ['Anaxeton', 'Africa', 'darkgrey'], ['Bunium', 'Switzerland', 'darkolivegreen'], ['Proteaceae', 'South America', 'lightgrey'], ['Cervus', 'United States of America', 'darkolivegreen'], ['Rusa', 'United States of America', 'black'], ['Alopecurus', 'South America', 'darkseagreen'], ['Malveae', 'South America', 'gainsboro'], ['Poaceae', 'South America', 'dimgrey']]
    result = ['favorite', 'favorite', 'favorite', 'favorite', 'favorite', 'favorite', 'favorite', 'favorite', 'favorite', 'favorite', 'not_favorite', 'not_favorite', 'not_favorite', 'not_favorite', 'not_favorite', 'not_favorite', 'not_favorite', 'not_favorite', 'not_favorite', 'not_favorite']
    feature_columns = ['family', 'location', 'dominated_colors_name']

    # Sorted so the fallback choice depends only on the random module's state,
    # not on the per-process iteration order of a set of strings.
    families = sorted(set(x[0] for x in data))
    locations = sorted(set(x[1] for x in data))
    colors = sorted(set(x[2] for x in data))

    # If a user-supplied value was never seen in the dataset, fall back to a
    # random value that was (the encoders below cannot transform unseen labels).
    if family not in families:
        family = random.choice(families)
    if location not in locations:
        location = random.choice(locations)
    if dominated_colors_name not in colors:
        dominated_colors_name = random.choice(colors)

    dataframe = pd.DataFrame(data, columns=feature_columns)

    # Turn every categorical column into integer codes, keeping one encoder
    # per column so the query can be encoded the same way.
    query = dict(zip(feature_columns, (family, location, dominated_colors_name)))
    encoded_query = {}
    for column in feature_columns:
        encoder = LabelEncoder()
        dataframe[column] = encoder.fit_transform(dataframe[column])
        encoded_query[column] = encoder.transform([query[column]])[0]

    label_encoder = LabelEncoder()
    labels = label_encoder.fit_transform(result)

    rfc = RandomForestClassifier(n_estimators=10, max_depth=2,
                      random_state=0)
    rfc = rfc.fit(dataframe, labels)

    # Query with a DataFrame carrying the training column names; passing a
    # bare list makes scikit-learn warn that X has no valid feature names.
    query_frame = pd.DataFrame([encoded_query], columns=feature_columns)
    prediction = rfc.predict(query_frame)

    return label_encoder.inverse_transform(prediction)[0]

# Each key is a (family, location, color) triple and each value is the label
# the classifier is expected to return for it.
expected_labels = {
    ('Onoserideae', 'South America', 'snow'): 'favorite',
    ('Poaceae', 'South America', 'dimgrey'): 'not_favorite',
}
for features, expected_label in expected_labels.items():
    assert predict_favorite(*features) == expected_label

