# Notebook d'exploration : sélection des features pertinentes et choix des étapes de nettoyage

In [687]:
import ast
from typing import List

import numpy as np
import pandas as pd

In [688]:
plants = pd.read_csv('../data/raw/plant_details_all.csv')
plants.head()

Unnamed: 0,id,common_name,scientific_name,other_name,family,origin,type,dimension,cycle,attracts,...,hardiness_location.full_url,hardiness_location.full_iframe,watering_general_benchmark.value,watering_general_benchmark.unit,depth_water_requirement.unit,depth_water_requirement.value,pruning_count.amount,pruning_count.interval,volume_water_requirement.unit,volume_water_requirement.value
0,1,European Silver Fir,['Abies alba'],['Common Silver Fir'],,"['Austria', 'Germany', 'Switzerland', 'France'...",tree,Height: 60 feet,Perennial,[],...,https://perenual.com/api/hardiness-map?species...,<iframe frameborder=0 scrolling=yes seamless=s...,,days,,,,,,
1,5,Fraser Fir,['Abies fraseri'],['Southern Fir'],Pinaceae,['Southeastern United States'],tree,Height: 35 feet,Perennial,[],...,https://perenual.com/api/hardiness-map?species...,<iframe frameborder=0 scrolling=yes seamless=s...,,days,,,,,,
2,6,Golden Korean Fir,"[""Abies koreana 'Aurea'""]",[],Pinaceae,['North and South Korea'],tree,Height: 20 feet,Perennial,[],...,https://perenual.com/api/hardiness-map?species...,<iframe frameborder=0 scrolling=yes seamless=s...,,days,,,,,,
3,8,Blue Spanish Fir,"[""Abies pinsapo 'Glauca'""]",['Glaucous Spanish Fir'],Pinaceae,['Southern Spain'],tree,Height: 45 feet,Perennial,[],...,https://perenual.com/api/hardiness-map?species...,<iframe frameborder=0 scrolling=yes seamless=s...,,days,inches,2.0,2.0,yearly,,
4,9,Noble Fir,['Abies procera'],"['Red Fir', 'White Fir']",Pinaceae,"['United States', 'Canada', 'Mexico', 'Central...",tree,Height: 90 feet,Perennial,"['Squirrels', ' Birds']",...,https://perenual.com/api/hardiness-map?species...,<iframe frameborder=0 scrolling=yes seamless=s...,,days,,,,,,


In [689]:
plants.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1839 entries, 0 to 1838
Data columns (total 68 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   id                                1839 non-null   int64  
 1   common_name                       1839 non-null   object 
 2   scientific_name                   1839 non-null   object 
 3   other_name                        1839 non-null   object 
 4   family                            1734 non-null   object 
 5   origin                            1839 non-null   object 
 6   type                              1834 non-null   object 
 7   dimension                         1839 non-null   object 
 8   cycle                             1839 non-null   object 
 9   attracts                          1839 non-null   object 
 10  propagation                       1839 non-null   object 
 11  watering                          1839 non-null   object 
 12  depth_

In [690]:
plants.shape

(1839, 68)

In [691]:
# Faisons une première sélection des colonnes qui semblent pertinentes pour notre recommandation de plantes :

relevant_features = ['common_name', 'scientific_name', 'type', 'soil', 'cycle', 'attracts', 'watering',     
                     'maintenance', 'care_level', 'sunlight', 'growth_rate', 'drought_tolerant',  
                     'salt_tolerant', 'thorny', 'poisonous_to_humans', 'poisonous_to_pets', 'invasive', 'edible_fruit', 'medicinal',
                     'hardiness.min', 'hardiness.max']

In [692]:
# Work on an independent copy of the selected columns: the cells below assign into
# plants_selection with .loc, and a bare column selection may still be tied to `plants`
# (SettingWithCopyWarning, writes that may not land where expected).
plants_selection = plants[relevant_features].copy()

## Analyse colonne "Maintenance" :

In [693]:
plants_selection.isnull().sum() # Choisir entre colonne "maintenance" et "care_level" : "maintenance car moins de valeurs manquantes"

common_name              0
scientific_name          0
type                     5
soil                     0
cycle                    0
attracts                 0
watering                 0
maintenance             83
care_level             286
sunlight                 0
growth_rate              0
drought_tolerant         0
salt_tolerant            0
thorny                   0
poisonous_to_humans      0
poisonous_to_pets        0
invasive                 0
edible_fruit             0
medicinal                0
hardiness.min            3
hardiness.max            3
dtype: int64

In [694]:
plants_selection.loc[:,'maintenance'] = plants_selection['maintenance'].str.lower()
plants_selection['maintenance'].value_counts()

maintenance
low                                                                                            1088
moderate                                                                                        521
high                                                                                             47
http://perenual.com/api/species-care-guide-list?species_id=128&key=sk-hwmo67d1db5d934599114       1
http://perenual.com/api/species-care-guide-list?species_id=129&key=sk-hwmo67d1db5d934599114       1
                                                                                               ... 
http://perenual.com/api/species-care-guide-list?species_id=339&key=sk-hwmo67d1db5d934599114       1
http://perenual.com/api/species-care-guide-list?species_id=341&key=sk-hwmo67d1db5d934599114       1
http://perenual.com/api/species-care-guide-list?species_id=342&key=sk-hwmo67d1db5d934599114       1
http://perenual.com/api/species-care-guide-list?species_id=343&key=sk-hwmo67d1db5d934599

In [695]:
plants_selection.loc[:,'care_level'] = plants_selection['care_level'].str.lower()
plants_selection['care_level'].value_counts()

care_level
medium                                                                                                                                                                                                1139
moderate                                                                                                                                                                                               193
low                                                                                                                                                                                                     60
[]                                                                                                                                                                                                      47
high                                                                                                                                                                             

In [696]:
# The only values accepted in the (lower-cased) 'maintenance' column.
maintenance_level = ['low', 'moderate', 'high']

# Rows whose maintenance value is not one of the accepted levels
# (missing values are flagged too, since NaN is not in the list).
unexpected_maintenance = ~plants_selection['maintenance'].isin(maintenance_level)
unexpected_share = unexpected_maintenance.sum() / len(plants_selection)
print(f"{unexpected_share:.1%} des données autres que Low, Moderate, High")

# Treat every unexpected value as missing.
plants_selection.loc[unexpected_maintenance, 'maintenance'] = np.nan

10.0% des données autres que Low, Moderate, High


In [697]:
plants_selection.isnull().sum()

common_name              0
scientific_name          0
type                     5
soil                     0
cycle                    0
attracts                 0
watering                 0
maintenance            183
care_level             286
sunlight                 0
growth_rate              0
drought_tolerant         0
salt_tolerant            0
thorny                   0
poisonous_to_humans      0
poisonous_to_pets        0
invasive                 0
edible_fruit             0
medicinal                0
hardiness.min            3
hardiness.max            3
dtype: int64

In [698]:
# Imputation : sur les 183 valeurs manquantes dans la colonne 'maintenance', 70 peuvent être remplies en utilisant la colonne 'care_level'
maintenance_null = (plants_selection['maintenance'].isnull())
care_level_not_null = (~plants_selection['care_level'].isnull())
care_level_ok = (plants_selection['care_level'].isin(['medium', 'moderate', 'high', 'low', 'easy']))

maintenance_care = plants_selection.loc[maintenance_null & care_level_not_null & care_level_ok, ['maintenance', 'care_level']]
maintenance_care

Unnamed: 0,maintenance,care_level
0,,medium
4,,medium
5,,medium
9,,high
10,,medium
...,...,...
203,,medium
205,,medium
206,,medium
208,,medium


In [699]:
# Les autres valeurs manquantes peuvent être remplies en utilisant la colonne 'watering' : 'Average' -> Moderate, 'Minimum' -> Low, 'Frequent' -> High
plants_selection.loc[:,'watering'] = plants_selection['watering'].str.lower()
plants_selection['watering'].value_counts()

watering
average     1205
minimum      428
frequent     206
Name: count, dtype: int64

In [700]:
plants_selection.loc[maintenance_null & ~(care_level_not_null & care_level_ok), ['maintenance', 'care_level', 'watering']]

Unnamed: 0,maintenance,care_level,watering
11,,,average
19,,,average
29,,,frequent
38,,,frequent
40,,,frequent
...,...,...,...
197,,"['diptera - true flies', ' leaf miner insect',...",average
198,,"['diptera - true flies', ' leaf miner insect',...",average
199,,"['pest resistant', ' disease resistant']",average
204,,,frequent


In [701]:
def impute_with_care_level(x):
    """Translate a 'care_level' label into the 'maintenance' vocabulary.

    "medium" becomes "moderate" and "easy" becomes "low"; any other value
    (e.g. "low", "moderate", "high") is returned unchanged.
    """
    if x == "medium":
        return "moderate"
    if x == "easy":
        return "low"
    return x
    
def impute_with_watering(x):
    """Derive a 'maintenance' level from a 'watering' label.

    "average" -> "moderate", "minimum" -> "low", "frequent" -> "high";
    any other value is returned unchanged.
    """
    watering_to_maintenance = (
        ("average", "moderate"),
        ("minimum", "low"),
        ("frequent", "high"),
    )
    for watering_label, maintenance_label in watering_to_maintenance:
        if x == watering_label:
            return maintenance_label
    return x

# Rows with no maintenance value but a usable care_level: translate the care_level.
mask = maintenance_null & care_level_not_null & care_level_ok
# Every other row with no maintenance value: fall back on the watering frequency.
mask_ = maintenance_null & ~(care_level_not_null & care_level_ok)
plants_selection.loc[mask, 'maintenance'] = plants_selection.loc[mask, 'care_level'].apply(impute_with_care_level)
plants_selection.loc[mask_, 'maintenance'] = plants_selection.loc[mask_, 'watering'].apply(impute_with_watering)

In [702]:
# On vérifie qu'il ne reste bien que les modalités "high", "low" et "moderate" :
plants_selection['maintenance'].value_counts()

maintenance
low         1093
moderate     674
high          72
Name: count, dtype: int64

In [703]:
# Nombre de valeurs manquantes restantes
plants_selection.isnull().sum()['maintenance']

np.int64(0)

In [704]:
# On supprime les colonnes 'care_level' et  'watering'
plants_selection = plants_selection.drop(columns=['care_level', 'watering'])

In [705]:
plants_selection.isnull().sum()

common_name            0
scientific_name        0
type                   5
soil                   0
cycle                  0
attracts               0
maintenance            0
sunlight               0
growth_rate            0
drought_tolerant       0
salt_tolerant          0
thorny                 0
poisonous_to_humans    0
poisonous_to_pets      0
invasive               0
edible_fruit           0
medicinal              0
hardiness.min          3
hardiness.max          3
dtype: int64

## Analyse colonne "Type"

In [706]:
plants_selection.loc[:,'type'] = plants_selection['type'].str.lower()
plants_selection['type'].value_counts(normalize=True)

type
tree                   0.176663
herb                   0.156489
flower                 0.140676
deciduous shrub        0.118866
broadleaf evergreen    0.091603
shrub                  0.073610
bulb                   0.048528
fern                   0.044166
ornamental grass       0.020174
vine                   0.012541
begonia                0.012541
vegetable              0.011450
cactus                 0.011450
coneflower             0.010360
needled evergreen      0.008724
rush or sedge          0.008724
turfgrass              0.006543
fruit                  0.005453
palm or cycad          0.004907
aster                  0.004362
thistle                0.003817
carnivorous            0.003272
creeper                0.003272
orchid                 0.002726
weed                   0.002726
euphorbia              0.002181
thrift                 0.001091
bush                   0.001091
carnation              0.001091
herbs                  0.001091
astilbe                0.001091
ber

In [707]:
# On regroupe les types en quelques grandes catégories pour permettre à l'utilisateur de choisir : 

fleurs = ['flower', 'bulb', 'orchid','begonia', 'dahlia', 'iridaceae', 'daisy', 'bergenia', 
          'coneflower', 'aster', 'thistle', 'thrift', 'carnation', 'astilbe', 'dianthus', 'delphinium', 'aquatic', 'carnivorous']
arbres = ["tree", "broadleaf evergreen", "needled evergreen", "bamboo"]
herbes = ['herb', 'herbs', 'grass', 'poales (grass-like)', 'weed', 'ornamental grass', 'rush or sedge', 'turfgrass', 'reed', 'reeds', 'fern']
arbustes = ['deciduous shrub', 'shrub', 'bush', 'palm or cycad']
potager = ['vegetable', 'fruit']
plantes_grimpantes = ['vine', 'creeper', 'creepers']
succulentes = ['cactus', 'euphorbia']       

In [708]:
# Lookup table linking each raw 'type' value to its broad category.

type_dict = {
    'fleurs': fleurs,
    'arbres': arbres,
    'herbes': herbes,
    "arbustes": arbustes,
    "potager": potager,
    "plantes_grimpantes": plantes_grimpantes,
    "succulentes": succulentes,
}
# Invert type_dict: one entry per raw type, pointing at the category that lists it
# (if a raw type were listed in several categories, the last one would win).
big_dict = {
    plant: category
    for category, plants_in_category in type_dict.items()
    for plant in plants_in_category
}

In [709]:
# Replace each raw type by its broad category; values absent from big_dict (including NaN) are kept as they are.
plants_selection.loc[:, 'type'] = plants_selection['type'].apply(lambda raw_type: big_dict.get(raw_type, raw_type))

In [710]:
plants_selection['type'].value_counts()

type
arbres                509
herbes                444
fleurs                431
arbustes              364
potager                31
plantes_grimpantes     30
succulentes            25
Name: count, dtype: int64

In [711]:
plants_selection[plants_selection['type'].isnull()] # 5 valeurs manquantes

Unnamed: 0,common_name,scientific_name,type,soil,cycle,attracts,maintenance,sunlight,growth_rate,drought_tolerant,salt_tolerant,thorny,poisonous_to_humans,poisonous_to_pets,invasive,edible_fruit,medicinal,hardiness.min,hardiness.max
336,asparagus,['Asparagus officinalis'],,[],Herbaceous Perennial,[],low,['Full sun'],High,False,True,False,False,False,False,True,True,3.0,10
483,false indigo,"[""Baptisia 'Carolina Moonlight'""]",,[],Herbaceous Perennial,['Butterflies'],low,"['Full sun', 'part shade']",Low,True,False,False,False,False,False,False,True,4.0,9
627,rutabaga,['Brassica napus (Napobrassica Group)'],,[],Annual,[],low,"['full sun', 'part shade']",Low,False,True,False,False,False,False,False,False,2.0,11
1331,common foxglove,"[""Digitalis purpurea 'Sutton's Apricot'""]",,"['Humus rich', ' Well-drained']",Herbaceous Perennial,['Hummingbirds'],low,['Part sun/part shade'],High,False,False,False,True,True,False,False,True,4.0,8
1511,coneflower,"[""Echinacea 'Secret Lust'""]",,[],Herbaceous Perennial,"['Birds', ' Butterflies']",low,"['Full sun', 'part shade']",High,True,False,False,False,False,False,False,True,3.0,8


In [712]:
# Rows whose 'type' is missing but that share their common name with other plants:
# reuse the first type found among those namesakes.
# (The loop variable is deliberately not called `id`, which would shadow the builtin
# for the rest of the notebook.)
for row_label in [336, 483, 1331, 1511]:
    same_name = plants_selection['common_name'] == plants_selection.loc[row_label, 'common_name']
    namesake_types = plants_selection.loc[same_name & plants_selection['type'].notna(), 'type']
    if namesake_types.empty:
        # Fail with an explicit message instead of an opaque IndexError.
        raise ValueError(f"No other plant with a known type shares the common name of row {row_label}")
    plants_selection.loc[row_label, 'type'] = namesake_types.iloc[0]

# Otherwise the type is set by hand: there is no other rutabaga in the dataset.
plants_selection.loc[627, 'type'] = "potager"

In [713]:
plants_selection['type'].isnull().sum()

np.int64(0)

## Colonne Soil

In [714]:
plants_selection['soil'].value_counts()

soil
[]                                                                  1346
['Well-drained']                                                     133
['Sandy Loamy Clay Rocky']                                            62
Low                                                                   46
['Humus rich', ' Well-drained']                                       38
Moderate                                                              35
['Acidic', ' Well-drained']                                           23
High                                                                  19
['Alkaline', ' Well-drained']                                         15
['Rocky ', ' gravelly ', ' dry', ' Well-drained']                     13
['Acidic', ' Bog', ' Humus rich']                                     12
['Acidic', ' Humus rich', ' Well-drained']                            12
['Bog', ' Humus rich']                                                12
['Sandy Loamy Rocky']                         

In [715]:
plants_selection = plants_selection.drop(columns=['soil'])  # drop pour l'instant, demande trop de nettoyage pour être exploitable

## Colonne Cycle

- perennials = plants that will come back and regrow year after year
- annuals = die off when temperatures get too cold and require you to plant new plants the following spring

In [716]:
plants_selection.loc[plants_selection['cycle'].isin(["Herbaceous Perennial", "Perennial."]), 'cycle'] = "Perennial" # regroupe en 2 catégories

In [717]:
plants_selection['cycle'].value_counts()

cycle
Perennial    1779
Annual         60
Name: count, dtype: int64

In [718]:
# Collapse 'cycle' into a single boolean column: True for "Perennial", False for anything else.

plants_selection['is_perennial'] = plants_selection['cycle'].eq("Perennial")
plants_selection['is_perennial'].value_counts()

is_perennial
True     1779
False      60
Name: count, dtype: int64

In [719]:
plants_selection = plants_selection.drop(columns=['cycle'])

## Colonne Attracts

In [720]:
plants_selection['attracts'].value_counts()

attracts
[]                                            1300
['Butterflies']                                212
['Birds', ' Butterflies']                      157
['Birds']                                       71
['Hummingbirds', ' Butterflies']                66
['Hummingbirds']                                24
['Birds', ' Hummingbirds', ' Butterflies']       6
['Squirrels', ' Bees']                           2
['Squirrels', ' Birds']                          1
Name: count, dtype: int64

In [721]:
# The 'attracts' column holds lists serialised as strings (e.g. "['Birds', ' Butterflies']").
# They are parsed with ast.literal_eval rather than eval: the text comes from an external CSV,
# and literal_eval only accepts Python literals, so it cannot run code embedded in the data.
plants_selection.loc[:, 'attracts'] = plants_selection['attracts'].apply(ast.literal_eval)
# Normalise the whitespace of every entry (e.g. ' Butterflies' -> 'Butterflies').
plants_selection.loc[:, 'attracts'] = plants_selection['attracts'].apply(lambda x: [' '.join(s.split()) for s in x])

In [722]:
 # On crée 2 colonnes : attracts_birds et attracts_butterflies (attracts_bees et attracts_squirrels concernent trop peu de plantes pour être utiles)

def attracts(x, animals: List[str]):
    """Tell whether a plant attracts at least one of the given animals.

    Args:
        x: iterable of the animal names attracted by one plant.
        animals: animal names to look for.

    Returns:
        True if any entry of ``x`` is in ``animals``, False otherwise
        (including when ``x`` is empty).
    """
    return any(attracted_being in animals for attracted_being in x)

# Hummingbirds are counted as birds.
plants_selection.loc[:,'attracts_birds'] = plants_selection['attracts'].apply(attracts, args=(['Birds', 'Hummingbirds'],))
plants_selection.loc[:,'attracts_butterflies'] = plants_selection['attracts'].apply(attracts, args=(['Butterflies'],))
# No attracts_bees / attracts_squirrels columns: only 2 plants attract bees and 3 attract
# squirrels, so those columns would not be useful.

In [723]:
plants_selection['attracts_birds'].value_counts()

attracts_birds
False    1514
True      325
Name: count, dtype: int64

In [724]:
# drop attracts
plants_selection = plants_selection.drop(columns=['attracts'])

## Colonne sunlight

- Soleil (full_sun) : plus de 6h d'ensoleillement par jour 
- Mi-ombre (part_shade) : de 4 à 6h d'ensoleillement direct ou plus de 8h d'ensoleillement indirect (ex: filtré par des arbres).
- Ombre (full_shade) : moins de 4h d'ensoleillement direct ou moins de 8h d'ensoleillement indirect

In [725]:
plants_selection ['sunlight'].value_counts() 

sunlight
['Full sun']                                                                                   467
['Full sun', 'part shade']                                                                     429
['full sun', 'part shade']                                                                     187
['Part shade', 'full shade']                                                                   176
['full sun']                                                                                    97
                                                                                              ... 
['Shade']                                                                                        1
['Deep shade', 'Filtered shade', 'Full sun only if soil kept moist', 'Part sun/part shade']      1
['Partial sun Shade']                                                                            1
['Full sun only if soil kept moist', ' Part sun/part shade']                                     1
[

In [726]:
# 'sunlight' holds lists serialised as strings. They are parsed with ast.literal_eval rather
# than eval: the text comes from an external CSV, and literal_eval only accepts Python
# literals, so it cannot run code embedded in the data.
plants_selection.loc[:, 'sunlight'] = plants_selection['sunlight'].apply(ast.literal_eval)
# Lower-case every entry and normalise its whitespace (e.g. 'Full sun' and 'full sun' become identical).
plants_selection.loc[:, 'sunlight'] = plants_selection['sunlight'].apply(lambda x: [' '.join(s.lower().split()) for s in x])

In [727]:
# Sunlight labels are grouped into 3 categories: full_sun, part_shade and full_shade.
full_sun = ['full sun', 'sun', 'full sun partial sun']
full_shade = ["deep shade", "full shade", "shade"]

def categorize_sunlight(x, full_sun=full_sun, full_shade=full_shade):
    """Reduce a plant's list of sunlight labels to a single category.

    Args:
        x: iterable of normalised (lower-case) sunlight labels for one plant.
        full_sun: labels that count as full sun.
        full_shade: labels that count as full shade.

    Returns:
        "full_shade" if any label is a full-shade label (shade takes priority over sun),
        otherwise "full_sun" if any label is a full-sun label,
        otherwise "part_shade" (also the result for an empty list).
    """
    if any(label in full_shade for label in x):
        return "full_shade"
    if any(label in full_sun for label in x):
        return "full_sun"
    return "part_shade"

# Replace each list of sunlight labels by its single category (default label groups are used).
plants_selection.loc[:,'sunlight'] = plants_selection['sunlight'].apply(categorize_sunlight)

In [728]:
plants_selection['sunlight'].value_counts()

sunlight
full_sun      1316
part_shade     283
full_shade     240
Name: count, dtype: int64

## Colonne growth_rate

In [729]:
plants_selection['growth_rate'].value_counts() # on pourrait corriger à la main ou voir si scrapping possible

growth_rate
Low         1090
High         489
Moderate     160
FALSE         62
TRUE          38
Name: count, dtype: int64

In [730]:
# drop pour l'instant:
plants_selection = plants_selection.drop(columns=['growth_rate'])

## Colonnes Hardiness Min/Max

Zones de rusticité : définies selon les températures hivernales les plus basses.

In [731]:
plants_selection = plants_selection.rename(columns={"hardiness.min": "hardiness_min", "hardiness.max": "hardiness_max"})

In [732]:
plants_selection["hardiness_min"].value_counts().sort_index()

hardiness_min
2.0      66
3.0     323
4.0     337
5.0     298
6.0     382
7.0     166
8.0     102
9.0      67
10.0     85
11.0     10
Name: count, dtype: int64

In [733]:
# Certaines lignes contiennent des liens vers une carte des zones de rusticité -> on peut les remplacer avec hardiness_min
plants_selection.loc[~plants_selection['hardiness_max'].isin(["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13"]), 'hardiness_max'].value_counts()

hardiness_max
https://perenual.com/api/hardiness-map?species_id=123&size=og&key=sk-HWmO67d1db5d934599114    1
https://perenual.com/api/hardiness-map?species_id=128&size=og&key=sk-HWmO67d1db5d934599114    1
https://perenual.com/api/hardiness-map?species_id=129&size=og&key=sk-HWmO67d1db5d934599114    1
https://perenual.com/api/hardiness-map?species_id=131&size=og&key=sk-HWmO67d1db5d934599114    1
https://perenual.com/api/hardiness-map?species_id=132&size=og&key=sk-HWmO67d1db5d934599114    1
                                                                                             ..
https://perenual.com/api/hardiness-map?species_id=340&size=og&key=sk-HWmO67d1db5d934599114    1
https://perenual.com/api/hardiness-map?species_id=341&size=og&key=sk-HWmO67d1db5d934599114    1
https://perenual.com/api/hardiness-map?species_id=342&size=og&key=sk-HWmO67d1db5d934599114    1
https://perenual.com/api/hardiness-map?species_id=343&size=og&key=sk-HWmO67d1db5d934599114    1
https://perenual.com/api/h

In [734]:
plants_selection.loc[~plants_selection['hardiness_max'].isin(["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13"]), 'hardiness_min'].value_counts()

hardiness_min
6.0    75
7.0    17
8.0     6
9.0     2
Name: count, dtype: int64

In [735]:
# Anything in 'hardiness_max' that is not one of the zone numbers "1".."13" (stored as strings)
# is replaced by the row's 'hardiness_min'. In the values displayed above these are
# hardiness-map URLs; missing values are matched by the mask as well.
# NOTE(review): the displayed URLs embed an API key; avoid leaving them in shared outputs.
valid_zones = [str(zone) for zone in range(1, 14)]
mask_islink = ~plants_selection['hardiness_max'].isin(valid_zones)
plants_selection.loc[mask_islink, 'hardiness_max'] = plants_selection.loc[mask_islink, 'hardiness_min']

In [736]:
# Assign the converted column back by name so that it really becomes numeric:
# with `.loc[:, col] = ...`, pandas >= 2.0 writes the values into the existing
# object-dtype column in place, and the column dtype would stay `object`.
plants_selection['hardiness_max'] = pd.to_numeric(plants_selection['hardiness_max'])

In [737]:
plants_selection["hardiness_max"].value_counts().sort_index()

hardiness_max
2.0       1
5.0       1
6.0     216
7.0     170
8.0     594
9.0     438
10.0    213
11.0    145
12.0     58
Name: count, dtype: int64

In [738]:
# 3 plantes pour lesquelles la zone de rusticité n'est pas renseignée -> recherche sur google et rempli à la main
plants_selection.loc[plants_selection['hardiness_max'].isna() | plants_selection['hardiness_min'].isna()]

Unnamed: 0,common_name,scientific_name,type,maintenance,sunlight,drought_tolerant,salt_tolerant,thorny,poisonous_to_humans,poisonous_to_pets,invasive,edible_fruit,medicinal,hardiness_min,hardiness_max,is_perennial,attracts_birds,attracts_butterflies
498,Malabar spinach,['Basella alba'],plantes_grimpantes,moderate,full_sun,True,True,False,False,False,False,False,True,,,True,False,False
499,Ceylon spinach,"[""Basella rubra 'Red Stem'""]",plantes_grimpantes,moderate,full_sun,False,True,False,False,False,False,False,True,,,True,False,False
1220,carnation,['Dianthus AMERICAN PIE GEORGIA PEACH PIE'],fleurs,low,full_sun,False,True,False,False,False,False,False,False,,,True,False,False


In [739]:
# Malabar spinach
plants_selection.loc[498, 'hardiness_min'] = 7.0
plants_selection.loc[498, 'hardiness_max'] = 11.0

# Ceylon spinach
plants_selection.loc[499, 'hardiness_min'] = 9.0
plants_selection.loc[499, 'hardiness_max'] = 11.0

# carnation aka Dianthus American Pie 'Georgia Peach Pie'
plants_selection.loc[1220, 'hardiness_min'] = 5.0
plants_selection.loc[1220, 'hardiness_max'] = 9.0

In [740]:
plants_selection.isnull().sum()

common_name             0
scientific_name         0
type                    0
maintenance             0
sunlight                0
drought_tolerant        0
salt_tolerant           0
thorny                  0
poisonous_to_humans     0
poisonous_to_pets       0
invasive                0
edible_fruit            0
medicinal               0
hardiness_min           0
hardiness_max           0
is_perennial            0
attracts_birds          0
attracts_butterflies    0
dtype: int64

## Colonne Poisonous to pets/humans

In [741]:
# Dans l'algo de recommandation, on supprimera directement toutes les plantes toxiques pour les humains et les animaux de compagnie.

In [742]:
plants_selection['poisonous_to_humans'].value_counts(normalize=True)

poisonous_to_humans
False    0.960848
True     0.039152
Name: proportion, dtype: float64

In [743]:
plants_selection['poisonous_to_pets'].value_counts()

poisonous_to_pets
FALSE                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            1637
TRUE                                                                                                                                                                                                                                                                                                                                            

In [744]:
def clean_poisonous_to_pets(x):
    """Convert a raw 'poisonous_to_pets' value to a boolean.

    - Booleans are returned unchanged, so running the cleaning cell a second time
      does not turn every False into True.
    - The strings "TRUE" / "FALSE" (any case, surrounding spaces ignored) map to
      True / False.
    - Anything else maps to True: until those rows are checked by hand, it is safer
      to flag a plant as toxic than to let a pet get poisoned.
    """
    if isinstance(x, (bool, np.bool_)):
        return bool(x)
    if isinstance(x, str):
        normalized = x.strip().upper()
        if normalized == "TRUE":
            return True
        if normalized == "FALSE":
            return False
    # Unknown or free-text value: conservative default.
    return True
    
# Turn every raw 'poisonous_to_pets' value into a boolean.
plants_selection.loc[:, 'poisonous_to_pets'] = plants_selection['poisonous_to_pets'].apply(clean_poisonous_to_pets)

In [745]:
plants_selection['poisonous_to_pets'].value_counts(normalize=True) 

poisonous_to_pets
False    0.890158
True     0.109842
Name: proportion, dtype: float64

## Colonne "Invasive"

invasive plants : non-native plant that grows where you don't want it to and behaves in a way that makes it hard to control

rmq : ici envahissante du point de vue d'un jardinier américain, pas forcément le cas pour la France -> trouver une liste pour la France

In [746]:
# Dans algo de recommandation, on va exclure les plantes envahissantes par défaut
plants_selection['invasive'].value_counts()

invasive
False    1633
True      206
Name: count, dtype: int64

## Colonnes résistance sécheresse/sel

In [747]:
plants_selection['drought_tolerant'].value_counts()

drought_tolerant
False    974
True     865
Name: count, dtype: int64

In [748]:
plants_selection['salt_tolerant'].value_counts() # -> bord de mer

salt_tolerant
False    1022
True      817
Name: count, dtype: int64

## Colonnes sur lesquelles l'utilisateur pourra filtrer:

In [749]:
plants_selection.columns

Index(['common_name', 'scientific_name', 'type', 'maintenance', 'sunlight',
       'drought_tolerant', 'salt_tolerant', 'thorny', 'poisonous_to_humans',
       'poisonous_to_pets', 'invasive', 'edible_fruit', 'medicinal',
       'hardiness_min', 'hardiness_max', 'is_perennial', 'attracts_birds',
       'attracts_butterflies'],
      dtype='object')

In [750]:
plants_selection['medicinal'].value_counts() # Filtrer sur les plantes médicinale ou pas.

medicinal
False    1199
True      640
Name: count, dtype: int64

In [751]:
plants_selection['thorny'].value_counts() # Exclure les plantes à épines ou pas.

thorny
False    1639
True      200
Name: count, dtype: int64

In [752]:
def clean_edible_fruits(x):
    """Convert a raw 'edible_fruit' value to a boolean.

    - Booleans are returned unchanged, so running the cleaning cell a second time
      does not turn every True into False.
    - The strings "TRUE" / "FALSE" (any case, surrounding spaces ignored) map to
      True / False.
    - Anything else maps to False: it is safer to say a fruit is not edible than
      to let someone get poisoned.
    """
    if isinstance(x, (bool, np.bool_)):
        return bool(x)
    if isinstance(x, str) and x.strip().upper() == "TRUE":
        return True
    # "FALSE", unknown or free-text value: conservative default.
    return False
    
# Turn every raw 'edible_fruit' value into a boolean.
plants_selection.loc[:, 'edible_fruit'] = plants_selection['edible_fruit'].apply(clean_edible_fruits)

In [753]:
plants_selection['edible_fruit'].value_counts() # Filtrer sur plantes avec des fruits comestibles ou pas.

edible_fruit
False    1719
True      120
Name: count, dtype: int64

## Jeu de données propre

In [754]:
plants_selection.columns

Index(['common_name', 'scientific_name', 'type', 'maintenance', 'sunlight',
       'drought_tolerant', 'salt_tolerant', 'thorny', 'poisonous_to_humans',
       'poisonous_to_pets', 'invasive', 'edible_fruit', 'medicinal',
       'hardiness_min', 'hardiness_max', 'is_perennial', 'attracts_birds',
       'attracts_butterflies'],
      dtype='object')

In [755]:
plants_selection.isnull().sum()

common_name             0
scientific_name         0
type                    0
maintenance             0
sunlight                0
drought_tolerant        0
salt_tolerant           0
thorny                  0
poisonous_to_humans     0
poisonous_to_pets       0
invasive                0
edible_fruit            0
medicinal               0
hardiness_min           0
hardiness_max           0
is_perennial            0
attracts_birds          0
attracts_butterflies    0
dtype: int64

In [756]:
plants_selection.shape

(1839, 18)

In [757]:
plants_selection.to_csv('../data/processed/plants_clean_dataset.csv', index=False)