In [42]:
import pandas as pd
import numpy as np
import re

In [43]:
tree_inv = pd.read_csv('Tree_Inventory_20240929.csv')

tree_inv.shape

(62274, 6)

In [44]:
tree_inv.head()

Unnamed: 0,GEOMETRY,SPECIES,DIAMETER,LATITUDE,LONGTITUDE,New Georeferenced Column
0,POINT (-97.73398904092146 30.25239671647407),Live Oak,20.0,30.252397,-97.733989,POINT (-97.7339890409 30.2523967165)
1,POINT (-97.77949770772025 30.20704479399738),Ashe Juniper,8.0,30.20705,-97.779505,POINT (-97.7795053399 30.2070499964)
2,POINT (-97.73408544155564 30.252389392695715),Live Oak,31.0,30.252389,-97.734085,POINT (-97.7340854416 30.2523893927)
3,POINT (-97.73392463841283 30.252384583450418),Live Oak,21.0,30.252385,-97.733925,POINT (-97.7339246384 30.2523845835)
4,POINT (-97.73385944263053 30.252313660255723),Live Oak,20.0,30.252314,-97.733859,POINT (-97.7338594426 30.2523136603)


In [45]:
tree_inv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62274 entries, 0 to 62273
Data columns (total 6 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   GEOMETRY                  62274 non-null  object 
 1   SPECIES                   62210 non-null  object 
 2   DIAMETER                  62274 non-null  float64
 3   LATITUDE                  62274 non-null  float64
 4   LONGTITUDE                62274 non-null  float64
 5   New Georeferenced Column  62274 non-null  object 
dtypes: float64(3), object(3)
memory usage: 2.9+ MB


In [46]:
tree_inv['SPECIES'].fillna('Unknown', inplace=True)
tree_inv.rename(columns={'LONGTITUDE': 'LONGITUDE'}, inplace=True)

def get_long_lat(geometry):
    pattern = r'POINT \((-?\d+\.\d+) (-?\d+\.\d+)\)'
    grouping = re.fullmatch(pattern, geometry)
    return pd.Series([float(grouping.group(1)), float(grouping.group(2))])

tree_inv[['LONGITUDE', 'LATITUDE']] = tree_inv['GEOMETRY'].apply(get_long_lat)

tree_inv = tree_inv[['SPECIES', 'DIAMETER', 'LONGITUDE', 'LATITUDE']]

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  tree_inv['SPECIES'].fillna('Unknown', inplace=True)


In [47]:
tree_inv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62274 entries, 0 to 62273
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   SPECIES    62274 non-null  object 
 1   DIAMETER   62274 non-null  float64
 2   LONGITUDE  62274 non-null  float64
 3   LATITUDE   62274 non-null  float64
dtypes: float64(3), object(1)
memory usage: 1.9+ MB


In [48]:
tree_inv['SPECIES'] = tree_inv['SPECIES'].apply(str.lower)

with open('repeat_species.txt', 'r') as f:
    new_name_dict = {}
    for l in f.readlines():
        split1 = l.strip().split('=')
        new_name = split1[0]
        # print(new_name)
        split_old_names = split1[1].split(';')
        # print(split_old_names)
        for on in split_old_names:
            new_name_dict[on] = new_name

In [50]:
# For new data set
tree_inv['SPECIES'] = tree_inv['SPECIES'].apply(lambda x: new_name_dict.get(x, x).title())
tree_inv['SPECIES'].replace('Unknown', np.nan)
species_to_be_removed = ['Vacant Site', 'Stump', 'Tbd', 'Tbd Shade', 'Tbd Ornamental', 'Tbd - Shade', 'Tbd - Ornamental']
tree_inv = tree_inv[~tree_inv['SPECIES'].isin(species_to_be_removed)]
tree_inv.to_csv('tree_inventory.csv', index=False)