In [None]:
import pandas as pd

**Loading the data**

In [None]:
# Read the 2015 street tree census CSV from the Kaggle input directory into a DataFrame.
tree = pd.read_csv(
    filepath_or_buffer="/kaggle/input/2015-street-tree-census/2015_Street_Tree_Census_-_Tree_Data.csv"
)

**Extending the limit to see all columns**

In [None]:
# Remove the cap on displayed columns so wide frames render every column.
pd.options.display.max_columns = None

**First 5 entries**

In [None]:
# Preview the first five rows of the raw frame.
tree.head(n=5)

**Columns present in the dataset**

In [None]:
# Show the column labels of the raw frame.
tree.columns

**Selecting the columns needed for data exploration and data cleaning**

In [None]:
# Keep only the columns used for exploration and cleaning.
# `.copy()` makes `tree_subset` an independent frame: later cells modify it in
# place, and doing that on a plain slice of `tree` raises SettingWithCopyWarning
# and leaves it unclear whether `tree` itself is affected.
tree_subset = tree[['tree_id', 'tree_dbh', 'stump_diam', 'curb_loc', 'status', 'health', 'spc_latin',
                    'steward', 'sidewalk', 'problems', 'root_stone', 'root_grate', 'root_other',
                    'trunk_wire', 'trnk_light', 'trnk_other', 'brch_light', 'brch_shoe', 'brch_other']].copy()
tree_subset

**Calculating the null values present**

In [None]:
# Number of missing values in each column.
tree_subset.isnull().sum()

**Analyzing the descriptive stats**

In [None]:
# Summary statistics, with the default quartiles spelled out explicitly.
tree_subset.describe(percentiles=[0.25, 0.5, 0.75])

**A look at how the data is organised where *health* is missing**

In [None]:
# Rows whose `health` value is missing.
tree_subset.loc[tree_subset['health'].isnull()]

**Distribution of values in column *health***

In [None]:
# Frequency of each `health` category (missing values are not counted).
tree_subset.loc[:, 'health'].value_counts()

**Plotting histograms of the numeric columns**

In [None]:
# One histogram per numeric column, 60 bins each, on a 20x10 figure.
tree_subset.hist(figsize=(20, 10), bins=60)

**Filtering out the trees with diameter greater than 50**

In [None]:
# Trees whose `tree_dbh` exceeds 50.
big_trees = tree_subset.loc[tree_subset['tree_dbh'].gt(50)]
big_trees

**Spotting the outliers using a scatter plot**

In [None]:
# Scatter of `tree_dbh` against `tree_id` for the large trees, to make extreme values stand out.
big_trees[['tree_id', 'tree_dbh']].plot.scatter(x='tree_id', y='tree_dbh', figsize=(20, 10))

**Visualizing the distribution of latin names of the trees**

In [None]:
# Bar chart of how many trees carry each Latin species name.
tree_subset['spc_latin'].value_counts().to_frame().plot.bar(figsize=(20, 10))

**Distribution of values in column *steward***

In [None]:
# Frequency of each `steward` category (missing values are not counted).
tree_subset.loc[:, 'steward'].value_counts()

**Distribution of values in column *sidewalk***

In [None]:
# Frequency of each `sidewalk` category (missing values are not counted).
tree_subset.loc[:, 'sidewalk'].value_counts()

**Distribution of values in column *curb_loc***

In [None]:
# Frequency of each `curb_loc` category (missing values are not counted).
tree_subset.loc[:, 'curb_loc'].value_counts()

**Filtering out the trees where status = stump**

In [None]:
# Rows recorded with status 'Stump'.
stumps = tree_subset.loc[tree_subset['status'].eq('Stump')]
stumps

**Filtering out the trees where status = dead**

In [None]:
# Rows recorded with status 'Dead'.
dead = tree_subset.loc[tree_subset['status'].eq('Dead')]
dead

**Filtering out the necessary columns to analyze the problems**

In [None]:
# The nine root / trunk / branch problem indicator columns, in their original order.
tree_problems = tree_subset.loc[
    :,
    [
        'root_stone', 'root_grate', 'root_other',
        'trunk_wire', 'trnk_light', 'trnk_other',
        'brch_light', 'brch_shoe', 'brch_other',
    ],
]
tree_problems

**Distribution of values in *tree_problems***

In [None]:
# Count the occurrences of each value, column by column, across the problem indicators.
tree_problems.apply(lambda column: column.value_counts())

**Mask to extract the values where status is either stump or dead**

In [None]:
# Boolean row selector: True where the status is 'Stump' or 'Dead'.
mask = tree_subset['status'].isin(['Stump', 'Dead'])

**Filling the null values with *Not Applicable* where conditions meet *mask***

In [None]:
# For the rows selected by `mask`, replace every missing value (in any column)
# with the string 'Not Applicable'; rows outside `mask` are left untouched.
tree_subset.loc[mask] = tree_subset.loc[mask].fillna('Not Applicable')

**Double checking the changes made**

In [None]:
# Re-inspect the 'Stump' rows after the fill.
tree_subset.loc[tree_subset['status'].eq('Stump')]

**Re-evaluating the null values present**

In [None]:
# Missing values still present in each column.
tree_subset.isnull().sum(axis=0)

**Filling the null values with the appropriate values to the corresponding columns**

In [None]:
# Fill the remaining nulls with one default value per column.
# A single DataFrame-level `fillna` with a column -> value mapping replaces the
# per-column `tree_subset[col].fillna(..., inplace=True)` calls. Those act on a
# temporary Series (chained assignment): recent pandas versions warn about them,
# and with Copy-on-Write enabled they leave `tree_subset` unchanged.
tree_subset = tree_subset.fillna({
    'problems': 'None',
    'health': 'Good',
    'spc_latin': 'No Observation',
    'sidewalk': 'NoDamage',
    'steward': '1or2',
})

**Re-evaluating the null values present**

In [None]:
# Final check of the missing values left in each column.
tree_subset.isnull().sum()

**Selecting the trees with diameter > 60 or stump diameter > 60**

In [None]:
# Trees whose trunk diameter or stump diameter exceeds 60.
# The result is bound to `big_trees` (the original assigned to the misspelled
# `big_tress`), so the frame displayed below is this selection and not the
# earlier "> 50" selection.
big_trees = tree_subset[(tree_subset['tree_dbh'] > 60) | (tree_subset['stump_diam'] > 60)]
big_trees

**Keeping only the trees where diameter <= 60 and stump diameter <= 60**

In [None]:
# Keep only rows where both the trunk diameter and the stump diameter are at most 60.
tree_subset = tree_subset.loc[
    tree_subset['tree_dbh'].le(60) & tree_subset['stump_diam'].le(60)
]
tree_subset

**Splitting the trees into those with status = alive and those with status = dead or stump**

In [None]:
# Split the cleaned frame by status: living trees versus dead trees and stumps.
tree_subset_alive = tree_subset.loc[tree_subset['status'].eq('Alive')]
tree_subset_dead_or_stump = tree_subset.loc[tree_subset['status'].isin(['Dead', 'Stump'])]

**Analyzing the descriptive stats according to trees' latin names**

In [None]:
# Per-species summary statistics of the trunk diameter for living trees.
tree_subset_alive.groupby(by='spc_latin')['tree_dbh'].describe()

**Filtering out the necessary columns**

In [None]:
# Per-species lower and upper quartiles of the trunk diameter, with the species
# name moved from the index into a regular column.
stats_alive = (
    tree_subset_alive
    .groupby('spc_latin')['tree_dbh']
    .describe()[['25%', '75%']]
    .reset_index()
)
stats_alive

**Merging *tree_subset_alive* with *stats_alive***

In [None]:
# Attach each species' quartiles to every living-tree row (left join on the species name).
tree_subset_alive = pd.merge(tree_subset_alive, stats_alive, how='left', on='spc_latin')
tree_subset_alive

**Replacing the *tree diameter* with *25%* value where *diameter < 25%* and with *75%* value where *diameter > 75%***

In [None]:
# Clamp every living tree's diameter into its species' interquartile range:
# values below the row's '25%' become '25%', values above '75%' become '75%'.
# `clip` with row-aligned Series bounds replaces the two masked `.loc`
# assignments. It builds a new column instead of writing float quartiles into
# the existing `tree_dbh` column (which relies on implicit dtype upcasting), and
# it no longer rebinds the global `mask` used by the stump/dead fill cell.
tree_subset_alive['tree_dbh'] = tree_subset_alive['tree_dbh'].clip(
    lower=tree_subset_alive['25%'],
    upper=tree_subset_alive['75%'],
)

**Double checking the changes made**

In [None]:
# Display the living-tree frame to confirm the diameters were replaced.
tree_subset_alive