# Cleaning Practice
Let's first practice handling missing values and duplicate data using the `cancer_data_means.csv` file.

In [2]:
# import pandas and load cancer data

# check which columns have missing values

import pandas as pd

df = pd.read_csv('cancer_data_means.csv')

cols = df.columns[df.isna().any()].tolist()

print(cols)

['texture_mean', 'smoothness_mean', 'symmetry_mean']


In [3]:
# use the mean to fill in missing values
mean = df[cols].mean()

df.loc[:, cols] = df[cols].fillna(mean)
# confirm your correction 
df[cols].fillna(mean)

Unnamed: 0,texture_mean,smoothness_mean,symmetry_mean
0,19.293431,0.118400,0.2419
1,17.770000,0.084740,0.1812
2,21.250000,0.109600,0.2069
3,20.380000,0.096087,0.2597
4,14.340000,0.100300,0.1809
...,...,...,...
564,22.390000,0.111000,0.1726
565,28.250000,0.097800,0.1752
566,28.080000,0.084550,0.1590
567,29.330000,0.117800,0.2397


In [32]:
# how many duplicates are there ?
sum(df.duplicated())

5

In [33]:
# drop duplicates
df.drop_duplicates(inplace = True)

In [34]:
# confirm correction by rechecking for duplicates in the data
sum(df.duplicated())

0

## Renaming Columns
Since we also previously changed our dataset to only include means of tumor features, the "_mean" at the end of each feature seems unnecessary. It just takes extra time to type in our analysis later. Rename the columns of the dataframe to remove "_mean".

In [53]:
# rename the columns of the dataframe (remove _mean from the name of each column if found)
df.rename(columns = lambda x: x.replace('_mean', ''), inplace=True)
df.info

<bound method DataFrame.info of            id diagnosis  radius  texture  perimeter    area  smoothness  \
0      842302         M   17.99      NaN     122.80  1001.0     0.11840   
1      842517         M   20.57    17.77     132.90  1326.0     0.08474   
2    84300903         M   19.69    21.25     130.00  1203.0     0.10960   
3    84348301         M   11.42    20.38      77.58   386.1         NaN   
4    84358402         M   20.29    14.34     135.10  1297.0     0.10030   
..        ...       ...     ...      ...        ...     ...         ...   
564    926424         M   21.56    22.39     142.00  1479.0     0.11100   
565    926682         M   20.13    28.25     131.20  1261.0     0.09780   
566    926954         M   16.60    28.08     108.30   858.1     0.08455   
567    927241         M   20.60    29.33     140.10  1265.0     0.11780   
568     92751         B    7.76    24.54      47.92   181.0     0.05263   

     compactness  concavity  concave_points  symmetry  fractal_dime

In [6]:
# display first few rows of the dataframe to confirm changes
df.head(3)

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave_points_mean,symmetry_mean,fractal_dimension_mean
0,842302,M,17.99,19.293431,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999


In [7]:
# save this for later as a csv file named "cancer_data_edited.csv" and set indexing to false (why ??)
df.to_csv('cancer_data_edited.csv', index = False)

# to avoid printing the index to csv