# Cleaning Practice
Let's first practice handling missing values and duplicate data using the `cancer_data_means.csv` file.

In [1]:
#import libraries and assign cancer_data_means.csv to a variable 
import pandas as pd
df_cancer = pd.read_csv('cancer_data_means.csv')


In [2]:
# check which columns have missing values
missing_values = df_cancer.isnull().sum()

texture_mean       21
smoothness_mean    48
symmetry_mean      65
dtype: int64


In [3]:
# confirm your correction
print(missing_values[missing_values > 0])

texture_mean       21
smoothness_mean    48
symmetry_mean      65
dtype: int64


In [4]:
# how many duplicates are there ?
num_duplicates = df_cancer.duplicated().sum()
print(f"Number of duplicatees : {num_duplicates}")

Number of duplicatees : 5


In [5]:
# drop duplicates
df_cancer = df_cancer.drop_duplicates()

In [6]:
# confirm correction by rechecking for duplicates in the data
num_duplicates_after = df_cancer.duplicated().sum()
print(f"Number of duplicates in data after dropping: {num_duplicates_after}")

Number of duplicates in data after dropping: 0


## Renaming Columns
Since we also previously changed our dataset to only include means of tumor features, the "_mean" at the end of each feature seems unnecessary. It just takes extra time to type in our analysis later. Rename the columns of the dataframe to remove "_mean".

In [7]:
# rename the columns of the dataframe (remove _mean from the name of each column if found)
df_cancer.columns = [col.replace('_mean', '') for col in df_cancer.columns]

In [8]:
# display first few rows of the dataframe to confirm changes
print(df_cancer.head())

         id diagnosis  radius  texture  perimeter    area  smoothness  \
0    842302         M   17.99      NaN     122.80  1001.0     0.11840   
1    842517         M   20.57    17.77     132.90  1326.0     0.08474   
2  84300903         M   19.69    21.25     130.00  1203.0     0.10960   
3  84348301         M   11.42    20.38      77.58   386.1         NaN   
4  84358402         M   20.29    14.34     135.10  1297.0     0.10030   

   compactness  concavity  concave_points  symmetry  fractal_dimension  
0      0.27760     0.3001         0.14710    0.2419            0.07871  
1      0.07864     0.0869         0.07017    0.1812            0.05667  
2      0.15990     0.1974         0.12790    0.2069            0.05999  
3      0.28390     0.2414         0.10520    0.2597            0.09744  
4      0.13280     0.1980         0.10430    0.1809            0.05883  


In [9]:
# save this for later as a csv file named "cancer_data_edited.csv" and set indexing to false (why ??)
df_cancer.to_csv('cancer_data_edited.csv', index=False)