# Cleaning Practice
Let's first practice handling missing values and duplicate data using the `cancer_data_means.csv` file.

In [7]:
# Import pandas and load cancer_data_means.csv into the `cdm` DataFrame
import pandas as pd

cdm = pd.read_csv(filepath_or_buffer='cancer_data_means.csv')
# A bare trailing expression makes the notebook render the loaded frame
cdm

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave_points_mean,symmetry_mean,fractal_dimension_mean
0,842302,M,17.99,,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871
1,842517,M,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.05667
2,84300903,M,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999
3,84348301,M,11.42,20.38,77.58,386.1,,0.28390,0.24140,0.10520,0.2597,0.09744
4,84358402,M,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883
...,...,...,...,...,...,...,...,...,...,...,...,...
564,926424,M,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,0.05623
565,926682,M,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,0.05533
566,926954,M,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,0.05648
567,927241,M,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,0.07016


In [8]:
# Report, column by column, whether at least one value is missing
# (True means the column contains a NaN)
cdm.isnull().any(axis='index')

id                        False
diagnosis                 False
radius_mean               False
texture_mean               True
perimeter_mean            False
area_mean                 False
smoothness_mean            True
compactness_mean          False
concavity_mean            False
concave_points_mean       False
symmetry_mean              True
fractal_dimension_mean    False
dtype: bool

In [9]:
# Handle the missing values by dropping every column that contains at least one NaN.
# Note: this removes the flagged feature columns entirely (texture_mean,
# smoothness_mean, symmetry_mean) rather than filling in their missing entries.
cdm = cdm.dropna(axis='columns', how='any')
# Display the result to confirm the affected columns are gone
cdm

Unnamed: 0,id,diagnosis,radius_mean,perimeter_mean,area_mean,compactness_mean,concavity_mean,concave_points_mean,fractal_dimension_mean
0,842302,M,17.99,122.80,1001.0,0.27760,0.30010,0.14710,0.07871
1,842517,M,20.57,132.90,1326.0,0.07864,0.08690,0.07017,0.05667
2,84300903,M,19.69,130.00,1203.0,0.15990,0.19740,0.12790,0.05999
3,84348301,M,11.42,77.58,386.1,0.28390,0.24140,0.10520,0.09744
4,84358402,M,20.29,135.10,1297.0,0.13280,0.19800,0.10430,0.05883
...,...,...,...,...,...,...,...,...,...
564,926424,M,21.56,142.00,1479.0,0.11590,0.24390,0.13890,0.05623
565,926682,M,20.13,131.20,1261.0,0.10340,0.14400,0.09791,0.05533
566,926954,M,16.60,108.30,858.1,0.10230,0.09251,0.05302,0.05648
567,927241,M,20.60,140.10,1265.0,0.27700,0.35140,0.15200,0.07016


In [10]:
# Count the duplicated rows: duplicated() marks every repeat of an earlier row
# as True, so summing the boolean mask gives the number of duplicates
int(cdm.duplicated().sum())

7

In [11]:
# Remove duplicated rows, keeping the first occurrence of each
# (the surviving rows keep their original index labels)
cdm = cdm.drop_duplicates(keep='first')
cdm

Unnamed: 0,id,diagnosis,radius_mean,perimeter_mean,area_mean,compactness_mean,concavity_mean,concave_points_mean,fractal_dimension_mean
0,842302,M,17.99,122.80,1001.0,0.27760,0.30010,0.14710,0.07871
1,842517,M,20.57,132.90,1326.0,0.07864,0.08690,0.07017,0.05667
2,84300903,M,19.69,130.00,1203.0,0.15990,0.19740,0.12790,0.05999
3,84348301,M,11.42,77.58,386.1,0.28390,0.24140,0.10520,0.09744
4,84358402,M,20.29,135.10,1297.0,0.13280,0.19800,0.10430,0.05883
...,...,...,...,...,...,...,...,...,...
564,926424,M,21.56,142.00,1479.0,0.11590,0.24390,0.13890,0.05623
565,926682,M,20.13,131.20,1261.0,0.10340,0.14400,0.09791,0.05533
566,926954,M,16.60,108.30,858.1,0.10230,0.09251,0.05302,0.05648
567,927241,M,20.60,140.10,1265.0,0.27700,0.35140,0.15200,0.07016


In [12]:
# Confirm the correction: the number of duplicated rows should now be 0
int(cdm.duplicated().sum())

0

## Renaming Columns
Since we previously reduced our dataset to include only the means of the tumor features, the "_mean" suffix at the end of each feature name is unnecessary; it just takes extra time to type in our analysis later. Rename the columns of the dataframe to remove "_mean".

In [14]:
# Rename the columns of the dataframe: strip the trailing "_mean" suffix where present.
# The pattern is anchored with `$` so only a suffix is removed (a name containing
# "_mean" in the middle is left intact), and `regex=True` is passed explicitly
# because the default of `regex` differs between pandas versions.
cdm.columns = cdm.columns.str.replace(r'_mean$', '', regex=True)
# Display the frame to inspect the new column names
cdm

Unnamed: 0,id,diagnosis,radius,perimeter,area,compactness,concavity,concave_points,fractal_dimension
0,842302,M,17.99,122.80,1001.0,0.27760,0.30010,0.14710,0.07871
1,842517,M,20.57,132.90,1326.0,0.07864,0.08690,0.07017,0.05667
2,84300903,M,19.69,130.00,1203.0,0.15990,0.19740,0.12790,0.05999
3,84348301,M,11.42,77.58,386.1,0.28390,0.24140,0.10520,0.09744
4,84358402,M,20.29,135.10,1297.0,0.13280,0.19800,0.10430,0.05883
...,...,...,...,...,...,...,...,...,...
564,926424,M,21.56,142.00,1479.0,0.11590,0.24390,0.13890,0.05623
565,926682,M,20.13,131.20,1261.0,0.10340,0.14400,0.09791,0.05533
566,926954,M,16.60,108.30,858.1,0.10230,0.09251,0.05302,0.05648
567,927241,M,20.60,140.10,1265.0,0.27700,0.35140,0.15200,0.07016


In [15]:
# Show the first ten rows to confirm the renamed columns
cdm.iloc[:10]

Unnamed: 0,id,diagnosis,radius,perimeter,area,compactness,concavity,concave_points,fractal_dimension
0,842302,M,17.99,122.8,1001.0,0.2776,0.3001,0.1471,0.07871
1,842517,M,20.57,132.9,1326.0,0.07864,0.0869,0.07017,0.05667
2,84300903,M,19.69,130.0,1203.0,0.1599,0.1974,0.1279,0.05999
3,84348301,M,11.42,77.58,386.1,0.2839,0.2414,0.1052,0.09744
4,84358402,M,20.29,135.1,1297.0,0.1328,0.198,0.1043,0.05883
5,843786,M,12.45,82.57,477.1,0.17,0.1578,0.08089,0.07613
6,844359,M,18.25,119.6,1040.0,0.109,0.1127,0.074,0.05742
7,84458202,M,13.71,90.2,577.9,0.1645,0.09366,0.05985,0.07451
8,844981,M,13.0,87.5,519.8,0.1932,0.1859,0.09353,0.07389
9,84501001,M,12.46,83.97,475.9,0.2396,0.2273,0.08543,0.08243


In [9]:
# Save the cleaned frame for later use as "cancer_data_edited.csv".
# index=False keeps the row labels out of the file; otherwise they would be
# written as an extra unnamed column, and reading the file back with
# pd.read_csv would add a fresh default index on top of that column.
cdm.to_csv(path_or_buf='cancer_data_edited.csv', index=False)