# Task 1

Create a function to rename DataFrame columns using dict_names.  
Summarize cleaned data and annotate observations (e.g., outliers, variability). 

**Inputs**: df, dict_names  
**Outputs**: renamed_df, summary notes 

In [2]:
import pandas as pd

In [3]:
# Create a function to rename the DataFrame columns
def rename_col (df, dict_names):
    return df.rename(columns=dict_names)

In [4]:
# Import the dataframe
df = pd.read_csv('breast_cancer.csv')
df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,diagnosis
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


In [5]:
# Create the dictionary dict_names to later use it in the rename_col function
dict_names = dict(zip(df.columns, df.columns.str.replace(" ", "_")))

renamed_df = rename_col(df, dict_names)
renamed_df.head()

Unnamed: 0,mean_radius,mean_texture,mean_perimeter,mean_area,mean_smoothness,mean_compactness,mean_concavity,mean_concave_points,mean_symmetry,mean_fractal_dimension,...,worst_texture,worst_perimeter,worst_area,worst_smoothness,worst_compactness,worst_concavity,worst_concave_points,worst_symmetry,worst_fractal_dimension,diagnosis
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


In [6]:
# See if any of the columns has any NAN values
renamed_df.isna().sum()

mean_radius                0
mean_texture               0
mean_perimeter             0
mean_area                  0
mean_smoothness            0
mean_compactness           0
mean_concavity             0
mean_concave_points        0
mean_symmetry              0
mean_fractal_dimension     0
radius_error               0
texture_error              0
perimeter_error            0
area_error                 0
smoothness_error           0
compactness_error          0
concavity_error            0
concave_points_error       0
symmetry_error             0
fractal_dimension_error    0
worst_radius               0
worst_texture              0
worst_perimeter            0
worst_area                 0
worst_smoothness           0
worst_compactness          0
worst_concavity            0
worst_concave_points       0
worst_symmetry             0
worst_fractal_dimension    0
diagnosis                  0
dtype: int64

In [7]:
# Create a function to remove outliers, using the IQR
def remove_outliers_iqr(df, cols=None, k=1.5):
    if cols is None:
        cols = df.select_dtypes(include="number").columns
    
    df_clean = df.copy()
    for c in cols:
        q1 = df_clean[c].quantile(0.25)
        q3 = df_clean[c].quantile(0.75)
        iqr = q3 - q1
        lower = q1 - k * iqr
        upper = q3 + k * iqr
        df_clean = df_clean[(df_clean[c] >= lower) & (df_clean[c] <= upper)]
    return df_clean

renamed_df = remove_outliers_iqr(renamed_df)

In [8]:
# Use the pandas function describe to get a quick calculation of the descriptive statistics of each column of the clean dataframe
renamed_df.describe(include='all')

Unnamed: 0,mean_radius,mean_texture,mean_perimeter,mean_area,mean_smoothness,mean_compactness,mean_concavity,mean_concave_points,mean_symmetry,mean_fractal_dimension,...,worst_texture,worst_perimeter,worst_area,worst_smoothness,worst_compactness,worst_concavity,worst_concave_points,worst_symmetry,worst_fractal_dimension,diagnosis
count,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,...,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0
mean,12.505298,17.54298,80.199412,488.798039,0.090762,0.071646,0.036516,0.023986,0.170145,0.060838,...,23.34051,89.459451,592.941961,0.123973,0.16802,0.147706,0.074442,0.270538,0.076289,1.0
std,1.615598,3.754894,10.733314,124.840357,0.011387,0.024261,0.023203,0.013444,0.019527,0.004421,...,5.296252,12.22215,152.043038,0.017608,0.068972,0.091017,0.03185,0.037887,0.009284,0.0
min,8.196,9.71,51.71,201.9,0.06251,0.01938,0.0,0.0,0.1215,0.05185,...,12.02,57.26,242.2,0.08125,0.03432,0.0,0.0,0.1783,0.05521,1.0
25%,11.465,14.975,73.625,403.2,0.082845,0.05288,0.01977,0.01486,0.1555,0.05737,...,19.63,82.685,495.15,0.1109,0.11475,0.08046,0.055595,0.24335,0.06937,1.0
50%,12.54,17.18,80.45,481.9,0.08946,0.06945,0.03346,0.02257,0.1687,0.06048,...,22.74,88.52,580.9,0.1234,0.165,0.1384,0.07431,0.2685,0.07623,1.0
75%,13.64,19.185,87.76,571.45,0.098795,0.085255,0.04919,0.030435,0.18455,0.063725,...,26.435,97.93,688.75,0.1353,0.21185,0.1936,0.093505,0.29825,0.081985,1.0
max,16.84,29.43,108.4,880.2,0.1237,0.1661,0.1191,0.07064,0.2238,0.07633,...,40.54,120.3,1032.0,0.1733,0.3627,0.4341,0.1599,0.3849,0.1017,1.0
