In [37]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import io
import requests

# so that we can see all the columns
pd.set_option('display.max_columns', None) 

# how to read a csv file from a github account
url_name = 'https://raw.githubusercontent.com/akmand/datasets/master/breast_cancer_wisconsin.csv'
url_content = requests.get(url_name, verify=False).content
df = pd.read_csv(io.StringIO(url_content.decode('utf-8')))

In [38]:
df.shape

(569, 31)

In [39]:
df.columns.to_list()

['mean_radius',
 'mean_texture',
 'mean_perimeter',
 'mean_area',
 'mean_smoothness',
 'mean_compactness',
 'mean_concavity',
 'mean_concave_points',
 'mean_symmetry',
 'mean_fractal_dimension',
 'radius_error',
 'texture_error',
 'perimeter_error',
 'area_error',
 'smoothness_error',
 'compactness_error',
 'concavity_error',
 'concave_points_error',
 'symmetry_error',
 'fractal_dimension_error',
 'worst_radius',
 'worst_texture',
 'worst_perimeter',
 'worst_area',
 'worst_smoothness',
 'worst_compactness',
 'worst_concavity',
 'worst_concave_points',
 'worst_symmetry',
 'worst_fractal_dimension',
 'diagnosis']

In [40]:
df.iloc[:, -5:].sample(n=10, random_state=8)

Unnamed: 0,worst_concavity,worst_concave_points,worst_symmetry,worst_fractal_dimension,diagnosis
325,0.102,0.05602,0.2688,0.06888,B
557,0.0,0.0,0.2475,0.06969,B
475,0.3476,0.09783,0.3006,0.07802,B
308,0.01379,0.0221,0.2267,0.06192,B
553,0.07993,0.02564,0.2435,0.07393,B
159,0.01854,0.03953,0.2738,0.07685,B
290,0.222,0.1021,0.2272,0.08799,B
146,0.4504,0.1865,0.5774,0.103,M
431,0.2403,0.0737,0.2556,0.09359,B
527,0.1791,0.107,0.311,0.07592,B


In [41]:
# define a simple data frame with name and date of birth (DOB) of three individuals
df_age_example = pd.DataFrame(data={'name': ['Michael', 'John', 'Emily'], 
                                    'DOB':['28-01-1988', '19-08-2001', '23-04-2002']})
df_age_example

Unnamed: 0,name,DOB
0,Michael,28-01-1988
1,John,19-08-2001
2,Emily,23-04-2002


In [42]:
# use the pd.to_datetime() function to convert the DOB column to Pandas' datetime format
df_age_example['DOB'] = pd.to_datetime(df_age_example['DOB'], dayfirst=True)

# extract the year information from DOB column
# for more information, please refer to the documentation on the pd.to_datetime() function
df_age_example['DOB_year'] = df_age_example['DOB'].dt.year

df_age_example

Unnamed: 0,name,DOB,DOB_year
0,Michael,1988-01-28,1988
1,John,2001-08-19,2001
2,Emily,2002-04-23,2002


In [43]:
# finally determine the age of each individual
current_year = 2023
df_age_example['age'] = current_year - df_age_example['DOB_year']
df_age_example

Unnamed: 0,name,DOB,DOB_year,age
0,Michael,1988-01-28,1988,35
1,John,2001-08-19,2001,22
2,Emily,2002-04-23,2002,21


In [44]:
df.isna().sum()

mean_radius                0
mean_texture               0
mean_perimeter             0
mean_area                  0
mean_smoothness            0
mean_compactness           0
mean_concavity             0
mean_concave_points        0
mean_symmetry              0
mean_fractal_dimension     0
radius_error               0
texture_error              0
perimeter_error            0
area_error                 0
smoothness_error           0
compactness_error          0
concavity_error            0
concave_points_error       0
symmetry_error             0
fractal_dimension_error    0
worst_radius               0
worst_texture              0
worst_perimeter            0
worst_area                 0
worst_smoothness           0
worst_compactness          0
worst_concavity            0
worst_concave_points       0
worst_symmetry             0
worst_fractal_dimension    0
diagnosis                  0
dtype: int64

In [45]:
df_cat = df.copy()
df_cat['mean_area'] = pd.qcut(df_cat['mean_area'], 
                              q=3, 
                              labels=['small', 'average', 'large'])

In [46]:
df_cat['mean_area'].value_counts()

# difference in count() and value_counts(), as count() will give data for every single col and value_count
# and value_count will give count of accourance, 


mean_area
small      190
large      190
average    189
Name: count, dtype: int64

## One Hot Endcoding


In [54]:
df_cat.iloc[0:5, [1,2,3,4,5,30]]

Unnamed: 0,mean_texture,mean_perimeter,mean_area,mean_smoothness,mean_compactness,diagnosis
0,10.38,122.8,large,0.1184,0.2776,M
1,17.77,132.9,large,0.08474,0.07864,M
2,21.25,130.0,large,0.1096,0.1599,M
3,20.38,77.58,small,0.1425,0.2839,M
4,14.34,135.1,large,0.1003,0.1328,M


In [55]:
df_cat_onehot = pd.get_dummies(df_cat, columns=['mean_area'])

df_cat_onehot.iloc[:, -5:].head(5)

Unnamed: 0,worst_fractal_dimension,diagnosis,mean_area_small,mean_area_average,mean_area_large
0,0.1189,M,False,False,True
1,0.08902,M,False,False,True
2,0.08758,M,False,False,True
3,0.173,M,True,False,False
4,0.07678,M,False,False,True


## Integer Encoding

In [56]:
level_mapping = {'small': 0, 'average': 1, 'large': 2}

In [57]:
df_cat_integer = df_cat.copy()

df_cat_integer['mean_area'] = df_cat_integer['mean_area'].replace(level_mapping)

df_cat_integer.iloc[0:5, [1,2,3,4,5,30]]

Unnamed: 0,mean_texture,mean_perimeter,mean_area,mean_smoothness,mean_compactness,diagnosis
0,10.38,122.8,2,0.1184,0.2776,M
1,17.77,132.9,2,0.08474,0.07864,M
2,21.25,130.0,2,0.1096,0.1599,M
3,20.38,77.58,0,0.1425,0.2839,M
4,14.34,135.1,2,0.1003,0.1328,M


In [58]:
df_cat_integer['mean_area'].dtype

CategoricalDtype(categories=[0, 1, 2], ordered=True)