# Pandas

### Imports

In [1]:
import numpy as np
import pandas as pd

### Series
A Series is very similar to a NumPy array. The difference is that a Series can carry labels, so in addition to integer positions we can also use labels to access the data.

#### Create a series

In [2]:
# Index labels and the matching values used to build the example Series
labels = list('ABCD')
ar = np.array((100, 200, 300, 400))

In [3]:
series = pd.Series(data=ar) # No label

In [4]:
series

0    100
1    200
2    300
3    400
dtype: int32

In [5]:
series[1]

200

In [6]:
series = pd.Series(data=ar, index=labels)

In [7]:
series

A    100
B    200
C    300
D    400
dtype: int32

In [8]:
series['B']

200

In [9]:
# Positional access still works on a labelled Series, but it has to go through
# .iloc: a plain series[1] on a non-integer index relies on a deprecated
# fallback (FutureWarning since pandas 2.1) and is treated as a label lookup
# in later versions.
series.iloc[1]

200

In [10]:
# Scores keyed by student name
d = dict(Michael=75, John=80, Daniel=90)

In [11]:
ser1 = pd.Series(d)

In [12]:
ser1

Michael    75
John       80
Daniel     90
dtype: int64

#### Adding series

In [13]:
# Second set of scores, built from a list of values plus an explicit index
ser2 = pd.Series(data=[80, 90, 85], index=["Michael", "John", "Arthur"])

In [14]:
ser2

Michael    80
John       90
Arthur     85
dtype: int64

In [15]:
ser1 + ser2
# Addition aligns the two Series on their index labels.
# A label present in only one of them has no match, so its result is
# NaN (Not a Number) and the whole result is upcast to float.

Arthur       NaN
Daniel       NaN
John       170.0
Michael    155.0
dtype: float64

In [16]:
# Prevent NaN: a label missing from one Series is treated as fill_value
# (40 here) before the addition
ser1.add(ser2, fill_value=40)

Arthur     125.0
Daniel     130.0
John       170.0
Michael    155.0
dtype: float64

#### Conditional Filtering

In [17]:
ser3 = ser1.add(ser2, fill_value=0)

In [18]:
ser3

Arthur      85.0
Daniel      90.0
John       170.0
Michael    155.0
dtype: float64

In [19]:
ser3[ser3 > 100]

John       170.0
Michael    155.0
dtype: float64

### DataFrame

In [20]:
# Draw the values from a seeded generator so that Restart & Run All reproduces
# the same table; an unseeded np.random.randn gives different numbers each run.
df = pd.DataFrame(
    np.random.default_rng(seed=42).standard_normal((6, 5)),
    index=['A', 'B', 'C', 'D', 'E', 'F'], # Row labels
    columns=['P', 'Q', 'R', 'S', 'T'] # Columns
)

In [21]:
df # Sequence of Series objects that share the same row index

Unnamed: 0,P,Q,R,S,T
A,-0.806695,-1.27273,2.134054,0.413712,-0.193174
B,-0.003349,1.806365,1.183115,1.506056,-0.56872
C,-0.28946,-0.255137,0.065503,-0.767893,-0.370247
D,2.833105,-0.203838,0.419051,1.952468,0.733121
E,-1.182193,-0.864743,-0.254925,-0.081189,0.703674
F,-0.210428,-0.234585,-0.461907,1.181142,0.622251


In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6 entries, A to F
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   P       6 non-null      float64
 1   Q       6 non-null      float64
 2   R       6 non-null      float64
 3   S       6 non-null      float64
 4   T       6 non-null      float64
dtypes: float64(5)
memory usage: 288.0+ bytes


In [23]:
df.describe()

Unnamed: 0,P,Q,R,S,T
count,6.0,6.0,6.0,6.0,6.0
mean,0.05683,-0.170778,0.514148,0.700716,0.154484
std,1.427132,1.05993,0.982455,1.029254,0.59573
min,-1.182193,-1.27273,-0.461907,-0.767893,-0.56872
25%,-0.677387,-0.712341,-0.174818,0.042537,-0.325979
50%,-0.249944,-0.244861,0.242277,0.797427,0.214539
75%,-0.055119,-0.211524,0.992099,1.424828,0.683319
max,2.833105,1.806365,2.134054,1.952468,0.733121


In [24]:
# With many columns the summary reads better transposed:
# one row per column, one column per statistic
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
P,6.0,0.05683,1.427132,-1.182193,-0.677387,-0.249944,-0.055119,2.833105
Q,6.0,-0.170778,1.05993,-1.27273,-0.712341,-0.244861,-0.211524,1.806365
R,6.0,0.514148,0.982455,-0.461907,-0.174818,0.242277,0.992099,2.134054
S,6.0,0.700716,1.029254,-0.767893,0.042537,0.797427,1.424828,1.952468
T,6.0,0.154484,0.59573,-0.56872,-0.325979,0.214539,0.683319,0.733121


In [25]:
# Select a column
df['P']

A   -0.806695
B   -0.003349
C   -0.289460
D    2.833105
E   -1.182193
F   -0.210428
Name: P, dtype: float64

In [26]:
# Select multiple columns
df[['P', 'Q', 'T']]

Unnamed: 0,P,Q,T
A,-0.806695,-1.27273,-0.193174
B,-0.003349,1.806365,-0.56872
C,-0.28946,-0.255137,-0.370247
D,2.833105,-0.203838,0.733121
E,-1.182193,-0.864743,0.703674
F,-0.210428,-0.234585,0.622251


In [27]:
# Create new column
df['U'] = df['P'] + df['T']
# Assigning to a column name that does not exist yet creates that column.
# U holds the row-by-row sum of P and T.

In [28]:
df

Unnamed: 0,P,Q,R,S,T,U
A,-0.806695,-1.27273,2.134054,0.413712,-0.193174,-0.999869
B,-0.003349,1.806365,1.183115,1.506056,-0.56872,-0.57207
C,-0.28946,-0.255137,0.065503,-0.767893,-0.370247,-0.659708
D,2.833105,-0.203838,0.419051,1.952468,0.733121,3.566226
E,-1.182193,-0.864743,-0.254925,-0.081189,0.703674,-0.478519
F,-0.210428,-0.234585,-0.461907,1.181142,0.622251,0.411824


In [29]:
# Modify column: assigning to an existing column name overwrites its values
# NOTE: this cell is not idempotent - every re-run squares U again
df['U'] = df['U'] ** 2

In [30]:
df

Unnamed: 0,P,Q,R,S,T,U
A,-0.806695,-1.27273,2.134054,0.413712,-0.193174,0.999739
B,-0.003349,1.806365,1.183115,1.506056,-0.56872,0.327264
C,-0.28946,-0.255137,0.065503,-0.767893,-0.370247,0.435214
D,2.833105,-0.203838,0.419051,1.952468,0.733121,12.717966
E,-1.182193,-0.864743,-0.254925,-0.081189,0.703674,0.22898
F,-0.210428,-0.234585,-0.461907,1.181142,0.622251,0.169599


In [31]:
# Delete a column
# columns='U' is the keyword form of ('U', axis=1); axis 0 -> Row, 1 -> Column
df.drop(columns='U')

Unnamed: 0,P,Q,R,S,T
A,-0.806695,-1.27273,2.134054,0.413712,-0.193174
B,-0.003349,1.806365,1.183115,1.506056,-0.56872
C,-0.28946,-0.255137,0.065503,-0.767893,-0.370247
D,2.833105,-0.203838,0.419051,1.952468,0.733121
E,-1.182193,-0.864743,-0.254925,-0.081189,0.703674
F,-0.210428,-0.234585,-0.461907,1.181142,0.622251


In [32]:
# Keep in mind that drop() does not permanently remove the column from the original DataFrame

In [33]:
df

Unnamed: 0,P,Q,R,S,T,U
A,-0.806695,-1.27273,2.134054,0.413712,-0.193174,0.999739
B,-0.003349,1.806365,1.183115,1.506056,-0.56872,0.327264
C,-0.28946,-0.255137,0.065503,-0.767893,-0.370247,0.435214
D,2.833105,-0.203838,0.419051,1.952468,0.733121,12.717966
E,-1.182193,-0.864743,-0.254925,-0.081189,0.703674,0.22898
F,-0.210428,-0.234585,-0.461907,1.181142,0.622251,0.169599


In [34]:
# Option 1 (preferred): drop returns a new frame, so rebind the name to it
df = df.drop('U', axis=1)

# Option 2: mutate the existing frame in place. This gives no performance
# benefit, returns None (so it cannot be chained) and makes hidden-state
# bugs more likely when cells are re-run.
# df.drop('U', axis=1, inplace=True)

# You can remove multiple columns by using a list of columns
# df = df.drop(['T', 'U'], axis=1)

In [35]:
df

Unnamed: 0,P,Q,R,S,T
A,-0.806695,-1.27273,2.134054,0.413712,-0.193174
B,-0.003349,1.806365,1.183115,1.506056,-0.56872
C,-0.28946,-0.255137,0.065503,-0.767893,-0.370247
D,2.833105,-0.203838,0.419051,1.952468,0.733121
E,-1.182193,-0.864743,-0.254925,-0.081189,0.703674
F,-0.210428,-0.234585,-0.461907,1.181142,0.622251


In [36]:
# Select row by name

In [37]:
df.loc['B']

P   -0.003349
Q    1.806365
R    1.183115
S    1.506056
T   -0.568720
Name: B, dtype: float64

In [38]:
df.loc[['B', 'C', 'E']]

Unnamed: 0,P,Q,R,S,T
B,-0.003349,1.806365,1.183115,1.506056,-0.56872
C,-0.28946,-0.255137,0.065503,-0.767893,-0.370247
E,-1.182193,-0.864743,-0.254925,-0.081189,0.703674


In [39]:
# Select row by index

In [40]:
df.iloc[0]

P   -0.806695
Q   -1.272730
R    2.134054
S    0.413712
T   -0.193174
Name: A, dtype: float64

In [41]:
df.iloc[1:-1]

Unnamed: 0,P,Q,R,S,T
B,-0.003349,1.806365,1.183115,1.506056,-0.56872
C,-0.28946,-0.255137,0.065503,-0.767893,-0.370247
D,2.833105,-0.203838,0.419051,1.952468,0.733121
E,-1.182193,-0.864743,-0.254925,-0.081189,0.703674


In [42]:
# iloc works on both rows and columns

In [43]:
df.iloc[1:-1, :3]

Unnamed: 0,P,Q,R
B,-0.003349,1.806365,1.183115
C,-0.28946,-0.255137,0.065503
D,2.833105,-0.203838,0.419051
E,-1.182193,-0.864743,-0.254925


In [44]:
df

Unnamed: 0,P,Q,R,S,T
A,-0.806695,-1.27273,2.134054,0.413712,-0.193174
B,-0.003349,1.806365,1.183115,1.506056,-0.56872
C,-0.28946,-0.255137,0.065503,-0.767893,-0.370247
D,2.833105,-0.203838,0.419051,1.952468,0.733121
E,-1.182193,-0.864743,-0.254925,-0.081189,0.703674
F,-0.210428,-0.234585,-0.461907,1.181142,0.622251


In [45]:
# Plain NumPy array with the same 6x5 shape as the frame above, drawn from a
# seeded generator so the values are identical on every run
ar = np.random.default_rng(seed=0).standard_normal((6, 5))

In [46]:
print(ar)

[[ 0.00779469 -0.61061606  0.26212     0.69922614  0.78510441]
 [ 0.98946499  0.24670915  0.90877634  0.20378303 -0.47214677]
 [ 0.78726712  0.67877846  1.27413757  1.7686707  -0.19901928]
 [-0.83529397  1.54764091  1.67054454  0.8677908  -1.70345944]
 [ 0.32125527 -0.65940598 -1.03247739 -2.86360715  0.12716765]
 [ 0.62113017  0.97412398  0.4061225   0.83343208  0.57360478]]


In [47]:
ar[1:-1, :3]

array([[ 0.98946499,  0.24670915,  0.90877634],
       [ 0.78726712,  0.67877846,  1.27413757],
       [-0.83529397,  1.54764091,  1.67054454],
       [ 0.32125527, -0.65940598, -1.03247739]])

In [48]:
df

Unnamed: 0,P,Q,R,S,T
A,-0.806695,-1.27273,2.134054,0.413712,-0.193174
B,-0.003349,1.806365,1.183115,1.506056,-0.56872
C,-0.28946,-0.255137,0.065503,-0.767893,-0.370247
D,2.833105,-0.203838,0.419051,1.952468,0.733121
E,-1.182193,-0.864743,-0.254925,-0.081189,0.703674
F,-0.210428,-0.234585,-0.461907,1.181142,0.622251


In [49]:
df.iloc[1:-1, :3]

Unnamed: 0,P,Q,R
B,-0.003349,1.806365,1.183115
C,-0.28946,-0.255137,0.065503
D,2.833105,-0.203838,0.419051
E,-1.182193,-0.864743,-0.254925


### Working With CSV

In [50]:
# Load the CSV file into a DataFrame (the path is relative to the working directory).
# NOTE: this rebinds df, replacing the random-number frame used in the cells above.
df = pd.read_csv('cancer.csv')

In [51]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 32 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       569 non-null    int64  
 1   diagnosis                569 non-null    object 
 2   radius_mean              569 non-null    float64
 3   texture_mean             569 non-null    float64
 4   perimeter_mean           569 non-null    float64
 5   area_mean                569 non-null    float64
 6   smoothness_mean          569 non-null    float64
 7   compactness_mean         569 non-null    float64
 8   concavity_mean           569 non-null    float64
 9   concave points_mean      569 non-null    float64
 10  symmetry_mean            569 non-null    float64
 11  fractal_dimension_mean   569 non-null    float64
 12  radius_se                569 non-null    float64
 13  texture_se               569 non-null    float64
 14  perimeter_se             5

In [52]:
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,569.0,30371830.0,125020600.0,8670.0,869218.0,906024.0,8813129.0,911320500.0
radius_mean,569.0,14.12729,3.524049,6.981,11.7,13.37,15.78,28.11
texture_mean,569.0,19.28965,4.301036,9.71,16.17,18.84,21.8,39.28
perimeter_mean,569.0,91.96903,24.29898,43.79,75.17,86.24,104.1,188.5
area_mean,569.0,654.8891,351.9141,143.5,420.3,551.1,782.7,2501.0
smoothness_mean,569.0,0.09636028,0.01406413,0.05263,0.08637,0.09587,0.1053,0.1634
compactness_mean,569.0,0.104341,0.05281276,0.01938,0.06492,0.09263,0.1304,0.3454
concavity_mean,569.0,0.08879932,0.07971981,0.0,0.02956,0.06154,0.1307,0.4268
concave points_mean,569.0,0.04891915,0.03880284,0.0,0.02031,0.0335,0.074,0.2012
symmetry_mean,569.0,0.1811619,0.02741428,0.106,0.1619,0.1792,0.1957,0.304


In [53]:
df.head() # Show me first 5 rows

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [54]:
df.head(3)

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758


In [55]:
df.tail()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
564,926424,M,21.56,22.39,142.0,1479.0,0.111,0.1159,0.2439,0.1389,...,25.45,26.4,166.1,2027.0,0.141,0.2113,0.4107,0.2216,0.206,0.07115
565,926682,M,20.13,28.25,131.2,1261.0,0.0978,0.1034,0.144,0.09791,...,23.69,38.25,155.0,1731.0,0.1166,0.1922,0.3215,0.1628,0.2572,0.06637
566,926954,M,16.6,28.08,108.3,858.1,0.08455,0.1023,0.09251,0.05302,...,18.98,34.12,126.7,1124.0,0.1139,0.3094,0.3403,0.1418,0.2218,0.0782
567,927241,M,20.6,29.33,140.1,1265.0,0.1178,0.277,0.3514,0.152,...,25.74,39.42,184.6,1821.0,0.165,0.8681,0.9387,0.265,0.4087,0.124
568,92751,B,7.76,24.54,47.92,181.0,0.05263,0.04362,0.0,0.0,...,9.456,30.37,59.16,268.6,0.08996,0.06444,0.0,0.0,0.2871,0.07039


In [56]:
df.tail(1)

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
568,92751,B,7.76,24.54,47.92,181.0,0.05263,0.04362,0.0,0.0,...,9.456,30.37,59.16,268.6,0.08996,0.06444,0.0,0.0,0.2871,0.07039


In [57]:
# See all the column names
df.columns

Index(['id', 'diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean',
       'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
       'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst', 'fractal_dimension_worst'],
      dtype='object')

In [58]:
# See index range
df.index

RangeIndex(start=0, stop=569, step=1)

In [59]:
# See a column
df['diagnosis']

0      M
1      M
2      M
3      M
4      M
      ..
564    M
565    M
566    M
567    M
568    B
Name: diagnosis, Length: 569, dtype: object

In [60]:
df[['radius_mean', 'perimeter_mean']].head()

Unnamed: 0,radius_mean,perimeter_mean
0,17.99,122.8
1,20.57,132.9
2,19.69,130.0
3,11.42,77.58
4,20.29,135.1


In [61]:
df.iloc[100:200]

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
100,862717,M,13.610,24.98,88.05,582.7,0.09488,0.08511,0.08625,0.04489,...,16.99,35.27,108.60,906.5,0.12650,0.19430,0.31690,0.11840,0.2651,0.07397
101,862722,B,6.981,13.43,43.79,143.5,0.11700,0.07568,0.00000,0.00000,...,7.93,19.54,50.41,185.2,0.15840,0.12020,0.00000,0.00000,0.2932,0.09382
102,862965,B,12.180,20.52,77.22,458.7,0.08013,0.04038,0.02383,0.01770,...,13.34,32.84,84.58,547.8,0.11230,0.08862,0.11450,0.07431,0.2694,0.06878
103,862980,B,9.876,19.40,63.95,298.3,0.10050,0.09697,0.06154,0.03029,...,10.76,26.83,72.22,361.2,0.15590,0.23020,0.26440,0.09749,0.2622,0.08490
104,862989,B,10.490,19.29,67.41,336.1,0.09989,0.08578,0.02995,0.01201,...,11.54,23.31,74.22,402.8,0.12190,0.14860,0.07987,0.03203,0.2826,0.07552
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,875878,B,12.910,16.33,82.53,516.4,0.07941,0.05366,0.03873,0.02377,...,13.88,22.00,90.81,600.6,0.10970,0.15060,0.17640,0.08235,0.3024,0.06949
196,875938,M,13.770,22.29,90.63,588.9,0.12000,0.12670,0.13850,0.06526,...,16.39,34.01,111.60,806.9,0.17370,0.31220,0.38090,0.16730,0.3080,0.09333
197,877159,M,18.080,21.84,117.40,1024.0,0.07371,0.08642,0.11030,0.05778,...,19.76,24.70,129.10,1228.0,0.08822,0.19630,0.25350,0.09181,0.2369,0.06558
198,877486,M,19.180,22.49,127.50,1148.0,0.08523,0.14280,0.11140,0.06772,...,23.36,32.06,166.40,1688.0,0.13220,0.56010,0.38650,0.17080,0.3193,0.09221


In [62]:
df[['radius_mean', 'texture_mean']].iloc[100:200]

Unnamed: 0,radius_mean,texture_mean
100,13.610,24.98
101,6.981,13.43
102,12.180,20.52
103,9.876,19.40
104,10.490,19.29
...,...,...
195,12.910,16.33
196,13.770,22.29
197,18.080,21.84
198,19.180,22.49


In [63]:
# Drop column (columns= is the keyword form of axis=1); the result is only
# displayed here, df itself is not reassigned
df.drop(columns='id')

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,M,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,...,25.380,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890
1,M,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,...,24.990,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902
2,M,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,...,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758
3,M,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,...,14.910,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300
4,M,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,...,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,M,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,...,25.450,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115
565,M,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,...,23.690,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637
566,M,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,...,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820
567,M,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,...,25.740,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400


In [64]:
# Set id as row index

In [65]:
# set_index returns a new frame; rebinding df avoids inplace=True, which gives
# no performance benefit and cannot be chained. After this cell 'id' is the
# row index and no longer a regular column.
df = df.set_index('id')

In [66]:
df.head()

Unnamed: 0_level_0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [67]:
df.loc[842302]

diagnosis                         M
radius_mean                   17.99
texture_mean                  10.38
perimeter_mean                122.8
area_mean                    1001.0
smoothness_mean              0.1184
compactness_mean             0.2776
concavity_mean               0.3001
concave points_mean          0.1471
symmetry_mean                0.2419
fractal_dimension_mean      0.07871
radius_se                     1.095
texture_se                   0.9053
perimeter_se                  8.589
area_se                       153.4
smoothness_se              0.006399
compactness_se              0.04904
concavity_se                0.05373
concave points_se           0.01587
symmetry_se                 0.03003
fractal_dimension_se       0.006193
radius_worst                  25.38
texture_worst                 17.33
perimeter_worst               184.6
area_worst                   2019.0
smoothness_worst             0.1622
compactness_worst            0.6656
concavity_worst             

In [68]:
# Alternative to set_index: choose the index column while reading the file
df = pd.read_csv('cancer.csv', index_col='id')
# index_col specifies which column to use as the row index
# read_csv has many other options
# For example, parse_dates=True asks pandas to try parsing the index as dates

In [69]:
df.head()

Unnamed: 0_level_0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


### Conditional Filtering

In [70]:
df['diagnosis'] == 'M' # Condition

id
842302       True
842517       True
84300903     True
84348301     True
84358402     True
            ...  
926424       True
926682       True
926954       True
927241       True
92751       False
Name: diagnosis, Length: 569, dtype: bool

In [71]:
df[df['diagnosis'] == 'M'] # Select all rows where 'diagnosis' is Malignant

Unnamed: 0_level_0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
842302,M,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,...,25.38,17.33,184.60,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.11890
842517,M,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,...,24.99,23.41,158.80,1956.0,0.1238,0.1866,0.2416,0.1860,0.2750,0.08902
84300903,M,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,...,23.57,25.53,152.50,1709.0,0.1444,0.4245,0.4504,0.2430,0.3613,0.08758
84348301,M,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,...,14.91,26.50,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.17300
84358402,M,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,...,22.54,16.67,152.20,1575.0,0.1374,0.2050,0.4000,0.1625,0.2364,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
926125,M,20.92,25.09,143.00,1347.0,0.10990,0.22360,0.31740,0.14740,0.2149,...,24.29,29.41,179.10,1819.0,0.1407,0.4186,0.6599,0.2542,0.2929,0.09873
926424,M,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,...,25.45,26.40,166.10,2027.0,0.1410,0.2113,0.4107,0.2216,0.2060,0.07115
926682,M,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,...,23.69,38.25,155.00,1731.0,0.1166,0.1922,0.3215,0.1628,0.2572,0.06637
926954,M,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,...,18.98,34.12,126.70,1124.0,0.1139,0.3094,0.3403,0.1418,0.2218,0.07820


In [72]:
# Filter rows and pick columns in a single .loc[rows, columns] call instead of
# chaining df[mask][cols]: one indexing step, and the same form also works
# when assigning values to the selection.
df.loc[df['diagnosis'] == 'M', ['radius_mean', 'texture_mean']]

Unnamed: 0_level_0,radius_mean,texture_mean
id,Unnamed: 1_level_1,Unnamed: 2_level_1
842302,17.99,10.38
842517,20.57,17.77
84300903,19.69,21.25
84348301,11.42,20.38
84358402,20.29,14.34
...,...,...
926125,20.92,25.09
926424,21.56,22.39
926682,20.13,28.25
926954,16.60,28.08


In [73]:
# Combine conditions element-wise with & (and) / | (or).
# Each condition needs its own parentheses because & binds more tightly
# than the comparison operators.
df[(df['radius_mean'] > 15) & (df['perimeter_mean'] > 125)]

Unnamed: 0_level_0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.1860,0.2750,0.08902
84300903,M,19.69,21.25,130.0,1203.0,0.10960,0.15990,0.1974,0.12790,0.2069,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.2430,0.3613,0.08758
84358402,M,20.29,14.34,135.1,1297.0,0.10030,0.13280,0.1980,0.10430,0.1809,...,22.54,16.67,152.2,1575.0,0.1374,0.2050,0.4000,0.1625,0.2364,0.07678
846226,M,19.17,24.80,132.4,1123.0,0.09740,0.24580,0.2065,0.11180,0.2397,...,20.96,29.94,151.7,1332.0,0.1037,0.3903,0.3639,0.1767,0.3176,0.10230
849014,M,19.81,22.15,130.0,1260.0,0.09831,0.10270,0.1479,0.09498,0.1582,...,27.32,30.88,186.8,2398.0,0.1512,0.3150,0.5372,0.2388,0.2768,0.07615
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
919555,M,20.55,20.86,137.8,1308.0,0.10460,0.17390,0.2085,0.13220,0.2127,...,24.30,25.48,160.2,1809.0,0.1268,0.3135,0.4433,0.2148,0.3077,0.07569
926125,M,20.92,25.09,143.0,1347.0,0.10990,0.22360,0.3174,0.14740,0.2149,...,24.29,29.41,179.1,1819.0,0.1407,0.4186,0.6599,0.2542,0.2929,0.09873
926424,M,21.56,22.39,142.0,1479.0,0.11100,0.11590,0.2439,0.13890,0.1726,...,25.45,26.40,166.1,2027.0,0.1410,0.2113,0.4107,0.2216,0.2060,0.07115
926682,M,20.13,28.25,131.2,1261.0,0.09780,0.10340,0.1440,0.09791,0.1752,...,23.69,38.25,155.0,1731.0,0.1166,0.1922,0.3215,0.1628,0.2572,0.06637


#### Sort

In [74]:
df.sort_values('radius_mean') # Sort the rows by radius_mean, smallest first (ascending is the default)

Unnamed: 0_level_0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
862722,B,6.981,13.43,43.79,143.5,0.11700,0.07568,0.00000,0.000000,0.1930,...,7.930,19.54,50.41,185.2,0.15840,0.12020,0.0000,0.00000,0.2932,0.09382
921362,B,7.691,25.44,48.34,170.4,0.08668,0.11990,0.09252,0.013640,0.2037,...,8.678,31.89,54.49,223.6,0.15960,0.30640,0.3393,0.05000,0.2790,0.10660
921092,B,7.729,25.49,47.98,178.8,0.08098,0.04878,0.00000,0.000000,0.1870,...,9.077,30.92,57.17,248.0,0.12560,0.08340,0.0000,0.00000,0.3058,0.09938
92751,B,7.760,24.54,47.92,181.0,0.05263,0.04362,0.00000,0.000000,0.1587,...,9.456,30.37,59.16,268.6,0.08996,0.06444,0.0000,0.00000,0.2871,0.07039
85713702,B,8.196,16.84,51.71,201.9,0.08600,0.05943,0.01588,0.005917,0.1769,...,8.964,21.96,57.26,242.2,0.12970,0.13570,0.0688,0.02564,0.3105,0.07409
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8611555,M,25.220,24.91,171.50,1878.0,0.10630,0.26650,0.33390,0.184500,0.1829,...,30.000,33.62,211.70,2562.0,0.15730,0.60760,0.6476,0.28670,0.2355,0.10510
899987,M,25.730,17.46,174.20,2010.0,0.11490,0.23630,0.33680,0.191300,0.1956,...,33.130,23.58,229.30,3234.0,0.15300,0.59370,0.6451,0.27560,0.3690,0.08815
873592,M,27.220,21.87,182.10,2250.0,0.10940,0.19140,0.28710,0.187800,0.1800,...,33.120,32.85,220.80,3216.0,0.14720,0.40340,0.5340,0.26880,0.2856,0.08082
911296202,M,27.420,26.27,186.90,2501.0,0.10840,0.19880,0.36350,0.168900,0.2061,...,36.040,31.37,251.20,4254.0,0.13570,0.42560,0.6833,0.26250,0.2641,0.07427


In [75]:
df.sort_values(['diagnosis', 'radius_mean'])
# Sort keys are applied left to right:
# first sort by diagnosis ('B' before 'M'),
# then rows with the same diagnosis are ordered by radius_mean

Unnamed: 0_level_0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
862722,B,6.981,13.43,43.79,143.5,0.11700,0.07568,0.00000,0.000000,0.1930,...,7.930,19.54,50.41,185.2,0.15840,0.12020,0.0000,0.00000,0.2932,0.09382
921362,B,7.691,25.44,48.34,170.4,0.08668,0.11990,0.09252,0.013640,0.2037,...,8.678,31.89,54.49,223.6,0.15960,0.30640,0.3393,0.05000,0.2790,0.10660
921092,B,7.729,25.49,47.98,178.8,0.08098,0.04878,0.00000,0.000000,0.1870,...,9.077,30.92,57.17,248.0,0.12560,0.08340,0.0000,0.00000,0.3058,0.09938
92751,B,7.760,24.54,47.92,181.0,0.05263,0.04362,0.00000,0.000000,0.1587,...,9.456,30.37,59.16,268.6,0.08996,0.06444,0.0000,0.00000,0.2871,0.07039
85713702,B,8.196,16.84,51.71,201.9,0.08600,0.05943,0.01588,0.005917,0.1769,...,8.964,21.96,57.26,242.2,0.12970,0.13570,0.0688,0.02564,0.3105,0.07409
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8611555,M,25.220,24.91,171.50,1878.0,0.10630,0.26650,0.33390,0.184500,0.1829,...,30.000,33.62,211.70,2562.0,0.15730,0.60760,0.6476,0.28670,0.2355,0.10510
899987,M,25.730,17.46,174.20,2010.0,0.11490,0.23630,0.33680,0.191300,0.1956,...,33.130,23.58,229.30,3234.0,0.15300,0.59370,0.6451,0.27560,0.3690,0.08815
873592,M,27.220,21.87,182.10,2250.0,0.10940,0.19140,0.28710,0.187800,0.1800,...,33.120,32.85,220.80,3216.0,0.14720,0.40340,0.5340,0.26880,0.2856,0.08082
911296202,M,27.420,26.27,186.90,2501.0,0.10840,0.19880,0.36350,0.168900,0.2061,...,36.040,31.37,251.20,4254.0,0.13570,0.42560,0.6833,0.26250,0.2641,0.07427


#### apply

In [76]:
# Derive a readable label from the diagnosis code: 'B' becomes 'Benign', anything else 'Malignant'
df['diagnosis_label'] = df['diagnosis'].apply(lambda code: 'Malignant' if code != 'B' else 'Benign')

In [78]:
df.head(n=5) # Preview the first rows, including the new diagnosis_label column

Unnamed: 0_level_0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,diagnosis_label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,Malignant
842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,Malignant
84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,Malignant
84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,Malignant
84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,Malignant


#### corr

In [79]:
# Pairwise correlation between the numeric columns.
# The string columns (diagnosis, diagnosis_label) are filtered out first: a bare
# df.corr() raises on pandas >= 2.0, which no longer drops non-numeric columns silently.
df.select_dtypes(include='number').corr()

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
radius_mean,1.0,0.323782,0.997855,0.987357,0.170581,0.506124,0.676764,0.822529,0.147741,-0.311631,...,0.969539,0.297008,0.965137,0.941082,0.119616,0.413463,0.526911,0.744214,0.163953,0.007066
texture_mean,0.323782,1.0,0.329533,0.321086,-0.023389,0.236702,0.302418,0.293464,0.071401,-0.076437,...,0.352573,0.912045,0.35804,0.343546,0.077503,0.27783,0.301025,0.295316,0.105008,0.119205
perimeter_mean,0.997855,0.329533,1.0,0.986507,0.207278,0.556936,0.716136,0.850977,0.183027,-0.261477,...,0.969476,0.303038,0.970387,0.94155,0.150549,0.455774,0.563879,0.771241,0.189115,0.051019
area_mean,0.987357,0.321086,0.986507,1.0,0.177028,0.498502,0.685983,0.823269,0.151293,-0.28311,...,0.962746,0.287489,0.95912,0.959213,0.123523,0.39041,0.512606,0.722017,0.14357,0.003738
smoothness_mean,0.170581,-0.023389,0.207278,0.177028,1.0,0.659123,0.521984,0.553695,0.557775,0.584792,...,0.21312,0.036072,0.238853,0.206718,0.805324,0.472468,0.434926,0.503053,0.394309,0.499316
compactness_mean,0.506124,0.236702,0.556936,0.498502,0.659123,1.0,0.883121,0.831135,0.602641,0.565369,...,0.535315,0.248133,0.59021,0.509604,0.565541,0.865809,0.816275,0.815573,0.510223,0.687382
concavity_mean,0.676764,0.302418,0.716136,0.685983,0.521984,0.883121,1.0,0.921391,0.500667,0.336783,...,0.688236,0.299879,0.729565,0.675987,0.448822,0.754968,0.884103,0.861323,0.409464,0.51493
concave points_mean,0.822529,0.293464,0.850977,0.823269,0.553695,0.831135,0.921391,1.0,0.462497,0.166917,...,0.830318,0.292752,0.855923,0.80963,0.452753,0.667454,0.752399,0.910155,0.375744,0.368661
symmetry_mean,0.147741,0.071401,0.183027,0.151293,0.557775,0.602641,0.500667,0.462497,1.0,0.479921,...,0.185728,0.090651,0.219169,0.177193,0.426675,0.4732,0.433721,0.430297,0.699826,0.438413
fractal_dimension_mean,-0.311631,-0.076437,-0.261477,-0.28311,0.584792,0.565369,0.336783,0.166917,0.479921,1.0,...,-0.253691,-0.051269,-0.205151,-0.231854,0.504942,0.458798,0.346234,0.175325,0.334019,0.767297


#### max

In [80]:
df.loc[:, 'radius_mean'].max() # Largest value found in the radius_mean column

28.11

In [81]:
# Write the code to view the rows where radius_mean is at its maximum

In [82]:
# Boolean mask: keep only the rows whose radius_mean equals the column maximum
df.loc[df['radius_mean'].eq(df['radius_mean'].max())]

Unnamed: 0_level_0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,diagnosis_label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8810703,M,28.11,18.47,188.5,2499.0,0.1142,0.1516,0.3201,0.1595,0.1648,...,18.47,188.5,2499.0,0.1142,0.1516,0.3201,0.1595,0.1648,0.05525,Malignant


#### value_counts

In [83]:
df.loc[:, 'diagnosis'].value_counts() # How many rows carry each distinct diagnosis value

B    357
M    212
Name: diagnosis, dtype: int64

#### unique

In [84]:
df.loc[:, 'diagnosis_label'].unique() # Distinct values, in order of first appearance

array(['Malignant', 'Benign'], dtype=object)

In [85]:
df['diagnosis_label'].unique().size # Count the distinct values by measuring the unique() array

2

In [86]:
df['diagnosis_label'].nunique(dropna=True) # Number of distinct values (missing values are not counted)

2

#### replace

In [87]:
# Mapping form of replace: each key is a value to look for,
# and its dict value is what it gets replaced with
df['diagnosis_label'] = df['diagnosis_label'].replace({'Malignant': 'm', 'Benign': 'b'})

In [88]:
df.tail(n=5) # Last rows, showing the shortened diagnosis_label values

Unnamed: 0_level_0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,diagnosis_label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
926424,M,21.56,22.39,142.0,1479.0,0.111,0.1159,0.2439,0.1389,0.1726,...,26.4,166.1,2027.0,0.141,0.2113,0.4107,0.2216,0.206,0.07115,m
926682,M,20.13,28.25,131.2,1261.0,0.0978,0.1034,0.144,0.09791,0.1752,...,38.25,155.0,1731.0,0.1166,0.1922,0.3215,0.1628,0.2572,0.06637,m
926954,M,16.6,28.08,108.3,858.1,0.08455,0.1023,0.09251,0.05302,0.159,...,34.12,126.7,1124.0,0.1139,0.3094,0.3403,0.1418,0.2218,0.0782,m
927241,M,20.6,29.33,140.1,1265.0,0.1178,0.277,0.3514,0.152,0.2397,...,39.42,184.6,1821.0,0.165,0.8681,0.9387,0.265,0.4087,0.124,m
92751,B,7.76,24.54,47.92,181.0,0.05263,0.04362,0.0,0.0,0.1587,...,30.37,59.16,268.6,0.08996,0.06444,0.0,0.0,0.2871,0.07039,b


### duplicates

In [89]:
df.head(n=5) # Quick look at the data before checking for duplicate rows

Unnamed: 0_level_0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,diagnosis_label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,m
842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,m
84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,m
84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,m
84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,m


In [90]:
# Boolean Series, one entry per row: True when the row repeats an earlier one
df.duplicated(keep='first')

id
842302      False
842517      False
84300903    False
84348301    False
84358402    False
            ...  
926424      False
926682      False
926954      False
927241      False
92751       False
Length: 569, dtype: bool

In [91]:
# Remove the duplicate rows, keeping the first occurrence of each.
# The result is rebound to df rather than using inplace=True, so the cell
# reads as a plain transformation and stays chainable.
df = df.drop_duplicates()

### GroupBy
Group the dataset by one or more columns.

In [92]:
# Summary statistics of radius_mean for each diagnosis.
# The column is selected before describe() so statistics are computed for
# radius_mean only, instead of for every column and then discarded.
df.groupby('diagnosis')['radius_mean'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
diagnosis,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
B,357.0,12.146524,1.780512,6.981,11.08,12.2,13.37,17.85
M,212.0,17.46283,3.203971,10.95,15.075,17.325,19.59,28.11


In [93]:
# Per-diagnosis mean of every numeric column.
# numeric_only=True skips the string column diagnosis_label, which would
# otherwise make mean() raise a TypeError on pandas >= 2.0.
df.groupby('diagnosis').mean(numeric_only=True)

Unnamed: 0_level_0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
diagnosis,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
B,12.146524,17.914762,78.075406,462.790196,0.092478,0.080085,0.046058,0.025717,0.174186,0.062867,...,13.379801,23.51507,87.005938,558.89944,0.124959,0.182673,0.166238,0.074444,0.270246,0.079442
M,17.46283,21.604906,115.365377,978.376415,0.102898,0.145188,0.160775,0.08799,0.192909,0.06268,...,21.134811,29.318208,141.37033,1422.286321,0.144845,0.374824,0.450606,0.182237,0.323468,0.09153


In [94]:
# Per-diagnosis minimum of each column (string columns are compared alphabetically)
df.groupby(by='diagnosis').min()

Unnamed: 0_level_0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,diagnosis_label
diagnosis,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
B,6.981,9.71,43.79,143.5,0.05263,0.01938,0.0,0.0,0.106,0.05185,...,12.02,50.41,185.2,0.07117,0.02729,0.0,0.0,0.1566,0.05521,b
M,10.95,10.38,71.9,361.6,0.07371,0.04605,0.02398,0.02031,0.1308,0.04996,...,16.67,85.1,508.1,0.08822,0.05131,0.02398,0.02899,0.1565,0.05504,m
