In [62]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer # used for handling missing data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder # used for encoding categorical data
from sklearn.model_selection import train_test_split # used for splitting training and testing data
from sklearn.preprocessing import StandardScaler # used for feature scaling

In [63]:
df = pd.read_csv("/content/drive/MyDrive/melanoma.csv")

In [64]:
df.head()

Unnamed: 0.1,Unnamed: 0,time,status,sex,age,year,thickness,ulcer
0,1,10.0,3,1.0,76.0,1972,6.76,1
1,2,30.0,3,1.0,56.0,1968,0.65,0
2,3,35.0,2,1.0,41.0,1977,1.34,0
3,4,99.0,3,0.0,71.0,1968,2.9,0
4,5,185.0,1,1.0,52.0,1965,12.08,1


Check for missing values

In [65]:
df.columns[df.isna().any()].tolist()

['time', 'sex', 'age']

In [66]:
 dropped_df = df[df['time'].notna()]

Dropping rows where time has an nan value

In [67]:
dropped_df.head()

Unnamed: 0.1,Unnamed: 0,time,status,sex,age,year,thickness,ulcer
0,1,10.0,3,1.0,76.0,1972,6.76,1
1,2,30.0,3,1.0,56.0,1968,0.65,0
2,3,35.0,2,1.0,41.0,1977,1.34,0
3,4,99.0,3,0.0,71.0,1968,2.9,0
4,5,185.0,1,1.0,52.0,1965,12.08,1


In [90]:
print('Average values of each column:\n')
print('Time: ',df["time"].mean())
print('\nAge: ',df["age"].mean())
print("\nThickness: ",df["thickness"].mean())

Average values of each column:

Time:  2142.855

Age:  52.537313432835816

Thickness:  2.9198536585365873


Filling missing values with mean and median

In [91]:
df.fillna(df.mean(), inplace=True)

print(df.loc[74])


Unnamed: 0         75
time          2142.86
status              1
sex              Male
age                58
year             1970
thickness        0.97
ulcer               1
avg           300.853
Name: 74, dtype: object


Renaming the sex attribute's values

In [71]:
df['sex'] = df['sex'].map({1: 'Female', 0: 'Male'})

In [72]:
df.head()

Unnamed: 0.1,Unnamed: 0,time,status,sex,age,year,thickness,ulcer,avg
0,1,10.0,3,Female,76.0,1972,6.76,1,258.845
1,2,30.0,3,Female,56.0,1968,0.65,0,257.58125
2,3,35.0,2,Female,41.0,1977,1.34,0,257.5425
3,4,99.0,3,Male,71.0,1968,2.9,0,268.4875
4,5,185.0,1,Female,52.0,1965,12.08,1,277.76


Correlation

In [92]:
df.corr(method ='pearson') 


Unnamed: 0.1,Unnamed: 0,time,status,age,year,thickness,ulcer,avg
Unnamed: 0,1.0,0.96784,0.340901,-0.289196,-0.441657,-0.245014,-0.252307,0.923235
time,0.96784,1.0,0.311175,-0.314214,-0.482761,-0.233915,-0.273785,0.970411
status,0.340901,0.311175,1.0,0.00405,0.138167,-0.204722,-0.270326,0.298969
age,-0.289196,-0.314214,0.00405,1.0,0.179961,0.218035,0.141499,-0.299205
year,-0.441657,-0.482761,0.138167,0.179961,1.0,-0.133345,-0.033126,-0.472641
thickness,-0.245014,-0.233915,-0.204722,0.218035,-0.133345,1.0,0.424459,-0.196908
ulcer,-0.252307,-0.273785,-0.270326,0.141499,-0.033126,0.424459,1.0,-0.28616
avg,0.923235,0.970411,0.298969,-0.299205,-0.472641,-0.196908,-0.28616,1.0
