In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('Medicaldataset.csv')

In [3]:
print(df.head(5))

   Age  Gender  Heart rate  Systolic blood pressure  Diastolic blood pressure  \
0   64  Female        66.0                    160.0                      83.0   
1   21  Female        94.0                     98.0                      46.0   
2   55  Female        64.0                    160.0                       NaN   
3   64  Female        70.0                    120.0                      55.0   
4   55  Female        64.0                    112.0                      65.0   

   Blood sugar  CK-MB  Troponin    Result  
0        160.0   1.80     0.012  negative  
1        296.0   6.75     1.060  positive  
2        270.0   1.99     0.003  negative  
3        270.0  13.87     0.122  positive  
4        300.0   1.08     0.003  negative  


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999 entries, 0 to 998
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Age                       999 non-null    int64  
 1   Gender                    999 non-null    object 
 2   Heart rate                997 non-null    float64
 3   Systolic blood pressure   993 non-null    float64
 4   Diastolic blood pressure  993 non-null    float64
 5   Blood sugar               998 non-null    float64
 6   CK-MB                     999 non-null    float64
 7   Troponin                  999 non-null    float64
 8   Result                    999 non-null    object 
dtypes: float64(6), int64(1), object(2)
memory usage: 70.4+ KB


In [5]:
df.isnull().sum()

Age                         0
Gender                      0
Heart rate                  2
Systolic blood pressure     6
Diastolic blood pressure    6
Blood sugar                 1
CK-MB                       0
Troponin                    0
Result                      0
dtype: int64

In [6]:
numeric_cols = df.select_dtypes(include=['number'])
non_numeric_cols = df.select_dtypes(exclude=['number'])

numeric_cols.fillna(numeric_cols.mean(), inplace=True)

non_numeric_cols.fillna(non_numeric_cols.mode().iloc[0], inplace=True)

df = pd.concat([numeric_cols, non_numeric_cols], axis=1)
df.isnull().sum()

Age                         0
Heart rate                  0
Systolic blood pressure     0
Diastolic blood pressure    0
Blood sugar                 0
CK-MB                       0
Troponin                    0
Gender                      0
Result                      0
dtype: int64

In [7]:
rows = df.iloc[100:111]
rows

Unnamed: 0,Age,Heart rate,Systolic blood pressure,Diastolic blood pressure,Blood sugar,CK-MB,Troponin,Gender,Result
100,71,71.0,119.0,76.0,159.0,0.468,0.029,Female,positive
101,53,73.0,135.0,81.0,115.0,165.1,0.014,Female,positive
102,43,68.0,116.0,74.0,81.0,1.64,0.015,Male,positive
103,66,70.0,113.0,62.0,266.0,300.0,0.012,Male,positive
104,67,87.0,148.0,89.0,142.0,1.87,0.01,Female,negative
105,51,85.0,140.0,82.0,101.0,1.69,0.008,Male,negative
106,50,83.0,140.0,81.0,244.0,3.27,2.23,Female,positive
107,67,82.0,164.0,90.0,130.0,3.75,0.009,Female,negative
108,59,81.0,150.0,51.0,117.0,1.51,1.55,Female,positive
109,20,60.0,156.0,60.0,103.0,5.22,1.84,Female,positive


In [8]:
df['Gender'] = pd.to_numeric(df['Gender'].replace({'Male': 0, 'Female': 1}), downcast='integer')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999 entries, 0 to 998
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Age                       999 non-null    int64  
 1   Heart rate                999 non-null    float64
 2   Systolic blood pressure   999 non-null    float64
 3   Diastolic blood pressure  999 non-null    float64
 4   Blood sugar               999 non-null    float64
 5   CK-MB                     999 non-null    float64
 6   Troponin                  999 non-null    float64
 7   Gender                    999 non-null    int8   
 8   Result                    999 non-null    object 
dtypes: float64(6), int64(1), int8(1), object(1)
memory usage: 63.5+ KB


  df['Gender'] = pd.to_numeric(df['Gender'].replace({'Male': 0, 'Female': 1}), downcast='integer')


In [9]:
newDF = df.copy()

In [10]:
newDF.describe()

Unnamed: 0,Age,Heart rate,Systolic blood pressure,Diastolic blood pressure,Blood sugar,CK-MB,Troponin,Gender
count,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0
mean,56.333333,77.76329,127.313192,72.145015,147.123948,15.989179,0.345756,0.667668
std,13.229817,48.625292,25.800134,13.865411,73.416461,47.760004,1.026621,0.471285
min,14.0,20.0,42.0,38.0,35.0,0.321,0.001,0.0
25%,48.0,64.0,110.0,62.0,98.0,1.655,0.006,0.0
50%,58.0,74.0,125.0,72.0,117.0,2.91,0.014,1.0
75%,65.0,84.5,143.0,81.0,174.0,5.815,0.0995,1.0
max,100.0,1111.0,223.0,128.0,541.0,300.0,10.3,1.0


In [11]:
filter_patients = newDF[(newDF['Age'] > 50) & (newDF['Result'] == 'Negative')]
filter_patients

Unnamed: 0,Age,Heart rate,Systolic blood pressure,Diastolic blood pressure,Blood sugar,CK-MB,Troponin,Gender,Result


In [12]:
numeric_df = newDF.select_dtypes(include=['number'])
correlation_matrix = numeric_df.corr()
correlation_matrix

Unnamed: 0,Age,Heart rate,Systolic blood pressure,Diastolic blood pressure,Blood sugar,CK-MB,Troponin,Gender
Age,1.0,0.010721,0.025219,0.015241,0.001028,0.002831,0.055168,-0.055979
Heart rate,0.010721,1.0,0.008797,0.071444,-0.008874,-0.015889,0.028176,-0.01769
Systolic blood pressure,0.025219,0.008797,1.0,0.605781,0.028258,-0.0105,0.041078,0.001955
Diastolic blood pressure,0.015241,0.071444,0.605781,1.0,-0.011773,-0.025068,0.036813,-0.031566
Blood sugar,0.001028,-0.008874,0.028258,-0.011773,1.0,0.03867,0.027697,0.011713
CK-MB,0.002831,-0.015889,-0.0105,-0.025068,0.03867,1.0,-0.006962,0.019584
Troponin,0.055168,0.028176,0.041078,0.036813,0.027697,-0.006962,1.0,0.103128
Gender,-0.055979,-0.01769,0.001955,-0.031566,0.011713,0.019584,0.103128,1.0


In [13]:
newDF['Heart rate'].value_counts()

Heart rate
60.0     71
61.0     42
70.0     38
64.0     37
80.0     37
         ..
104.0     1
49.0      1
46.0      1
135.0     1
20.0      1
Name: count, Length: 78, dtype: int64

In [14]:
newDF['Age'].value_counts()

Age
60     86
70     56
50     55
63     49
55     48
       ..
91      1
88      1
100     1
23      1
14      1
Name: count, Length: 72, dtype: int64

In [15]:
newDF.to_excel('new_medical_dataset.xlsx', index=False)