In [1]:
# load the libraries
import os
import pandas as pd
import numpy as np 
# load the datasets
# we are going to load using the data frames
df1 = pd.read_csv('data.csv')
# df2 = pd.read_csv('Sales.csv')

In [3]:
# display the first few rows of the data set
df1.head(5)

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
0,60,110,130,409.1
1,60,117,145,479.0
2,60,103,135,340.0
3,45,109,175,282.4
4,45,117,148,406.0


In [4]:
# Data preprocessing
# Data cleaning
# 1 Empty values cell that has no data
# 2 Wrong formats    30/09/2098  30092098
# 3 Wrong data       Age 23     -23
# 4 Duplicates       Record repeating its self


In [5]:
# Empty values
# check for empty values

print(df1.isnull().sum()) 

Duration    0
Pulse       0
Maxpulse    0
Calories    5
dtype: int64


In [6]:
# checking for wrong formats
print(df1.dtypes)

Duration      int64
Pulse         int64
Maxpulse      int64
Calories    float64
dtype: object


In [8]:
# check for non numeric values in the columns
for col in ['Duration', 'Pulse', 'Maxpulse', 'Calories']:
    if not pd.to_numeric(df1[col], errors='coerce').notnull().all():
        print(f"Column '{col}' contains non-numeric values.")

Column 'Calories' contains non-numeric values.


In [13]:
# wrong data
for col in ['Duration', 'Pulse', 'Maxpulse', 'Calories']:
    if (df1[col]< 0).any():
        print(f"Column '{col}' contains negative values.")

In [None]:
# Duplicates
duplicates = df1.duplicated()
if duplicates.any():
    print("The dataframe contains duplicates.")
    print(df1[duplicates])
else:
    print("The dataframe does not contain duplicates.")
    
# we only remove duplicates in rows

The dataframe contains duplicates.
     Duration  Pulse  Maxpulse  Calories
36         60    102       127     300.0
37         60    100       120     300.0
38         60    100       120     300.0
40         45     90       112     180.1
71         60    109       153     387.6
113        45    100       120     225.3
155        60    111       151     368.5


In [None]:
# recommendation
# 1. Handle empty values (fill)
# 2. Convert wrong formats
# 3. Correct wrong data
# 4. Remove duplicates in rows not columns

In [16]:
# Fill empty values with mean for numeric columns
# loop through the columns
for col in ['Duration', 'Pulse', 'Maxpulse', 'Calories']:
    # fill in with the mean
    df1[col].fillna(df1[col].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df1[col].fillna(df1[col].mean(), inplace=True)


In [18]:
df1.head(30)

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
0,60,110,130,409.1
1,60,117,145,479.0
2,60,103,135,340.0
3,45,109,175,282.4
4,45,117,148,406.0
5,60,102,127,300.0
6,60,110,136,374.0
7,45,104,134,253.3
8,30,109,133,195.1
9,60,98,124,269.0


In [19]:
print(df1.isnull().sum())

Duration    0
Pulse       0
Maxpulse    0
Calories    0
dtype: int64


In [20]:
# EDA Exploratory Data Analysis
# checks the shape of the data set
print("The shape of the data set is :", df1.shape)

The shape of the data set is : (169, 4)


In [None]:
# The summary of the statistics of the data set
print("the summary is")
print(df1.describe())

the summary is
         Duration       Pulse    Maxpulse     Calories
count  169.000000  169.000000  169.000000   169.000000
mean    63.846154  107.461538  134.047337   375.790244
std     42.299949   14.510259   16.450434   262.385991
min     15.000000   80.000000  100.000000    50.300000
25%     45.000000  100.000000  124.000000   253.300000
50%     60.000000  105.000000  131.000000   321.000000
75%     60.000000  111.000000  141.000000   384.000000
max    300.000000  159.000000  184.000000  1860.400000


: 