**Leitura do arquivo CSV**

In [1]:
import pandas as pd
import numpy as np

df= pd.read_csv(
    'dados.csv',
    sep= ";",
    engine="python",
)

n= np.random.randint(5,15)

print("Arquivo lido com sucesso.")

display(df.head(n))

Arquivo lido com sucesso.


Unnamed: 0,ID,Duration,Date,Pulse,Maxpulse,Calories
0,0,60,'2020/12/01',110,130,4091.0
1,1,60,'2020/12/02',117,145,4790.0
2,2,60,'2020/12/03',103,135,3400.0
3,3,45,'2020/12/04',109,175,2824.0
4,4,45,'2020/12/05',117,148,4060.0
5,5,60,'2020/12/06',102,127,3000.0
6,6,60,'2020/12/07',110,136,3740.0


**Verificação inicial dos dados**

In [2]:
print("Informações gerais do dataset original: ")
display(df.info())

print(f"\nPrimeiras {n} linhas:")
display(df.head(n))

print(f"\nÚltimas {n} linhas:")
display(df.tail(n))


Informações gerais do dataset original: 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32 entries, 0 to 31
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   ID        32 non-null     int64  
 1   Duration  32 non-null     int64  
 2   Date      31 non-null     object 
 3   Pulse     32 non-null     int64  
 4   Maxpulse  32 non-null     int64  
 5   Calories  30 non-null     float64
dtypes: float64(1), int64(4), object(1)
memory usage: 1.6+ KB


None


Primeiras 7 linhas:


Unnamed: 0,ID,Duration,Date,Pulse,Maxpulse,Calories
0,0,60,'2020/12/01',110,130,4091.0
1,1,60,'2020/12/02',117,145,4790.0
2,2,60,'2020/12/03',103,135,3400.0
3,3,45,'2020/12/04',109,175,2824.0
4,4,45,'2020/12/05',117,148,4060.0
5,5,60,'2020/12/06',102,127,3000.0
6,6,60,'2020/12/07',110,136,3740.0



Últimas 7 linhas:


Unnamed: 0,ID,Duration,Date,Pulse,Maxpulse,Calories
25,25,60,'2020/12/25',102,126,3345.0
26,26,60,20201226,100,120,2500.0
27,27,60,'2020/12/27',92,118,2410.0
28,28,60,'2020/12/28',103,132,
29,29,60,'2020/12/29',100,132,2800.0
30,30,60,'2020/12/30',102,129,3803.0
31,31,60,'2020/12/31',92,115,2430.0


**Tratamento de valores nulos**

Coluna: *Calories*

In [3]:
df_clean= df.copy()
df_clean["Calories"] = df_clean["Calories"].fillna(0)
print("Valores nulos em 'Calories' substituídos por 0: ")
display(df_clean[df_clean['Calories']==0].head())

Valores nulos em 'Calories' substituídos por 0: 


Unnamed: 0,ID,Duration,Date,Pulse,Maxpulse,Calories
18,18,45,'2020/12/18',90,112,0.0
28,28,60,'2020/12/28',103,132,0.0


Coluna: *Date*

In [4]:
df_clean["Date"]= df_clean["Date"].fillna("1900/01/01")

df_clean['Date'] = (
    df_clean['Date']
    .str.replace("'", "")  # Remove aspas
    .replace('20201226', '2020/12/26')  # Corrige formato específico
    .replace('1900/01/01', np.nan)  # Substitui placeholder por NaN
)
df_clean['Date']

0     2020/12/01
1     2020/12/02
2     2020/12/03
3     2020/12/04
4     2020/12/05
5     2020/12/06
6     2020/12/07
7     2020/12/08
8     2020/12/09
9     2020/12/10
10    2020/12/11
11    2020/12/12
12    2020/12/12
13    2020/12/13
14    2020/12/14
15    2020/12/15
16    2020/12/16
17    2020/12/17
18    2020/12/18
19    2020/12/19
20    2020/12/20
21    2020/12/21
22           NaN
23    2020/12/23
24    2020/12/24
25    2020/12/25
26    2020/12/26
27    2020/12/27
28    2020/12/28
29    2020/12/29
30    2020/12/30
31    2020/12/31
Name: Date, dtype: object

**Correção de Formato de Datas**

In [9]:
df_clean['Date'] = pd.to_datetime(
    df_clean['Date'], 
    errors='coerce', 
    format='%Y/%m/%d'
)
display(df_clean['Date'])

0    2020-12-01
1    2020-12-02
2    2020-12-03
3    2020-12-04
4    2020-12-05
5    2020-12-06
6    2020-12-07
7    2020-12-08
8    2020-12-09
9    2020-12-10
10   2020-12-11
11   2020-12-12
12   2020-12-12
13   2020-12-13
14   2020-12-14
15   2020-12-15
16   2020-12-16
17   2020-12-17
18   2020-12-18
19   2020-12-19
20   2020-12-20
21   2020-12-21
22          NaT
23   2020-12-23
24   2020-12-24
25   2020-12-25
26   2020-12-26
27   2020-12-27
28   2020-12-28
29   2020-12-29
30   2020-12-30
31   2020-12-31
Name: Date, dtype: datetime64[ns]

**Remoção de Registros Inválidos**


In [6]:
invalid_dates = df_clean[df_clean['Date'].isna()]
print("\n=== Registros inválidos removidos ===")
display(invalid_dates)
df_final = df_clean.dropna(subset=['Date'])
display(df_final)


=== Registros inválidos removidos ===


Unnamed: 0,ID,Duration,Date,Pulse,Maxpulse,Calories
22,22,45,NaT,100,119,2820.0


Unnamed: 0,ID,Duration,Date,Pulse,Maxpulse,Calories
0,0,60,2020-12-01,110,130,4091.0
1,1,60,2020-12-02,117,145,4790.0
2,2,60,2020-12-03,103,135,3400.0
3,3,45,2020-12-04,109,175,2824.0
4,4,45,2020-12-05,117,148,4060.0
5,5,60,2020-12-06,102,127,3000.0
6,6,60,2020-12-07,110,136,3740.0
7,7,450,2020-12-08,104,134,2533.0
8,8,30,2020-12-09,109,133,1951.0
9,9,60,2020-12-10,98,124,2690.0


In [12]:
print("\n=== Dataset final ===")
print(f"Total de registros válidos: {len(df_final)}")
print("\nAmostra final:")
display(df_final)


=== Dataset final ===
Total de registros válidos: 31
Total de registros válidos: 32

Amostra final:


Unnamed: 0,ID,Duration,Date,Pulse,Maxpulse,Calories
0,0,60,2020-12-01,110,130,4091.0
1,1,60,2020-12-02,117,145,4790.0
2,2,60,2020-12-03,103,135,3400.0
3,3,45,2020-12-04,109,175,2824.0
4,4,45,2020-12-05,117,148,4060.0
5,5,60,2020-12-06,102,127,3000.0
6,6,60,2020-12-07,110,136,3740.0
7,7,450,2020-12-08,104,134,2533.0
8,8,30,2020-12-09,109,133,1951.0
9,9,60,2020-12-10,98,124,2690.0
