In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


/kaggle/input/heartbeat-sounds/set_a.csv
/kaggle/input/heartbeat-sounds/set_a_timing.csv
/kaggle/input/heartbeat-sounds/set_b.csv
/kaggle/input/heartbeat-sounds/set_a/normal__201103140132.wav
/kaggle/input/heartbeat-sounds/set_a/murmur__201108222252.wav
/kaggle/input/heartbeat-sounds/set_a/Aunlabelledtest__201103200518.wav
/kaggle/input/heartbeat-sounds/set_a/artifact__201106221254.wav
/kaggle/input/heartbeat-sounds/set_a/Aunlabelledtest__201108011117.wav
/kaggle/input/heartbeat-sounds/set_a/normal__201103170121.wav
/kaggle/input/heartbeat-sounds/set_a/normal__201103101140.wav
/kaggle/input/heartbeat-sounds/set_a/extrahls__201101152255.wav
/kaggle/input/heartbeat-sounds/set_a/normal__201103151912.wav
/kaggle/input/heartbeat-sounds/set_a/artifact__201012172012.wav
/kaggle/input/heartbeat-sounds/set_a/Aunlabelledtest__201106212102.wav
/kaggle/input/heartbeat-sounds/set_a/normal__201106221418.wav
/kaggle/input/heartbeat-sounds/set_a/artifact__201106031558.wav
/kaggle/input/heartbeat-sound

## What makes a time series?
1. An array of numbers that represents the data itself.
2. An array that contains a timestamp for each datapoint.

In [None]:
import pandas as pd 
import maptlotlib.pyplot as plt
data = pd.read_csv('data.csv')
data.head()

In [None]:
fig,ax = plt.subplot(figsize= (12,6))
data.plot('date','close',ax=ax)
ax.set(title='AAPL daily closing price')


### Why machine learning?
- We can use really big data and really complicated data
- Predict the future
- Automate this process

### Machine learning pipeline
- Feature extraction
- Model fitting
- Prediction and validation

In [None]:
# Plot the time series in each dataset
fig, axs = plt.subplots(2, 1, figsize=(5, 10))
data.iloc[:1000].plot(x='time', y='data_values', ax=axs[0])
data2.iloc[:1000].plot(x='time', y='data_values', ax=axs[1])
plt.show()

### Always visualize your data
Histograms and scatterplots are a good place to start
```python
# Using matplotlib
fig,ax = plt.subplots()
ax.plot(...)

# Using pandas
fig,ax = plt.subplots()
df.plot(..., ax=ax)
```

Look at the distribution of your data :
- Are there any outliers ?
- Are you missing data ?
- It is reasonable?

In [None]:
from sklearn.svm import LinearSVC

### Preparing data for scikit-learn
- scikit-learn expects a particular structure of data: (samples, features)
- Make sure that your data is at least two-dimensional
- Make sure that the first dimension is samples 

### If your data is not shaped properly
- If the axes are swapped:
```python
array.T.shape```
- If we're missing an axis, use .reshape():
```python
array.reshape([-1,1]).shape
```
    - -1 will automatically fill that axis with remaining values

There is one coefficient per input feature
```python
model.coef_```

### The heartbeat Acoustic Data
- Many recordings of heart sounds from different patients
- Some had normally-functioning hearts, others had abnormalities
- Data comes in the form of audio files+labels for each file
- Can we find the 'abnormal' heart beats?

In [8]:
from glob import glob
files_a = glob('/kaggle/input/heartbeat-sounds/set_a/*.wav')
files_b = glob('/kaggle/input/heartbeat-sounds/set_b/*.wav')

print(files_a)
print(files_b)


['/kaggle/input/heartbeat-sounds/set_a/normal__201103140132.wav', '/kaggle/input/heartbeat-sounds/set_a/murmur__201108222252.wav', '/kaggle/input/heartbeat-sounds/set_a/Aunlabelledtest__201103200518.wav', '/kaggle/input/heartbeat-sounds/set_a/artifact__201106221254.wav', '/kaggle/input/heartbeat-sounds/set_a/Aunlabelledtest__201108011117.wav', '/kaggle/input/heartbeat-sounds/set_a/normal__201103170121.wav', '/kaggle/input/heartbeat-sounds/set_a/normal__201103101140.wav', '/kaggle/input/heartbeat-sounds/set_a/extrahls__201101152255.wav', '/kaggle/input/heartbeat-sounds/set_a/normal__201103151912.wav', '/kaggle/input/heartbeat-sounds/set_a/artifact__201012172012.wav', '/kaggle/input/heartbeat-sounds/set_a/Aunlabelledtest__201106212102.wav', '/kaggle/input/heartbeat-sounds/set_a/normal__201106221418.wav', '/kaggle/input/heartbeat-sounds/set_a/artifact__201106031558.wav', '/kaggle/input/heartbeat-sounds/set_a/Aunlabelledtest__201101110659.wav', '/kaggle/input/heartbeat-sounds/set_a/artifac

### Reading in auditory data

In [None]:
! apt-get install -y libsndfile-dev

In [12]:
import librosa as lr
# 'load' accepts a path to an audio file
audio, sfreq = lr.load('/kaggle/input/heartbeat-sounds/set_a/normal__201103140132.wav')
print(sfreq)

22050


Meaning the sample frequency is 22050, so there are 22050 samples per second

and the data is stored in audio.

### Inferring times from samples
- Using only the sampling frequency, we can infer the timepoint of each data point in our audio file.
- Note: this assumes the sampling rate is fixed and no data points are lost

### Creating a time array

In [23]:
# Create an array of indices, one for each sample, and divide by the sampling frequency
indices = np.arange(0, len(audio))
time = indices/ sfreq

In [24]:
# Find the time stamp for the N-1th data point. Then use linspace() to interpolate from zero to that time
final_time = (len(audio) -1 )/ sfreq
time = np.linspace(0, final_time, sfreq)

In [25]:
final_time

array([0.00000000e+00, 4.08179720e-04, 8.16359440e-04, ...,
       8.99913829e+00, 8.99954647e+00, 8.99995465e+00])