# Data Understanding

### Loading and Inspecting Dataset

In [17]:
import pandas as pd

# Direct "raw" URL of Blackmore.csv in the Rdatasets GitHub repository
raw_url = "https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/master/csv/carData/Blackmore.csv"

# Read the CSV over HTTP straight into a DataFrame
data = pd.read_csv(raw_url)

# End the cell on a bare expression so Jupyter renders the first five rows
# with its rich table display instead of print()'s plain-text dump
data.head()


   rownames subject    age  exercise    group
0         1     100   8.00      2.71  patient
1         2     100  10.00      1.94  patient
2         3     100  12.00      2.36  patient
3         4     100  14.00      1.54  patient
4         5     100  15.92      8.63  patient


In [18]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# GitHub directory page that lists the carData CSV files
base_url = "https://github.com/vincentarelbundock/Rdatasets/tree/master/csv/carData"

# Step 1: Fetch and parse the directory listing.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors (404, rate limiting, ...) instead of
# letting an error page be parsed and misreported as "file not found" below.
response = requests.get(base_url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

# Collect every anchor tag that carries an href attribute
file_links = soup.find_all('a', href=True)

# Step 2: Take the href of the first link that mentions 'Blackmore.csv'
# (None when no such link exists on the page)
csv_file_link = next(
    (link['href'] for link in file_links if "Blackmore.csv" in link['href']),
    None,
)

if csv_file_link:
    # Step 3: Swap the '/blob/' viewer path for '/raw/' and prefix the GitHub
    # host so the site-relative href becomes a downloadable URL
    raw_url = csv_file_link.replace('/blob/', '/raw/')
    raw_url = f"https://github.com{raw_url}"

    # Step 4: Load the CSV file into pandas and preview the first rows
    data = pd.read_csv(raw_url)
    print(data.head())
else:
    print("Blackmore.csv not found in the repository!")


   rownames subject    age  exercise    group
0         1     100   8.00      2.71  patient
1         2     100  10.00      1.94  patient
2         3     100  12.00      2.36  patient
3         4     100  14.00      1.54  patient
4         5     100  15.92      8.63  patient


In [19]:
# Concise summary of the dataset: index range, per-column non-null counts and dtypes, and memory usage
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 945 entries, 0 to 944
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   rownames  945 non-null    int64  
 1   subject   945 non-null    object 
 2   age       945 non-null    float64
 3   exercise  945 non-null    float64
 4   group     945 non-null    object 
dtypes: float64(2), int64(1), object(2)
memory usage: 37.0+ KB


In [20]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
data.describe()

Unnamed: 0,rownames,age,exercise
count,945.0,945.0,945.0
mean,473.0,11.441661,2.530646
std,272.942302,2.765609,3.495086
min,1.0,8.0,0.0
25%,237.0,10.0,0.4
50%,473.0,12.0,1.33
75%,709.0,14.0,3.04
max,945.0,17.92,29.96


In [21]:
# Dimensions of the dataset as (rows, columns); the run recorded here gives 945 rows and 5 columns
data.shape

(945, 5)

In [22]:
# Column labels of the dataset
data.columns

Index(['rownames', 'subject', 'age', 'exercise', 'group'], dtype='object')

In [23]:
# Data type of each column; in the run recorded here 'subject' and 'group' are object dtype, the rest numeric
data.dtypes

rownames      int64
subject      object
age         float64
exercise    float64
group        object
dtype: object

# Data Cleaning

Check for missing values.

In [25]:
# Count the missing entries in each column
data.isna().sum()

rownames    0
subject     0
age         0
exercise    0
group       0
dtype: int64

From our observation above, our dataset does not have any missing values.

In [27]:
# Count the rows that are exact duplicates of an earlier row (all columns compared by default)
data.duplicated().sum()

0