In [8]:
# Importing all necessary libraries for data loading and inspection

import pandas as pd            
import numpy as np            
import os                      # OS helps us work with file paths and check if files exist
import warnings                # Used to hide unnecessary warnings for a clean notebook

# Ignore warnings to avoid cluttering the notebook output
warnings.filterwarnings("ignore")

print("Libraries imported successfully.")


Libraries imported successfully.


In [9]:

# Location of the sample dataset, relative to the notebook's working directory.
data_path = "../data/sample.csv"

# Report up front whether anything exists at that path, so a wrong location
# is noticed before any loading is attempted.
if os.path.exists(data_path):
    print(f"Dataset found at: {data_path}")
else:
    print("⚠️ Dataset NOT found! Please check the file path.")


Dataset found at: ../data/sample.csv


In [10]:
# Function to safely load a CSV file into a pandas DataFrame

def load_csv(file_path):
    """
    Load a CSV file into a pandas DataFrame, reporting problems instead of raising.

    Parameters
    ----------
    file_path : str, bytes or os.PathLike
        Location of the file. The name must end in ".csv" (any letter case).

    Returns
    -------
    pandas.DataFrame or None
        The parsed data, or None when the argument is not a path, the
        extension is not ".csv", nothing exists at the path, or pandas fails
        to read the file. A message describing the outcome is printed in
        every case.
    """
    # Normalise to a plain string so pathlib.Path (and other path-like)
    # arguments work; os.fsdecode raises TypeError for anything else.
    try:
        path_str = os.fsdecode(file_path)
    except TypeError:
        print(" Error: The file path must be a string or path-like object.")
        return None

    # Extension check is case-insensitive so "DATA.CSV" is accepted too.
    if not path_str.lower().endswith(".csv"):
        print(" Error: The selected file is not a CSV file.")
        return None

    # Check if the file exists
    if not os.path.exists(path_str):
        print(" Error: File not found at the given path.")
        return None

    try:
        df = pd.read_csv(path_str)
    except Exception as e:
        # Deliberately broad: any read/parse failure is reported to the user
        # and signalled with None rather than stopping the notebook.
        print(" Error while reading the CSV file:")
        print(e)
        return None

    print(f"✔ CSV loaded successfully! Shape: {df.shape}")
    return df

# Testing the function on our dataset
data = load_csv(data_path)
# load_csv returns None on failure, so only preview the first rows when a
# DataFrame actually came back (avoids an AttributeError on None).
data.head() if data is not None else None


✔ CSV loaded successfully! Shape: (150, 6)


Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [11]:

# Quick inspection of the loaded frame; skipped entirely when loading failed.
if data is not None:

    print(" Preview of the first 5 rows:")
    display(data.head())

    print("\n Preview of the last 5 rows:")
    display(data.tail())

    print("\n Dataset Info:")
    # DataFrame.info() writes its report (column types, non-null counts) to
    # stdout itself and returns None, so wrapping it in print() would append
    # a stray "None" line to the output.
    data.info()

    print("\n Missing Values in Each Column:")
    print(data.isnull().sum())  # Count missing values in each column

else:
    print(" Cannot inspect data because the CSV failed to load.")


 Preview of the first 5 rows:


Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa



 Preview of the last 5 rows:


Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
145,146,6.7,3.0,5.2,2.3,Iris-virginica
146,147,6.3,2.5,5.0,1.9,Iris-virginica
147,148,6.5,3.0,5.2,2.0,Iris-virginica
148,149,6.2,3.4,5.4,2.3,Iris-virginica
149,150,5.9,3.0,5.1,1.8,Iris-virginica



 Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             150 non-null    int64  
 1   SepalLengthCm  150 non-null    float64
 2   SepalWidthCm   150 non-null    float64
 3   PetalLengthCm  150 non-null    float64
 4   PetalWidthCm   150 non-null    float64
 5   Species        150 non-null    object 
dtypes: float64(4), int64(1), object(1)
memory usage: 7.2+ KB
None

 Missing Values in Each Column:
Id               0
SepalLengthCm    0
SepalWidthCm     0
PetalLengthCm    0
PetalWidthCm     0
Species          0
dtype: int64


In [12]:


# Summary statistics for the loaded frame; skipped when loading failed.
if data is not None:

    # Overall size of the frame as (rows, columns).
    print("📌 Dataset Shape (rows, columns):", data.shape)

    print("\n Column Names:")
    print(list(data.columns))

    print("\n Descriptive Statistics:")
    display(data.describe())  # Shows mean, std, min, max for numeric columns

    print("\n Unique Values per Column:")
    for col in data.columns:
        print(f"{col}: {data[col].nunique()} unique values")

    print("\n Sample Distribution for Each Column:")
    for col in data.columns:
        print(f"\n--- {col} ---")
        print(data[col].value_counts().head(5))  # Show top 5 most frequent values

else:
    print(" Cannot compute summary statistics because the data was not loaded.")


📌 Dataset Shape (rows, columns): (150, 6)

 Column Names:
['Id', 'SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm', 'Species']

 Descriptive Statistics:


Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
count,150.0,150.0,150.0,150.0,150.0
mean,75.5,5.843333,3.054,3.758667,1.198667
std,43.445368,0.828066,0.433594,1.76442,0.763161
min,1.0,4.3,2.0,1.0,0.1
25%,38.25,5.1,2.8,1.6,0.3
50%,75.5,5.8,3.0,4.35,1.3
75%,112.75,6.4,3.3,5.1,1.8
max,150.0,7.9,4.4,6.9,2.5



 Unique Values per Column:
Id: 150 unique values
SepalLengthCm: 35 unique values
SepalWidthCm: 23 unique values
PetalLengthCm: 43 unique values
PetalWidthCm: 22 unique values
Species: 3 unique values

 Sample Distribution for Each Column:

--- Id ---
Id
1    1
2    1
3    1
4    1
5    1
Name: count, dtype: int64

--- SepalLengthCm ---
SepalLengthCm
5.0    10
6.3     9
5.1     9
6.7     8
5.7     8
Name: count, dtype: int64

--- SepalWidthCm ---
SepalWidthCm
3.0    26
2.8    14
3.2    13
3.4    12
3.1    12
Name: count, dtype: int64

--- PetalLengthCm ---
PetalLengthCm
1.5    14
1.4    12
4.5     8
5.1     8
1.3     7
Name: count, dtype: int64

--- PetalWidthCm ---
PetalWidthCm
0.2    28
1.3    13
1.5    12
1.8    12
1.4     8
Name: count, dtype: int64

--- Species ---
Species
Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
Name: count, dtype: int64


In [13]:
# Basic data validation checks to ensure dataset is usable

def validate_dataset(df):
    """
    Run basic structural checks on a dataset and report the findings.

    Parameters
    ----------
    df : pandas.DataFrame
        The dataset to check. Anything that is not a DataFrame (for example
        the None returned by a failed load) is rejected.

    Returns
    -------
    bool
        False when the input is not a DataFrame, is empty, has duplicate
        column names, has fewer than 2 columns, or has fewer than 2 rows.
        True otherwise. Entirely empty columns and ID-like columns are only
        reported; they do not fail validation.
    """

    # Check 0: reject non-DataFrame input instead of failing on attribute access
    if not isinstance(df, pd.DataFrame):
        print(" Error: Expected a pandas DataFrame.")
        return False

    # Check 1: DataFrame should not be empty
    if df.empty:
        print(" Error: The dataset is empty.")
        return False

    # Check 2: Check for duplicate column names
    if df.columns.duplicated().any():
        print(" Error: Duplicate column names found.")
        return False

    # Check 3: Ensure we have at least 2 columns (features + possible target)
    if df.shape[1] < 2:
        print(" Error: The dataset must have at least 2 columns.")
        return False

    # Check 4: Ensure dataset has more than 1 row
    if df.shape[0] < 2:
        print(" Error: The dataset must have more than 1 row.")
        return False

    # Check 5: Identify completely empty columns (warning only)
    empty_cols = df.columns[df.isnull().all()].tolist()
    if empty_cols:
        print(f"⚠️ Warning: These columns are entirely empty: {empty_cols}")

    # Check 6: Identify ID-like columns (as many distinct values as rows).
    # Column names are known to be unique here (Check 2), so one
    # DataFrame-level nunique() gives exactly one count per column.
    unique_counts = df.nunique()
    id_like_cols = unique_counts.index[unique_counts == len(df)].tolist()
    if id_like_cols:
        print(f" Info: Potential ID columns detected: {id_like_cols}")

    print("✔ Dataset validation passed.")
    return True


# Validate the frame loaded earlier; when loading failed there is nothing to check.
if data is None:
    print(" No dataset loaded.")
else:
    validate_dataset(data)


 Info: Potential ID columns detected: ['Id']
✔ Dataset validation passed.


In [14]:
# Master function to load + validate the dataset

def load_and_validate(file_path):
    """
    Load a CSV file, validate its structure, and display a short overview.

    Runs load_csv() followed by validate_dataset(); when both succeed, the
    first rows and the descriptive statistics of the frame are displayed.

    Parameters
    ----------
    file_path
        Path of the CSV file, passed unchanged to load_csv().

    Returns
    -------
    pandas.DataFrame or None
        The frame exactly as loaded (no cleaning is applied), or None when
        loading or validation failed.
    """

    print("📥 Step 1: Loading dataset...")
    df = load_csv(file_path)

    # load_csv signals failure with None and has already printed the reason
    if df is None:
        print(" Failed at loading stage.")
        return None

    print("\n📋 Step 2: Validating dataset structure...")
    if not validate_dataset(df):
        print(" Dataset validation failed.")
        return None

    print("\n📊 Step 3: Showing basic dataset overview...")

    # Display previews only if dataset is valid
    display(df.head())
    display(df.describe())

    print("\n Dataset successfully loaded and validated!")
    return df


# Run the full loader on our sample data
final_df = load_and_validate(data_path)
# load_and_validate returns None when loading or validation fails, so only
# preview the frame when one was actually returned (avoids an AttributeError).
final_df.head() if final_df is not None else None


📥 Step 1: Loading dataset...
✔ CSV loaded successfully! Shape: (150, 6)

📋 Step 2: Validating dataset structure...
 Info: Potential ID columns detected: ['Id']
✔ Dataset validation passed.

📊 Step 3: Showing basic dataset overview...


Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
count,150.0,150.0,150.0,150.0,150.0
mean,75.5,5.843333,3.054,3.758667,1.198667
std,43.445368,0.828066,0.433594,1.76442,0.763161
min,1.0,4.3,2.0,1.0,0.1
25%,38.25,5.1,2.8,1.6,0.3
50%,75.5,5.8,3.0,4.35,1.3
75%,112.75,6.4,3.3,5.1,1.8
max,150.0,7.9,4.4,6.9,2.5



 Dataset successfully loaded and validated!


Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa
