In [55]:
import pandas as pd

def _fill_missing_values(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with null values imputed column by column.

    Numeric (non-boolean) columns are filled with the column mean; every
    other column is filled with its most frequent value (mode).
    """
    filled = df.copy()
    for col in filled.columns:
        column = filled[col]
        if not column.isnull().any():
            continue

        is_numeric = (
            pd.api.types.is_numeric_dtype(column)
            and not pd.api.types.is_bool_dtype(column)
        )
        if is_numeric:
            # For an all-null column the mean is NaN, so it stays null.
            fill_value = column.mean()
        else:
            modes = column.mode()
            if modes.empty:
                # No non-null value exists to impute from; leave as is.
                continue
            fill_value = modes.iloc[0]

        # Assign the result back instead of `df[col].fillna(..., inplace=True)`:
        # the in-place form acts on an intermediate object and does not
        # update the frame when pandas Copy-on-Write is active.
        filled[col] = column.fillna(fill_value)
    return filled


def process_dataset(file_name: str) -> None:
    """Load a CSV file and print an exploratory summary of it.

    The steps are: read the file, show the first and last five rows, print
    ``info()`` and ``describe()``, report null counts, impute nulls (mean
    for numeric columns, mode for the rest), report null counts again, and
    print the first rows of the one-hot encoded frame.

    Args:
        file_name: Path of the CSV file to read.

    Returns:
        None. All results are printed to standard output. If the file does
        not exist, a message is printed and the function returns early.
    """
    print(f"\n===== Processing: {file_name} =====\n")

    # i) Read the dataset
    try:
        df = pd.read_csv(file_name)
    except FileNotFoundError:
        print(f"File {file_name} not found.\n")
        return

    # ii) Display first 5 and last 5 records
    print("First 5 records:\n", df.head())
    print("\nLast 5 records:\n", df.tail())

    # iii) Display info and describe the dataset
    print("\n--- Info ---")
    df.info()  # writes directly to stdout and returns None
    print("\n--- Description ---")
    print(df.describe(include='all'))

    # iv) Find out null values
    print("\n--- Null values ---")
    print(df.isnull().sum())

    # v) Fill the null values based on data type
    df = _fill_missing_values(df)

    print("\n--- Null values after filling ---")
    print(df.isnull().sum())

    # vi) Convert categorical columns into numerical columns
    # drop_first=True drops one indicator per category to avoid redundancy.
    df_encoded = pd.get_dummies(df, drop_first=True)
    print("\n--- Encoded Data Sample (first 5 rows) ---")
    print(df_encoded.head())

# CSV files to run through process_dataset, in the order they are processed
datasets = [
    "claimants.csv",
    "framingham.csv",
    "iris.csv",
    "churn.csv",
]

# Run the full summary/cleaning routine on each file in turn
for data in datasets:
    process_dataset(data)



===== Processing: claimants.csv =====

First 5 records:
    ATTORNEY  CLMSEX  CLMINSUR  SEATBELT  CLMAGE    LOSS
0         0     0.0       1.0       0.0    50.0  34.940
1         1     1.0       0.0       0.0    18.0   0.891
2         1     0.0       1.0       0.0     5.0   0.330
3         0     0.0       1.0       1.0    31.0   0.037
4         1     0.0       1.0       0.0    30.0   0.038

Last 5 records:
       ATTORNEY  CLMSEX  CLMINSUR  SEATBELT  CLMAGE   LOSS
1335         1     0.0       1.0       0.0     NaN  0.576
1336         0     1.0       1.0       0.0    46.0  3.705
1337         1     1.0       1.0       0.0    39.0  0.099
1338         0     1.0       0.0       0.0     8.0  3.177
1339         1     1.0       1.0       0.0    30.0  0.688

--- Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1340 entries, 0 to 1339
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   ATTORNEY  1340 non-null   int64  
 1 