In [3]:
import pandas as pd
df = pd.read_csv('cereals.csv')

# Display the 5-number summary for all numeric attributes
summary = df.describe().loc[['min', '25%', '50%', '75%', 'max']]
print(summary)


     calories  protein  fat  sodium  fiber  carbo  sugars  potass  vitamins  \
min      50.0      1.0  0.0     0.0   0.00   -1.0    -1.0    -1.0       0.0   
25%     100.0      2.0  0.0   132.5   0.75   12.0     3.0    40.0      25.0   
50%     110.0      2.5  1.0   180.0   1.75   14.5     7.0    90.0      25.0   
75%     110.0      3.0  2.0   212.5   3.00   17.0    11.0   120.0      25.0   
max     160.0      6.0  5.0   320.0  14.00   23.0    15.0   330.0     100.0   

     shelf  weight  cups     rating  
min    1.0     0.5  0.25  18.042851  
25%    1.0     1.0  0.67  32.932466  
50%    2.0     1.0  0.75  40.253086  
75%    3.0     1.0  1.00  50.780847  
max    3.0     1.5  1.50  93.704912  


In [13]:
# Replace -1 with NA to identify missing values
df.replace(-1, pd.NA, inplace=True)

# Calculate the mean for each numeric attribute and replace NA with the mean
numeric_columns = df.select_dtypes(include='number').columns
for column in numeric_columns:
    mean_value = df[column].mean()
    df[column] = df[column].fillna(mean_value)

print(df)


                      name mfr type  calories  protein  fat  sodium  fiber  \
0           100NaturalBran   Q    C       120        3    5      15    2.0   
1                 All-Bran   K    C        70        4    1     260    9.0   
2   All-BranwithExtraFiber   K    C        50        4    0     140   14.0   
3            AlmondDelight   R    C       110        2    2     200    1.0   
4    AppleCinnamonCheerios   G    C       110        2    2     180    1.5   
..                     ...  ..  ...       ...      ...  ...     ...    ...   
71                 Triples   G    C       110        2    1     250    0.0   
72                    Trix   G    C       110        1    1     140    0.0   
73               WheatChex   R    C       100        3    1     230    3.0   
74                Wheaties   G    C       100        3    1     200    3.0   
75       WheatiesHoneyGold   G    C       110        2    1     200    1.0   

   carbo sugars potass  vitamins  shelf  weight  cups     ratin

In [15]:
# Display the 5-number summary for all numeric attributes after treating missing values
summary_after_missing = df.describe().loc[['min', '25%', '50%', '75%', 'max']]
print(summary_after_missing)


     calories  protein  fat  sodium  fiber  vitamins  shelf  weight  cups  \
min      50.0      1.0  0.0     0.0   0.00       0.0    1.0     0.5  0.25   
25%     100.0      2.0  0.0   132.5   0.75      25.0    1.0     1.0  0.67   
50%     110.0      2.5  1.0   180.0   1.75      25.0    2.0     1.0  0.75   
75%     110.0      3.0  2.0   212.5   3.00      25.0    3.0     1.0  1.00   
max     160.0      6.0  5.0   320.0  14.00     100.0    3.0     1.5  1.50   

        rating  
min  18.042851  
25%  32.932466  
50%  40.253086  
75%  50.780847  
max  93.704912  


In [19]:
# Function to replace outliers with the median
def replace_outliers_with_median(column):
    median = column.median()
    q1 = column.quantile(0.25)
    q3 = column.quantile(0.75)
    #IQR=inter quartile range
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    column = column.apply(lambda x: median if x < lower_bound or x > upper_bound else x)
    return column

numeric_columns = df.select_dtypes(include='number').columns
for column in numeric_columns:
    df[column] = replace_outliers_with_median(df[column])

print(df)


                      name mfr type  calories  protein  fat  sodium  fiber  \
0           100NaturalBran   Q    C       120      3.0    5   180.0   2.00   
1                 All-Bran   K    C       110      4.0    1   260.0   1.75   
2   All-BranwithExtraFiber   K    C       110      4.0    0   140.0   1.75   
3            AlmondDelight   R    C       110      2.0    2   200.0   1.00   
4    AppleCinnamonCheerios   G    C       110      2.0    2   180.0   1.50   
..                     ...  ..  ...       ...      ...  ...     ...    ...   
71                 Triples   G    C       110      2.0    1   250.0   0.00   
72                    Trix   G    C       110      1.0    1   140.0   0.00   
73               WheatChex   R    C       100      3.0    1   230.0   3.00   
74                Wheaties   G    C       100      3.0    1   200.0   3.00   
75       WheatiesHoneyGold   G    C       110      2.0    1   200.0   1.00   

   carbo sugars potass  vitamins  shelf  weight  cups     ratin

In [21]:
# after treating outliers
summary_after_noisy = df.describe().loc[['min', '25%', '50%', '75%', 'max']]
print(summary_after_noisy)

     calories  protein  fat  sodium  fiber  vitamins  shelf  weight  cups  \
min      90.0     1.00  0.0    70.0  0.000      25.0    1.0     1.0  0.25   
25%     100.0     2.00  0.0   157.5  0.750      25.0    1.0     1.0  0.67   
50%     110.0     2.25  1.0   180.0  1.625      25.0    2.0     1.0  0.75   
75%     110.0     3.00  2.0   210.0  3.000      25.0    3.0     1.0  1.00   
max     120.0     4.00  5.0   290.0  6.000      25.0    3.0     1.0  1.33   

        rating  
min  18.042851  
25%  32.932466  
50%  40.179526  
75%  50.031833  
max  74.472949  
