In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Load the dataset
# The file is read with header=None, so its first line is treated as data and
# the column labels below are supplied explicitly, in file order.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data"
column_names = [
    "Class",
    "Alcohol",
    "Malic_acid",
    "Ash",
    "Alcalinity_of_ash",
    "Magnesium",
    "Total_phenols",
    "Flavanoids",
    "Nonflavanoid_phenols",
    "Proanthocyanins",
    "Color_intensity",
    "Hue",
    "OD280_OD315",
    "Proline",
]
data = pd.read_csv(url, names=column_names, header=None)

# Step 1: Understanding the data
print("Dataset Info:")
# DataFrame.info() writes its summary straight to stdout and returns None, so
# it is called on its own; wrapping it in print() appends a stray "None" line.
data.info()
print("\nFirst few rows:")
print(data.head())

# Step 2: Handle missing values (if any)
# Per-column count of missing entries; a column of zeros means there is
# nothing to impute or drop.
print("\nCheck for missing values:")
print(data.isna().sum())
# If any column reports missing values, either fill them or drop the rows, e.g.:
# data = data.ffill()
# data = data.dropna()

# Step 3: Remove duplicates
# Exact duplicate rows are dropped in place, keeping the first occurrence.
# The surviving rows keep their original index labels (the index is not reset).
data.drop_duplicates(keep="first", inplace=True)
# Show the (rows, columns) shape that remains after de-duplication.
print("\nCheck for duplicates removed:", data.shape, sep="\n")

# Step 4: Standardize the numerical features
# Every column except the "Class" target is rescaled with StandardScaler.
scaler = StandardScaler()
numerical_features = data.drop(columns=["Class"])  # Exclude the target column for scaling
# fit_transform() hands back a bare array, so the row labels have to be
# re-attached explicitly. Reusing the original index matters: after
# drop_duplicates() the index of `data` can contain gaps, and a fresh 0..n-1
# index would no longer line up with data["Class"] in the concat below.
data_scaled = pd.DataFrame(
    scaler.fit_transform(numerical_features),
    columns=numerical_features.columns,
    index=numerical_features.index,
)

# Combine scaled data with the target column
# pd.concat(axis=1) aligns rows by index label; both pieces share the same
# labels here, so every row keeps its own class and no NaN rows are introduced.
data_cleaned = pd.concat([data["Class"], data_scaled], axis=1)

# Step 5: Validate the data
# Structural checks: the cleaned frame must have exactly the rows of `data`
# (same index labels, same order) and the same columns in the same order.
# A mismatch means the target and the scaled features were misaligned when
# they were combined.
assert data_cleaned.index.equals(data.index), (
    "data_cleaned rows do not line up with data; check index alignment in the concat"
)
assert list(data_cleaned.columns) == list(data.columns), (
    "data_cleaned columns differ from the original column layout"
)
print("\nCleaned and Prepared Data:")
print(data_cleaned.head())

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178 entries, 0 to 177
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Class                 178 non-null    int64  
 1   Alcohol               178 non-null    float64
 2   Malic_acid            178 non-null    float64
 3   Ash                   178 non-null    float64
 4   Alcalinity_of_ash     178 non-null    float64
 5   Magnesium             178 non-null    int64  
 6   Total_phenols         178 non-null    float64
 7   Flavanoids            178 non-null    float64
 8   Nonflavanoid_phenols  178 non-null    float64
 9   Proanthocyanins       178 non-null    float64
 10  Color_intensity       178 non-null    float64
 11  Hue                   178 non-null    float64
 12  OD280_OD315           178 non-null    float64
 13  Proline               178 non-null    int64  
dtypes: float64(11), int64(3)
memory usage: 19.6 KB
None

First f

In [4]:
# Write the cleaned frame to a CSV in the current working directory
# (index=False leaves the row index out of the file).
data_cleaned.to_csv('wine_data_cleaned.csv', index=False)

# Offer the file as a browser download when running inside Google Colab.
# The google.colab package only exists in the Colab runtime, so anywhere else
# the download step is skipped instead of aborting the cell with ImportError
# after the file has already been written.
try:
    from google.colab import files
except ImportError:
    print("google.colab is not available; 'wine_data_cleaned.csv' was saved locally.")
else:
    files.download('wine_data_cleaned.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>