# Datasets:
*  Iris Dataset (https://archive.ics.uci.edu/ml/datasets/iris)
*  Wine Dataset (https://archive.ics.uci.edu/ml/datasets/wine)
*  Glass Dataset (https://archive.ics.uci.edu/ml/datasets/glass+identification)

# Iris Dataset Preparation

In [3]:
import os

import pandas as pd
from sklearn.datasets import load_iris

# Load the Iris dataset
iris = load_iris()

# Create a DataFrame from the feature data and target labels
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
df['target'] = iris.target
# Map every integer label to its class name so the CSV is human-readable
df['target_name'] = [iris.target_names[i] for i in iris.target]

# Make sure the output directory exists before writing: to_csv does not
# create missing parent directories and would fail on a fresh checkout
os.makedirs('datasets', exist_ok=True)

# Save the DataFrame as a CSV file (index=False keeps the row index out of it)
df.to_csv('datasets/iris_dataset.csv', index=False)


In [4]:
# Read the saved CSV back from disk and preview its first five rows
iris_csv_path = 'datasets/iris_dataset.csv'
df_iris = pd.read_csv(iris_csv_path)
df_iris.head(5)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target,target_name
0,5.1,3.5,1.4,0.2,0,setosa
1,4.9,3.0,1.4,0.2,0,setosa
2,4.7,3.2,1.3,0.2,0,setosa
3,4.6,3.1,1.5,0.2,0,setosa
4,5.0,3.6,1.4,0.2,0,setosa


In [9]:
# Structural overview of the Iris frame, printed as three sections:
# dimensions, column data types, and per-column missing-value counts.
# A row of asterisks separates the sections to make the report easier to read.
separator = '*' * 30
print(
    f"Shape: \n{df_iris.shape}",
    separator,
    f"Datatypes: \n{df_iris.dtypes}",
    separator,
    f"Missing values: \n{df_iris.isnull().sum()}",
    sep='\n',
)

Shape: 
(150, 6)
******************************
Datatypes: 
sepal length (cm)    float64
sepal width (cm)     float64
petal length (cm)    float64
petal width (cm)     float64
target                 int64
target_name           object
dtype: object
******************************
Missing values: 
sepal length (cm)    0
sepal width (cm)     0
petal length (cm)    0
petal width (cm)     0
target               0
target_name          0
dtype: int64


# Wine Dataset Preparation

In [10]:
import pandas as pd
from sklearn.datasets import load_wine

# Load the Wine dataset
wine = load_wine()

# Create a DataFrame from the feature data and target labels
df = pd.DataFrame(data=wine.data, columns=wine.feature_names)
df['target'] = wine.target
# Map every integer label to its class name so the CSV is human-readable
df['target_name'] = [wine.target_names[i] for i in wine.target]

# Make sure the output directory exists before writing: to_csv does not
# create missing parent directories and would fail on a fresh checkout
os.makedirs('datasets', exist_ok=True)

# Save the DataFrame as a CSV file (index=False keeps the row index out of it)
df.to_csv('datasets/wine_dataset.csv', index=False)

In [11]:
# Read the saved CSV back from disk and preview its first five rows
wine_csv_path = 'datasets/wine_dataset.csv'
df_wine = pd.read_csv(wine_csv_path)
df_wine.head(5)

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target,target_name
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0,class_0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0,0,class_0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0,0,class_0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0,0,class_0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0,class_0


In [12]:
# Structural overview of the Wine frame, printed as three sections:
# dimensions, column data types, and per-column missing-value counts.
# A row of asterisks separates the sections to make the report easier to read.
separator = '*' * 30
print(
    f"Shape: \n{df_wine.shape}",
    separator,
    f"Datatypes: \n{df_wine.dtypes}",
    separator,
    f"Missing values: \n{df_wine.isnull().sum()}",
    sep='\n',
)

Shape: 
(178, 15)
******************************
Datatypes: 
alcohol                         float64
malic_acid                      float64
ash                             float64
alcalinity_of_ash               float64
magnesium                       float64
total_phenols                   float64
flavanoids                      float64
nonflavanoid_phenols            float64
proanthocyanins                 float64
color_intensity                 float64
hue                             float64
od280/od315_of_diluted_wines    float64
proline                         float64
target                            int64
target_name                      object
dtype: object
******************************
Missing values: 
alcohol                         0
malic_acid                      0
ash                             0
alcalinity_of_ash               0
magnesium                       0
total_phenols                   0
flavanoids                      0
nonflavanoid_phenols            0
proan

# Glass Dataset Preparation

In [23]:
import urllib.request

# Remote location of the Glass data in ARFF format, and the local file name
# it is stored under in the current working directory
url = 'https://github.com/deric/clustering-benchmark/raw/master/src/main/resources/datasets/real-world/glass.arff'
filename = 'glass.arff'

# Download the file; as the last expression of the cell, the returned
# (local file name, response headers) pair is displayed as the cell output
urllib.request.urlretrieve(url=url, filename=filename)

('glass.arff', <http.client.HTTPMessage at 0x1cf86e235b0>)

In [24]:
from scipy.io import arff
from sklearn.preprocessing import LabelEncoder

# Load the ARFF file; the records are the first element of the returned pair
data = arff.loadarff('glass.arff')

# Convert to a pandas DataFrame
df = pd.DataFrame(data[0])
# The class labels are decoded from bytes to regular strings
df['Class'] = df['Class'].str.decode('utf-8')
# Encode the class labels as integers for the 'target' column
df['target'] = LabelEncoder().fit_transform(df['Class'])
# Drop the literal quotes from the "'K'" column name and rename 'Class' to
# 'target_name'. Reassigning the result (instead of inplace=True) keeps the
# statement free of in-place mutation.
df = df.rename(columns={"'K'": 'K', "Class": "target_name"})

# Make sure the output directory exists before writing: to_csv does not
# create missing parent directories and would fail on a fresh checkout
os.makedirs('datasets', exist_ok=True)

# Save as CSV file (index=False keeps the row index out of it)
df.to_csv('datasets/glass_dataset.csv', index=False)

In [25]:
# Read the saved CSV back from disk and preview its first five rows
glass_csv_path = 'datasets/glass_dataset.csv'
df_glass = pd.read_csv(glass_csv_path)
df_glass.head(5)

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,target_name,target
0,1.51567,13.29,3.45,1.21,72.74,0.56,8.57,0.0,0.0,build_wind_float,0
1,1.52667,13.99,3.7,0.71,71.57,0.02,9.82,0.0,0.1,build_wind_float,0
2,1.51514,14.01,2.68,3.5,69.89,1.68,5.87,2.2,0.0,containers,2
3,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.0,0.0,build_wind_float,0
4,1.52068,13.55,2.09,1.67,72.18,0.53,9.57,0.27,0.17,build_wind_non-float,1


In [26]:
import os

# Clean up the downloaded ARFF file; its contents were already saved as CSV
file_path = 'glass.arff'

# Attempt the removal directly and handle the missing-file case, instead of
# checking os.path.exists() first: the file could disappear between the
# check and the removal, which would turn the cleanup into an unhandled error
try:
    os.remove(file_path)
except FileNotFoundError:
    print(f"The file '{file_path}' does not exist.")
else:
    print(f"The file '{file_path}' has been successfully removed.")

The file 'glass.arff' has been successfully removed.


In [27]:
# Structural overview of the Glass frame, printed as three sections:
# dimensions, column data types, and per-column missing-value counts.
# A row of asterisks separates the sections to make the report easier to read.
separator = '*' * 30
print(
    f"Shape: \n{df_glass.shape}",
    separator,
    f"Datatypes: \n{df_glass.dtypes}",
    separator,
    f"Missing values: \n{df_glass.isnull().sum()}",
    sep='\n',
)

Shape: 
(214, 11)
******************************
Datatypes: 
RI             float64
Na             float64
Mg             float64
Al             float64
Si             float64
K              float64
Ca             float64
Ba             float64
Fe             float64
target_name     object
target           int64
dtype: object
******************************
Missing values: 
RI             0
Na             0
Mg             0
Al             0
Si             0
K              0
Ca             0
Ba             0
Fe             0
target_name    0
target         0
dtype: int64
