In [1]:
# 1. Import all the required Python Libraries.
import pandas as pd
import numpy as np

# 2. Locate an open source data from the web.
# In this example, I'll use the Iris dataset available at UCI ML Repository.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"

# 3. Load the Dataset into pandas data frame.
column_names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'class']
iris_df = pd.read_csv(url, names=column_names)

# Display the first few rows of the dataset to verify the import.
print("First few rows of the Iris dataset:")
print(iris_df.head())

# 4. Data Preprocessing:
# Check for missing values using pandas info(), describe() functions.
print("\nInformation about the dataset:")
print(iris_df.info())

print("\nDescriptive statistics of the dataset:")
print(iris_df.describe())

# Variable Descriptions:
# - Sepal Length, Sepal Width, Petal Length, Petal Width: Numeric variables.
# - Class: Categorical variable representing the species of iris flowers.

# Check the dimensions of the data frame.
print("\nDimensions of the dataset (rows, columns):", iris_df.shape)

# 5. Data Formatting and Normalization:
# Summarize the types of variables by checking data types.
print("\nData Types of Variables:")
print(iris_df.dtypes)

# Ensure that numeric variables are in the correct data type.
# In this case, they are already in the correct data types (float64).

# 6. Turn categorical variables into quantitative variables.
# The 'class' variable is categorical; we can use one-hot encoding to convert it to quantitative.
iris_df = pd.get_dummies(iris_df, columns=['class'], drop_first=True)

# Display the updated dataframe.
print("\nUpdated DataFrame after one-hot encoding:")
print(iris_df.head())


First few rows of the Iris dataset:
   sepal_length  sepal_width  petal_length  petal_width        class
0           5.1          3.5           1.4          0.2  Iris-setosa
1           4.9          3.0           1.4          0.2  Iris-setosa
2           4.7          3.2           1.3          0.2  Iris-setosa
3           4.6          3.1           1.5          0.2  Iris-setosa
4           5.0          3.6           1.4          0.2  Iris-setosa

Information about the dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   class         150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB
None

Descriptive statistics of the dataset:
       sepal_len

In [3]:
#Step 1
#import libraries
import pandas as pd
import numpy as np
from sklearn import preprocessing
#Step 2
#Load Dataset into dataframe
data_url = "iris.csv"
df = pd.read_csv(data_url)
df

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...,...
145,146,6.7,3.0,5.2,2.3,Iris-virginica
146,147,6.3,2.5,5.0,1.9,Iris-virginica
147,148,6.5,3.0,5.2,2.0,Iris-virginica
148,149,6.2,3.4,5.4,2.3,Iris-virginica


In [6]:
#Step 3
#Data preprocessing
missing_value=df.isnull().sum()
print("Missing value= ",missing_value)


Missing value=  Id               0
SepalLengthCm    0
SepalWidthCm     0
PetalLengthCm    0
PetalWidthCm     0
Species          0
dtype: int64


In [11]:
description=df.describe()
print(description)


               Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm
count  150.000000     150.000000    150.000000     150.000000    150.000000
mean    75.500000       5.843333      3.054000       3.758667      1.198667
std     43.445368       0.828066      0.433594       1.764420      0.763161
min      1.000000       4.300000      2.000000       1.000000      0.100000
25%     38.250000       5.100000      2.800000       1.600000      0.300000
50%     75.500000       5.800000      3.000000       4.350000      1.300000
75%    112.750000       6.400000      3.300000       5.100000      1.800000
max    150.000000       7.900000      4.400000       6.900000      2.500000


In [13]:

variable_descriptions = {
 "sepal_length_cm": "Length of the sepal (in centimeters)",
 "sepal_width_cm": "Width of the sepal (in centimeters)",
 "petal_length_cm": "Length of the petal (in centimeters)",
 "petal_width_cm": "Width of the petal (in centimeters)",
 "species": "Species of the iris flower (setosa, versicolor, or virginica)"}
print("Variable description of dataset: ",variable_descriptions)

Variable description of dataset:  {'sepal_length_cm': 'Length of the sepal (in centimeters)', 'sepal_width_cm': 'Width of the sepal (in centimeters)', 'petal_length_cm': 'Length of the petal (in centimeters)', 'petal_width_cm': 'Width of the petal (in centimeters)', 'species': 'Species of the iris flower (setosa, versicolor, or virginica)'}


In [14]:
dimensions=df.shape
print("Dimensions of dataset: ",dimensions)


Dimensions of dataset:  (150, 6)


In [15]:
#Step 4
# Data Formatting and Data Normalization
variable_types=df.dtypes
print("Variable types: ",variable_types)
#from sklearn.preproccesing import LabelEncoder
encoder = preprocessing.LabelEncoder()
df["Species"]=encoder.fit_transform(df["Species"])
df


Variable types:  Id                 int64
SepalLengthCm    float64
SepalWidthCm     float64
PetalLengthCm    float64
PetalWidthCm     float64
Species           object
dtype: object


Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,0
1,2,4.9,3.0,1.4,0.2,0
2,3,4.7,3.2,1.3,0.2,0
3,4,4.6,3.1,1.5,0.2,0
4,5,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...,...
145,146,6.7,3.0,5.2,2.3,2
146,147,6.3,2.5,5.0,1.9,2
147,148,6.5,3.0,5.2,2.0,2
148,149,6.2,3.4,5.4,2.3,2


In [16]:
print(df.columns)

Index(['Id', 'SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm',
       'Species'],
      dtype='object')


In [17]:

variable_types=df.dtypes
print(variable_types)

Id                 int64
SepalLengthCm    float64
SepalWidthCm     float64
PetalLengthCm    float64
PetalWidthCm     float64
Species            int32
dtype: object
