In [1]:
import pandas as pd
column_names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species']

df = pd.read_csv('Iris.csv', header=None ,names=column_names)
numeric_columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']

df[numeric_columns] = df[numeric_columns].apply(pd.to_numeric, errors='coerce')

In [2]:
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
Id,,,,,Species
1,5.1,3.5,1.4,0.2,Iris-setosa
2,4.9,3.0,1.4,0.2,Iris-setosa
3,4.7,3.2,1.3,0.2,Iris-setosa
4,4.6,3.1,1.5,0.2,Iris-setosa


In [3]:
df.tail()


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
146,6.7,3.0,5.2,2.3,Iris-virginica
147,6.3,2.5,5.0,1.9,Iris-virginica
148,6.5,3.0,5.2,2.0,Iris-virginica
149,6.2,3.4,5.4,2.3,Iris-virginica
150,5.9,3.0,5.1,1.8,Iris-virginica


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 151 entries, Id to 150
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       151 non-null    object 
dtypes: float64(4), object(1)
memory usage: 7.1+ KB


In [5]:
df.isnull()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
Id,True,True,True,True,False
1,False,False,False,False,False
2,False,False,False,False,False
3,False,False,False,False,False
4,False,False,False,False,False
...,...,...,...,...,...
146,False,False,False,False,False
147,False,False,False,False,False
148,False,False,False,False,False
149,False,False,False,False,False


In [6]:
df.dtypes


sepal_length    float64
sepal_width     float64
petal_length    float64
petal_width     float64
species          object
dtype: object

In [7]:
# 1. How do you load a CSV file into a Pandas DataFrame?
# You use pd.read_csv('Iris.csv') to load a CSV file into a data frame.
# 2. What information does the info() function provide about the dataset?
# It provides the number of rows and columns, column names, data types, non-null counts for each column, and memory usage.
# 3. How can you identify missing values in the dataset?
# Use isnull().sum() to check for missing values. It will return the count of missing values in each column.

In [8]:
# Convert the numeric columns to proper numeric values, coercing errors to NaN
df[numeric_columns] = df[numeric_columns].apply(pd.to_numeric, errors='coerce')

# Now fill NaN values with median or mean
df_numeric = df[numeric_columns].fillna(df[numeric_columns].median())

print("\nData set after missing values:")
print(df.isnull().sum())



Data set after missing values:
sepal_length    1
sepal_width     1
petal_length    1
petal_width     1
species         0
dtype: int64


In [9]:
df[numeric_columns] = df[numeric_columns].ffill()

In [10]:
# What strategy did you use to handle missing values, and why?
# For data with very few missing values, dropna() can remove rows directly. Using fillna() with the mean or median for numerical columns helps avoid data loss and keeps value consistency. In time series, ffill or bfill preserves trends.
# How did filling in missing values affect the dataset?
# If values are missing completely at random, the data sample is likely still representative of the population. However if the values are missing systematically, analysis may be biased.
# When might it be more appropriate to drop rows with missing values instead of filling them?
# One common approach to handling missing values is to delete them from the dataset. This approach is appropriate when the number of missing values is small and randomly distributed across the dataset.

In [11]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
df_scaled = scaler.fit_transform(df_numeric)

In [12]:
df_encoded = pd.get_dummies(df['species'])

In [13]:
df['sepal_length'] = pd.to_numeric(df['sepal_length'], errors='coerce')
df['sepal_length_binned'] = pd.cut(df['sepal_length'], bins=3, labels=['Short', 'Medium', 'Long'])
print(df[['sepal_length', 'sepal_length_binned']].tail(10))

     sepal_length sepal_length_binned
141           6.7              Medium
142           6.9                Long
143           5.8              Medium
144           6.8                Long
145           6.7              Medium
146           6.7              Medium
147           6.3              Medium
148           6.5              Medium
149           6.2              Medium
150           5.9              Medium


In [14]:
#Questions
# 1. What is the difference between normalization and standardization?
# Normalization (Min-Max Scaling) rescales the values ​​of a numeric column so that they lie between 0 and 1. 
# Standardization (Z-score scaling) centers the data around a mean of 0 with a standard deviation of 1. 
# 2. How does one-hot encoding transform categorical variables?
# One-hot encoding transforms categorical variables by creating a new binary column for each category. Each new column has a value of 1 if the original variable was that category, and 0 otherwise.
# 3. Why might you want to bin continuous variables into categories?
# To simplify the model, reduce sensitivity to outliers, or use algorithms that require categorical data.

In [15]:
#Ex4

In [16]:
df['petal_area'] = df['petal_length'] * df['petal_width']

In [17]:
# Thre is no any datasets that related with datetime!


In [18]:
df['petal_ratio'] = df['petal_length'] / df['petal_width']

In [19]:
# Questions:
# What new features did you create, and why?
# I created the total area for operation and analysis 
# # How did the new features improve the dataset?
# # How can date-based features be useful in a dataset?


In [20]:
#Ex5

In [21]:
duplicates = df.duplicated().sum()

In [22]:
df_cleaned = df.drop_duplicates()

In [23]:
from scipy import stats
import numpy as np
df_no_outliers = df[(np.abs(stats.zscore(df_numeric)) < 3).all(axis=1)]

In [24]:
df['species'] = df['species'].str.lower()

In [25]:
# 1. How did you identify and handle duplicate rows in the dataset?
# Using the df.duplicated() function, which detects duplicates. Then remove them using drop_duplicates().
# 2. What method did you use to detect and remove outliers, and why?
# Depending on your data you can use whatever you like but I used Zscore method  to figure out how far away a piece of data is from the average of a group, measured in standard deviations
# 3. How did you address inconsistencies in categorical data?
#By standardizing text (for example, lowercase) and combining similar categories.

In [26]:
#Ex6

In [35]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report


In [91]:

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

In [93]:
import pandas as pd
from sklearn.model_selection import train_test_split



In [95]:

column_names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species']

df = pd.read_csv('Iris.csv', header=None ,names=column_names)



In [97]:
X = df.drop('species', axis=1)
y = df['species']

In [99]:
print("Columns in X:")
print(X.columns)

Columns in X:
Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width'], dtype='object')


In [101]:
# Split the data into training and testing sets (70-30)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Check the sizes of the splits
print(f"Training set size: {X_train.shape[0]}")
print(f"Testing set size: {X_test.shape[0]}")


Training set size: 105
Testing set size: 46


In [103]:
# 80-20 split
X_train_80, X_test_80, y_train_80, y_test_80 = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"80-20 split - Training set size: {X_train_80.shape[0]}, Testing set size: {X_test_80.shape[0]}")


80-20 split - Training set size: 120, Testing set size: 31


In [105]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report


In [107]:
print("Data types in X_train:")
print(X_train.dtypes)

Data types in X_train:
sepal_length    object
sepal_width     object
petal_length    object
petal_width     object
dtype: object


In [109]:
#EX7

In [111]:
# Load the dataset
data = pd.read_csv('Iris.csv')

# Display the first few rows
data.head()


Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [113]:
X = df.drop('species', axis=1)
y = df['species']


In [115]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [117]:
# Assuming the dataset has mixed types
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X.select_dtypes(include=['object']).columns.tolist()


In [119]:
# Create a preprocessing pipeline
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),  # Fill missing values with the mean
    ('scaler', StandardScaler())                   # Scale numerical features
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),  # Fill missing values
    ('onehot', OneHotEncoder(handle_unknown='ignore'))                      # One-hot encode categorical features
])

# Combine the two transformers into a preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])


In [121]:
# Create a complete pipeline with preprocessing and model
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))  # Example model
])


In [123]:
model_pipeline.fit(X_train, y_train)


In [125]:
# Make predictions
y_pred = model_pipeline.predict(X_test)

# Evaluate the model
print(classification_report(y_test, y_pred))


                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        19
Iris-versicolor       1.00      0.77      0.87        13
 Iris-virginica       0.82      1.00      0.90        14

       accuracy                           0.93        46
      macro avg       0.94      0.92      0.92        46
   weighted avg       0.95      0.93      0.93        46



In [127]:
# 1.A data preprocessing pipeline automates the steps of data preparation, such as handling missing values, scaling features, and encoding categorical variables. It ensures that these steps are consistently applied to both training and testing data, reducing the chance of errors.
# 2.Categorical variables can be handled using encoders like OneHotEncoder, which converts categorical values into a format that can be provided to machine learning algorithms.
# 3.Feature scaling helps to standardize the range of independent variables or features of data.