
#  Data Wrangling 

Written by: M.Danish Azeem\
Date: 01.04.2024\
Email: danishazeem365@gmail.com

In [19]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np


In [20]:
df = sns.load_dataset("iris")
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [22]:
df.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [23]:
df.dtypes

sepal_length    float64
sepal_width     float64
petal_length    float64
petal_width     float64
species          object
dtype: object

In [24]:
df.isnull().sum() / len(df) * 100

sepal_length    0.0
sepal_width     0.0
petal_length    0.0
petal_width     0.0
species         0.0
dtype: float64

In [25]:
df["species"] = df["species"].astype("category")

In [26]:
df.columns

Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width',
       'species'],
      dtype='object')

In [27]:
df.boxplot()

<Axes: >

In [28]:
df.shape

(150, 5)

In [29]:
# Remove outliers using the IQR method
Q1 = df['sepal_width'].quantile(0.25)
Q3 = df['sepal_width'].quantile(0.75)
IQR = Q3 - Q1
IQR
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
df = df[(df['sepal_width'] > lower_bound) & (df['sepal_width'] < upper_bound)]  

In [30]:
df.boxplot()

<Axes: >

In [31]:
df.duplicated().sum()

1

In [32]:
df_dublicates = df[df.duplicated()]
df_dublicates.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
142,5.8,2.7,5.1,1.9,virginica


In [33]:
df.drop_duplicates(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop_duplicates(inplace=True)


In [34]:
df.shape

(145, 5)

In [35]:
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [40]:
df["petal_area"] = df["petal_length"] * df["petal_width"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["petal_area"] = df["petal_length"] * df["petal_width"]


In [41]:
df["species_first_letter"] = df["species"].str[0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["species_first_letter"] = df["species"].str[0]


In [43]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler() ; df[["sepal_length", "sepal_width"]] = scaler.fit_transform(df[["sepal_length", "sepal_width"]])



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  scaler = MinMaxScaler() ; df[["sepal_length", "sepal_width"]] = scaler.fit_transform(df[["sepal_length", "sepal_width"]])


In [45]:
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,petal_area,species_first_letter
0,0.222222,0.722222,1.4,0.2,setosa,0.28,s
1,0.166667,0.444444,1.4,0.2,setosa,0.28,s
2,0.111111,0.555556,1.3,0.2,setosa,0.26,s
3,0.083333,0.500000,1.5,0.2,setosa,0.30,s
4,0.194444,0.777778,1.4,0.2,setosa,0.28,s
...,...,...,...,...,...,...,...
145,0.666667,0.444444,5.2,2.3,virginica,11.96,v
146,0.555556,0.166667,5.0,1.9,virginica,9.50,v
147,0.611111,0.444444,5.2,2.0,virginica,10.40,v
148,0.527778,0.666667,5.4,2.3,virginica,12.42,v


In [46]:
df.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,petal_area
count,145.0,145.0,145.0,145.0,145.0
mean,0.432567,0.468199,3.797931,1.215862,5.895793
std,0.232492,0.221231,1.760819,0.759905,4.712281
min,0.0,0.0,1.0,0.1,0.11
25%,0.222222,0.333333,1.6,0.3,0.42
50%,0.416667,0.444444,4.4,1.3,5.72
75%,0.583333,0.611111,5.1,1.8,9.8
max,1.0,1.0,6.9,2.5,15.87


In [49]:
df.to_excel("preprocessed_iris_data.xlsx")