# <u>**Part 1**



In [1]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer

In [2]:
data = {
    'size': ['XL', 'L', 'M', np.nan, 'M', 'M'],
    'color': ['red', 'green', 'blue', 'green', 'red', 'green'],
    'gender': ['female', 'male', np.nan, 'female', 'female', 'male'],
    'price': [199.0, 89.0, np.nan, 129.0, 79.0, 89.0],
    'weight': [500, 450, 300, np.nan, 410, np.nan],
    'bought': ['yes', 'no', 'yes', 'no', 'yes', 'no']
}

df = pd.DataFrame(data)

### **Exercise 1**

check for the missing values for each column (in percent), round the result to two decimal places

In [3]:
df

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500.0,yes
1,L,green,male,89.0,450.0,no
2,M,blue,,,300.0,yes
3,,green,female,129.0,,no
4,M,red,female,79.0,410.0,yes
5,M,green,male,89.0,,no


In [4]:
np.round(df.isnull().sum()/len(df), 2)

size      0.17
color     0.00
gender    0.17
price     0.17
weight    0.33
bought    0.00
dtype: float64

### **Exercise 2**

Fill in the missing data for the column weight with the average value. Assign changes to the df DataFrame.



In [5]:
df['weight'].mean()

415.0

In [6]:
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
df[['weight']] = imputer.fit_transform(df[['weight']])
df

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500.0,yes
1,L,green,male,89.0,450.0,no
2,M,blue,,,300.0,yes
3,,green,female,129.0,415.0,no
4,M,red,female,79.0,410.0,yes
5,M,green,male,89.0,415.0,no


### **Exercise 3**

Print the average value that was replaced with the missing values in the weight column.

In [7]:
imputer.statistics_[0]

415.0

### **Exercise 4**

fill in the missing data for the price column with a constant value = 99.0.Assign changes to the df DataFrame

In [8]:
imputer = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=99.0)
df[['price']] = imputer.fit_transform(df[['price']])
df

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500.0,yes
1,L,green,male,89.0,450.0,no
2,M,blue,,99.0,300.0,yes
3,,green,female,129.0,415.0,no
4,M,red,female,79.0,410.0,yes
5,M,green,male,89.0,415.0,no


### **Exercise 5**

Fill in the missing values for the size column with the most frequent value of this column.

In [9]:
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
df[['size']] = imputer.fit_transform(df[['size']])
df

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500.0,yes
1,L,green,male,89.0,450.0,no
2,M,blue,,99.0,300.0,yes
3,M,green,female,129.0,415.0,no
4,M,red,female,79.0,410.0,yes
5,M,green,male,89.0,415.0,no


### **Exercise 6**

Using the original dataframe, first extract all rows of the df that don't have the value np.nan in column weight, then using this rows calculate the average value for all numeric columns

In [10]:
data = {
    'size': ['XL', 'L', 'M', np.nan, 'M', 'M'],
    'color': ['red', 'green', 'blue', 'green', 'red', 'green'],
    'gender': ['female', 'male', np.nan, 'female', 'female', 'male'],
    'price': [199.0, 89.0, np.nan, 129.0, 79.0, 89.0],
    'weight': [500, 450, 300, np.nan, 410, np.nan],
    'bought': ['yes', 'no', 'yes', 'no', 'yes', 'no']
}

df = pd.DataFrame(data)

In [11]:
df[~df['weight'].isnull()].mean()

  """Entry point for launching an IPython kernel.


price     122.333333
weight    415.000000
dtype: float64

### **Exercise 7**

Extract columns of object type from this DataFrame. Then fill in all the missing values for these columns with the value 'empty'. Assign the result to the df_object variable

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   size    5 non-null      object 
 1   color   6 non-null      object 
 2   gender  5 non-null      object 
 3   price   5 non-null      float64
 4   weight  4 non-null      float64
 5   bought  6 non-null      object 
dtypes: float64(2), object(4)
memory usage: 416.0+ bytes


In [13]:
df_object = df.select_dtypes(include=['object']).fillna('empty')
df_object

Unnamed: 0,size,color,gender,bought
0,XL,red,female,yes
1,L,green,male,no
2,M,blue,empty,yes
3,empty,green,female,no
4,M,red,female,yes
5,M,green,male,no


### **Exercise 8**

The following dataframe is given

* df = pd.DataFrame(data={'weight': [75., 78.5, 85., 91., 84.5, 83., 68.]})

Discretize the weight column. Divide the values of this column into three intervals of equal width. Assign the result to a new column 'weight_cut' as shown below.



In [14]:
df = pd.DataFrame(data={'weight': [75., 78.5, 85., 91., 84.5, 83., 68.]})

In [15]:
df['weight_cut'] = pd.cut(df['weight'], bins=3)
df

Unnamed: 0,weight,weight_cut
0,75.0,"(67.977, 75.667]"
1,78.5,"(75.667, 83.333]"
2,85.0,"(83.333, 91.0]"
3,91.0,"(83.333, 91.0]"
4,84.5,"(83.333, 91.0]"
5,83.0,"(75.667, 83.333]"
6,68.0,"(67.977, 75.667]"


### **Exercise 9**

Using the same dataframe, discretize the column weight into three intervals with the given boundaries

* [60, 75]
* [75, 80]
* [80, 95]

Assign the result to a new column 'weight_cut'

In [16]:
df = pd.DataFrame(data={'weight': [75., 78.5, 85., 91., 84.5, 83., 68.]})

In [17]:
df['weight_cut'] = pd.cut(df['weight'], bins=(60, 75, 80, 95))
df

Unnamed: 0,weight,weight_cut
0,75.0,"(60, 75]"
1,78.5,"(75, 80]"
2,85.0,"(80, 95]"
3,91.0,"(80, 95]"
4,84.5,"(80, 95]"
5,83.0,"(80, 95]"
6,68.0,"(60, 75]"


### **Exercise 10**

Discretize the column weight into three intervals with the given boundaries:

* [60, 75]
* [75, 80]
* [80, 95]

and bound to them the following labels

* light
* normal
* heavy

Assign the result to a new column 'weight_cut'

In [18]:
df = pd.DataFrame(data={'weight': [75., 78.5, 85., 91., 84.5, 83., 68.]})

In [19]:
df['weight_cut'] = pd.cut(df['weight'], bins=(60, 75, 80, 95),
                          labels=['light', 'normal', 'heavy'])
df

Unnamed: 0,weight,weight_cut
0,75.0,light
1,78.5,normal
2,85.0,heavy
3,91.0,heavy
4,84.5,heavy
5,83.0,heavy
6,68.0,light
