![](https://lms.skillfactory.ru/assets/courseware/v1/824479af6a8c599b5138ae967c573e24/asset-v1:SkillFactory+DST-3.0+28FEB2021+type@asset+block/dst-eda-1-2.png) <br/><br/>
![](https://lms.skillfactory.ru/assets/courseware/v1/1470f05621fde7ab0edeee9fcae6ee19/asset-v1:SkillFactory+DST-3.0+28FEB2021+type@asset+block/dst-eda-1-1.png) <br/><br/>

## Methods
[1. FEATURE ENGINEERING](#feature-engineering)<br>
[2. FEATURE SELECTION](#feature-selection)<br>
[3. FEATURE ENCODING](#feature-encoding)<br>

In [None]:
import pandas as pd
import numpy  as np
import category_encoders as ce

import statistics
from sklearn import preprocessing
from scipy import stats

import seaborn as sns
import matplotlib.pyplot as plt

from pandas_profiling import ProfileReport
import dtale

%matplotlib inline

In [None]:
# Load the raw wine CSV and print its column / dtype / non-null summary.
data = pd.read_csv("./Data/wine.csv")
data.info()

In [None]:
# d = dtale.show(data)
# d
# profile = ProfileReport(data)
# profile

In [None]:
# Count of distinct taster names (missing names are not counted) and the
# highest bottle price in the dataset.
print(f"Number of tasters: {data['taster_name'].nunique()}")
print(f"Max bottle price:  {data['price'].max()}")

# Data Cleaning

## Duplicates

In [None]:
# Rows are compared on every column: a row counts as a duplicate only when all
# of its fields match an earlier row.
columns = data.columns.tolist()
mask = data.duplicated(subset=columns, keep='first')

duplicates = data.loc[mask]
print(f"Number of duplicates: {len(duplicates)}")

# Keep the first occurrence of each fully identical row.
data_dup = data.loc[~mask]
print(f"Number of rows with duplicates removed: {len(data_dup)}")

## Nans

In [None]:
# Visualise where values are missing: the heatmap is drawn over the boolean
# isnull() mask of the de-duplicated frame.
sns.heatmap(data_dup.isnull())

In [None]:
# Work on a copy so the de-duplicated frame stays untouched.
data_drop = data_dup.copy()

# Keep only the columns that are filled in at least 70% of the rows.
# `thresh` is the minimum number of non-null values a column needs to survive.
# It is passed without `how`: pandas >= 2.0 raises a TypeError when both
# `how` and `thresh` are supplied.
thresh    = data_drop.shape[0] * 0.7
data_drop = data_drop.dropna(thresh=thresh, axis=1)

# Per-column replacements for the remaining gaps. A dict-based fillna only
# touches the listed columns and silently skips any column that the threshold
# filter above has already removed.
fill_values = {
    # Descriptive text columns: mark gaps explicitly.
    'designation': 'unknown',
    'region_1': 'unknown',
    'taster_name': 'unknown',
    'taster_twitter_handle': 'unknown',
    # NOTE(review): the three constants below are presumably the most frequent
    # values of their columns — confirm against the data.
    'country': 'US',
    'province': 'California',
    'variety': 'Pinot Noir',
    # Numeric column: impute with the column mean.
    'price': data_drop['price'].mean(),
}
data_drop = data_drop.fillna(value=fill_values)

# Statistics & EDA

In [None]:
# Reload the pre-cleaned dataset; the 'Unnamed: 0' column is discarded
# straight after reading.
data = pd.read_csv('./Data/wine_cleared.csv').drop(columns='Unnamed: 0')

In [None]:
# Basic descriptive statistics. Minimums come from pandas; mean, median and
# mode of the price are computed with the standard-library `statistics` module
# and rounded to two decimals for display.
print(f"Min points:   {data['points'].min()}")
print(f"Min price:    {data['price'].min()}")
print(f"Mean price:   {round(statistics.mean(data['price']), 2)}")
print(f"Median price: {round(statistics.median(data['price']), 2)}")
print(f"Mode price:   {round(statistics.mode(data['price']), 2)}")

In [None]:
# Median
# lst = sorted(data['price'])
# mid  = len(lst) // 2
# median = (lst[mid] + lst[~mid]) / 2

# Mode
# freq = {}
# for price in data['price']:
#     if price in freq:
#         freq[price] += 1
#     else:
#         freq.setdefault(price, 1)
# print(max(freq, key=freq.get))

In [None]:
# 2x2 grid: a probability (Q-Q) plot next to a histogram, one row per feature,
# to judge visually how close each distribution is to normal.
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(12, 8))

stats.probplot(data['price'], plot=ax1)
ax1.set_title('price: probability plot')
ax2.hist(data['price'])
ax2.set_title('price: histogram')

stats.probplot(data['points'], plot=ax3)
ax3.set_title('points: probability plot')
ax4.hist(data['points'])
ax4.set_title('points: histogram')

fig.tight_layout()
# plt.show() rather than fig.show(): Figure.show() only works with GUI
# backends and emits a warning under the inline notebook backend.
plt.show()

In [None]:
# Pearson correlation suits continuous features whose distribution is close to
# normal. numeric_only=True limits the matrix to numeric columns; without it
# pandas >= 2.0 raises on the text columns of this frame.
data.corr(numeric_only=True)  # Pearson is the default method

In [None]:
# Spearman's rank correlation measures monotonic relationships; it is the usual
# choice for ordinal features or features that are not normally distributed.
# numeric_only=True keeps the text columns out of the calculation.
data.corr(method='spearman', numeric_only=True)

In [None]:
# Kendall's tau is another rank correlation, used for ordinal features or
# features that are not normally distributed.
# numeric_only=True keeps the text columns out of the calculation.
data.corr(method='kendall', numeric_only=True)

In [None]:
from sklearn.metrics import matthews_corrcoef

# Matthews correlation coefficient is used to calculate the relationships between binary categorical variables.
# Toy example: two binary label vectors encoded as +1 / -1.
x = [+1, -1, +1, +1] 
y = [+1, +1, +1, -1]
matthews_corrcoef(x, y) 

# Feature Engineering

In [None]:
# Price rounded to a whole number.
data['price_round'] = data['price'].round().astype(int)

# The first 4-digit run in the title is taken as the year; values that cannot
# be parsed as a date become NaT. The patterns are raw strings so that the
# regex escapes (\d, \() are not read as invalid string escape sequences.
data['year'] = data['title'].str.findall(r'\d{4}').str.get(0)
data['year'] = pd.to_datetime(data['year'], errors='coerce')

# Text inside the first pair of parentheses in the title.
data['locality'] = data['title'].str.findall(r'\((.*?)\)').str.get(0)

# Country indicator flags (1 = match, 0 = anything else, including missing).
data['is_usa']    = (data['country'] == 'US').astype(int)
data['is_italy']  = (data['country'] == 'Italy').astype(int)
data['is_france'] = (data['country'] == 'France').astype(int)

# 1 for a parsed year before 2010; rows without a parsed year compare False
# and therefore get 0.
data['old_wine'] = (data['year'].dt.year < 2010).astype(int)

In [None]:
# Read the ';'-separated population table and look up each row's 'country' in
# it, appending that table's remaining columns to `data`.
population = pd.read_csv('Data/country_population.csv', sep=';')
data = data.join(population.set_index('country'), on='country')

In [None]:
# Read the ';'-separated area table and look up each row's 'country' in it,
# appending that table's remaining columns to `data`.
area = pd.read_csv('Data/country_area.csv', sep=';')
data = data.join(area.set_index('country'), on='country')

In [None]:
# Difference between the fixed reference date 2022-01-12 and the parsed 'year'.
# NOTE(review): despite the name, this stores a date difference rather than a
# number of years — confirm whether a day count or year count was intended.
data['years_diff'] = (pd.to_datetime('2022-01-12')  - data['year'])

# Feature Encoding

## Ordinal
If the attribute to be encoded is ordinal, use Ordinal Encoding.<br><br>
![](https://i.ibb.co/1b1b8WP/dst-eda-3-9.png)

In [None]:
# Toy wardrobe data: every record is a [size, type] pair.
clothing_list = [
    ['xxs', 'dress'], ['xxs', 'skirt'],
    ['xs', 'dress'], ['s', 'skirt'],
    ['m', 'dress'], ['l', 'shirt'],
    ['s', 'coat'], ['m', 'coat'],
    ['xxl', 'shirt'], ['l', 'dress'],
]
clothing = pd.DataFrame(data=clothing_list, columns=['size', 'type'])

# Encode 'size' as integer codes and append the encoded column to the frame.
ord_encoder = ce.OrdinalEncoder(cols=['size'])
data_bin = ord_encoder.fit_transform(clothing['size'])
clothing = pd.concat(objs=[clothing, data_bin], axis='columns')

clothing

## One Hot
For nominal attributes, the number of unique values matters: with many unique values, One Hot Encoding creates many columns and memory problems may occur. If the attribute has fewer than 15 unique values, choose One Hot Encoding. The number 15 is chosen empirically — for your dataset the threshold may be 20 or 10. It depends on the number of features in your dataset, the number of rows, and many other factors. If the dataset has few features, you can also use One Hot Encoding.<br><br>
![](https://i.ibb.co/HGZw1Yw/dst-eda-3-11.png)

In [None]:
# One-hot encode the 'type' column of the toy frame and append the resulting
# columns to it.
encoder = ce.OneHotEncoder(cols=['type'], use_cat_names=True)
type_bin = encoder.fit_transform(clothing['type'])
clothing = pd.concat([clothing, type_bin], axis=1)

clothing

In [None]:
# One-hot encode 'taster_name' in the wine data and append the resulting
# columns to the frame.
encoder = ce.OneHotEncoder(cols=['taster_name'], use_cat_names=True)
taster_bin = encoder.fit_transform(data['taster_name'])
data = pd.concat([data, taster_bin], axis=1)

## Binary
![](https://i.ibb.co/FwX5gYZ/dst-eda-3-14-copy.png)

In [None]:
# Binary-encode the 'type' column of the toy frame and append the resulting
# columns to it (type_bin from the one-hot cell is overwritten here).
bin_encoder = ce.BinaryEncoder(cols=['type'])
type_bin = bin_encoder.fit_transform(clothing['type'])
clothing = pd.concat([clothing, type_bin], axis=1)

clothing

In [None]:
# Binary-encode 'country' and 'taster_twitter_handle' and append the resulting
# columns to the frame. The result gets a descriptive name instead of `bin`,
# which would shadow Python's built-in bin() for the rest of the session.
encoder = ce.BinaryEncoder(cols=['country', 'taster_twitter_handle'])
country_handle_bin = encoder.fit_transform(data[['country', 'taster_twitter_handle']])
data = pd.concat([data, country_handle_bin], axis=1)

# Normalization & Standardization
Instructions for converting features:

- if the attribute is normally distributed, it should be standardized;
- if the attribute is not normally distributed, it should be normalized;
- if the spread of values is small, you can do without data conversion.

In [None]:
# Scale 'price' two ways: min-max scaling and robust scaling.
mm_scaler = preprocessing.MinMaxScaler()
rb_scaler = preprocessing.RobustScaler()

mm_data = mm_scaler.fit_transform(data[['price']])
rb_data = rb_scaler.fit_transform(data[['price']])

# The scalers return a positional array with one column. Assign that column
# directly: wrapping it in pd.DataFrame would attach a fresh 0..n-1 index, and
# column assignment aligns on index labels, so values would be misplaced or
# become NaN whenever `data` does not have a default index.
data['price_mm_scaled'] = mm_data[:, 0]
data['price_rb_scaled'] = rb_data[:, 0]

In [None]:
# Standardize 'price'.
s_scaler = preprocessing.StandardScaler()

s_data = s_scaler.fit_transform(data[['price']])

# Assign the single output column positionally; wrapping it in pd.DataFrame
# would align on a fresh 0..n-1 index and could misplace values or produce NaN
# if `data` does not have a default index.
data['price_s_scaled'] = s_data[:, 0]

In [None]:
# Spot-check the standardized price of the row labelled 129968.
data.loc[129968, 'price_s_scaled']

# Feature Selection

## Correlations

In [None]:
# Annotated correlation heatmap over the numeric features of the wine data.
fig = plt.figure(figsize=(40, 24))
axes = fig.add_axes([0, 0, 1, 1])
# numeric_only=True: the frame still holds text and date columns, which
# pandas >= 2.0 refuses to correlate. ax=axes draws on the axes created above
# instead of relying on the implicit "current axes".
sns.heatmap(
    data.corr(numeric_only=True),
    annot=True, fmt='.2g', vmin=-1, vmax=1, center=0, square=True,
    ax=axes,
)
# plt.show() rather than fig.show(): Figure.show() only works with GUI
# backends and emits a warning under the inline notebook backend.
plt.show()

In [None]:
# Remove the listed feature columns.
# NOTE(review): presumably the ones the correlation heatmap showed to be
# redundant — confirm the selection.
data = data.drop(columns=['is_usa', 'is_france', 'is_italy', 'price_round', 'area'])

### Visualization of correlations

In [None]:
# Separate dataset used only for the correlation-visualisation examples below.
datav = pd.read_csv('Data/model.csv')

In [None]:
# Annotated correlation heatmap of the visualisation dataset.
fig = plt.figure(figsize=(10, 6))
axes = fig.add_axes([0, 0, 1, 1])
# numeric_only=True keeps any non-numeric column from breaking corr() on
# pandas >= 2.0; ax=axes draws on the axes created above.
sns.heatmap(
    datav.corr(numeric_only=True),
    annot=True, fmt='.2g', vmin=-1, vmax=1, center=0, square=True,
    ax=axes,
)
# plt.show() rather than fig.show(): Figure.show() only works with GUI
# backends and emits a warning under the inline notebook backend.
plt.show()

In [None]:
# Scatter plot of "Waist" against "Waist/Hip" from the visualisation dataset.
fig = plt.figure(figsize=(10, 6))
axes = fig.add_axes([0, 0, 1, 1])
# ax=axes draws on the axes created above instead of the implicit current axes.
sns.scatterplot(data=datav, x="Waist/Hip", y="Waist", ax=axes)
# plt.show() rather than fig.show(): Figure.show() only works with GUI
# backends and emits a warning under the inline notebook backend.
plt.show()

In [None]:
# Pairwise plots for the columns of the visualisation dataset.
sns.pairplot(datav)

# Practice

In [None]:
# Dataset for the practice section.
heart = pd.read_csv('Data/heart.csv')

In [None]:
def get_trestbps(row):
    """Return the reference mean resting blood pressure for a row's age and sex.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide ``row['age']`` and ``row['sex']``. ``sex == 1`` selects
        the first value of each age bracket and any other value selects the
        second (presumably 1 = male, otherwise female — confirm against the
        dataset description).

    Returns
    -------
    int or float
        The bracket value for the row, or NaN when the age is missing.
    """
    age = row['age']
    sex = row['sex']

    # A missing age fails every `<=` comparison and would otherwise fall
    # through to the oldest bracket; report it as missing instead.
    # (np.nan, not np.NaN: the latter alias was removed in NumPy 2.0.)
    if pd.isna(age):
        return np.nan

    # (inclusive upper age bound, value for sex == 1, value otherwise)
    brackets = (
        (20, 123, 116),
        (30, 126, 120),
        (40, 129, 127),
        (50, 135, 137),
        (60, 142, 144),
    )
    for upper_age, value_sex_1, value_other in brackets:
        if age <= upper_age:
            return value_sex_1 if sex == 1 else value_other

    # Over 60. The original constant here was 115916, which is not a plausible
    # blood-pressure reading; 159 follows the progression of the table.
    # NOTE(review): confirm 159 against the source reference table.
    return 142 if sex == 1 else 159

# 1 when the age is above 60, otherwise 0.
heart['old'] = (heart['age'] > 60).astype(int)
# Reference pressure looked up row by row from age and sex.
heart['trestbps_mean'] = heart.apply(get_trestbps, axis=1)

In [None]:
# One-hot encode the listed columns and append the resulting columns to the
# frame. The result gets a descriptive name instead of `bin`, which would
# shadow Python's built-in bin() for the rest of the session.
encoder = ce.OneHotEncoder(cols=['cp', 'restecg', 'slope', 'ca', 'thal'])
heart_onehot = encoder.fit_transform(heart[['cp', 'restecg', 'slope', 'ca', 'thal']])
heart = pd.concat([heart, heart_onehot], axis=1)

In [None]:
# Columns to be replaced by their robust-scaled versions.
scaled_columns = ['age', 'trestbps', 'chol', 'oldpeak', 'thalach']

rb_scaler = preprocessing.RobustScaler()
rb_data = rb_scaler.fit_transform(heart[scaled_columns])

# Rebuild the scaled block with the frame's own index: concat(axis=1) aligns
# on index labels, so a fresh 0..n-1 index could misplace rows or introduce
# NaN if `heart` does not have a default index.
scaled = pd.DataFrame(rb_data, columns=scaled_columns, index=heart.index)

# Swap the raw columns for the scaled ones; the scaled columns end up last.
heart = pd.concat([heart.drop(columns=scaled_columns), scaled], axis=1)

In [None]:
# Annotated correlation heatmap over the prepared heart features.
fig = plt.figure(figsize=(40, 24))
axes = fig.add_axes([0, 0, 1, 1])
# numeric_only=True keeps any non-numeric column from breaking corr() on
# pandas >= 2.0; ax=axes draws on the axes created above.
sns.heatmap(
    heart.corr(numeric_only=True),
    annot=True, fmt='.2g', vmin=-1, vmax=1, center=0, square=True,
    ax=axes,
)
# plt.show() rather than fig.show(): Figure.show() only works with GUI
# backends and emits a warning under the inline notebook backend.
plt.show()