# Введение в разведочный анализ данных (Exploratory Data Analysis, EDA)

## Импорт библиотек и загрузка данных

In [36]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [37]:
# Location of the raw CSV with the diamonds data used throughout the notebook.
DIAMONDS_DATASET = (
    "https://raw.githubusercontent.com/aiedu-courses/"
    "stepik_eda_and_dev_tools/main/datasets/diamonds_good.csv"
)

In [38]:
# Download the CSV at DIAMONDS_DATASET and parse it into a DataFrame.
df = pd.read_csv(DIAMONDS_DATASET)

## Описание данных и проверка корректности импорта

Тут описываем текстом все категории данных, входящих в датасет:
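*   `carat` — вес бриллианта в каратах;
*   `cut` — качество огранки (категориальный признак: Fair, Good, Very Good, Premium, Ideal);
*   `color` — цвет камня (категориальный признак, от D — лучший, до J — худший);
*   `clarity` — чистота камня (категориальный признак, например I1, SI2, SI1, VS2, VS1, VVS2);
*   `depth` — общая глубина камня в процентах (высота, отнесённая к среднему от длины и ширины);
*   `table` — ширина верхней площадки относительно самой широкой части камня, в процентах;
*   `price` — цена бриллианта в долларах США;
*   `'x'`, `'y'`, `'z'` — длина, ширина и высота камня в миллиметрах.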

Размер датасета

In [42]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(53940, 10)

In [43]:
# First rows: quick check that the header and values were parsed as expected.
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,'x','y','z'
0,0.23,Ideal,E,SI2,61.5,55.0,326.0,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326.0,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327.0,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334.0,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335.0,4.34,4.35,2.75


In [44]:
# Last rows: quick check that the end of the file was read completely.
df.tail()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,'x','y','z'
53935,0.72,Ideal,D,SI1,60.8,57.0,2757.0,5.75,5.76,3.5
53936,0.72,Good,D,SI1,63.1,55.0,2757.0,5.69,5.75,3.61
53937,0.7,Very Good,D,SI1,62.8,60.0,2757.0,5.66,5.68,3.56
53938,0.86,Premium,H,SI2,61.0,58.0,2757.0,6.15,6.12,3.74
53939,0.75,Ideal,D,SI2,62.2,55.0,2757.0,5.83,5.87,3.64


In [45]:
# Three randomly chosen rows; no random_state is passed, so the rows differ
# between runs.
df.sample(3)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,'x','y','z'
31920,0.3,Very Good,G,VS1,63.1,56.0,776.0,4.26,4.24,2.68
28173,0.36,Premium,H,SI1,59.3,60.0,665.0,4.65,4.62,2.75
21112,1.5,Very Good,E,SI2,60.1,60.0,9281.0,7.35,7.42,4.44


## Исследование данных

Информация о типах данных и пропусках

In [46]:
# Per-column dtype and non-null count; columns with fewer non-null values
# than rows contain missing data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    53442 non-null  float64
 1   cut      53940 non-null  object 
 2   color    53940 non-null  object 
 3   clarity  53940 non-null  object 
 4   depth    52950 non-null  float64
 5   table    53940 non-null  float64
 6   price    53940 non-null  float64
 7   'x'      53940 non-null  float64
 8   'y'      53935 non-null  float64
 9   'z'      53940 non-null  float64
dtypes: float64(7), object(3)
memory usage: 4.1+ MB


Статистика числовых данных

In [47]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns.
df.describe()

Unnamed: 0,carat,depth,table,price,'x','y','z'
count,53442.0,52950.0,53940.0,53940.0,53940.0,53935.0,53940.0
mean,0.797561,61.749422,57.457184,3932.799722,5.731157,5.734469,3.538734
std,0.473534,1.431558,2.234491,3989.439738,1.121761,1.142125,0.705699
min,0.2,43.0,43.0,326.0,0.0,0.0,0.0
25%,0.4,61.0,56.0,950.0,4.71,4.72,2.91
50%,0.7,61.8,57.0,2401.0,5.7,5.71,3.53
75%,1.04,62.5,59.0,5324.25,6.54,6.54,4.04
max,5.01,79.0,95.0,18823.0,10.74,58.9,31.8


Статистика категориальных данных

In [27]:
# Summary for the object-dtype (categorical) columns: count, number of unique
# values, most frequent value and its frequency.
df.describe(include='object')

Unnamed: 0,cut,color,clarity
count,53805,53805,53805
unique,6,7,8
top,Ideal,G,SI1
freq,21465,11264,13035


Статистика данных (сводная)

In [29]:
# Combined summary for every column; statistics that do not apply to a
# column's dtype are shown as NaN.
df.describe(include='all')

Unnamed: 0,index,carat,cut,color,clarity,depth,table,price,'x','y','z'
count,53805.0,53307.0,53805,53805,53805,52815.0,53805.0,53805.0,53805.0,53800.0,53805.0
unique,,,6,7,8,,,,,,
top,,,Ideal,G,SI1,,,,,,
freq,,,21465,11264,13035,,,,,,
mean,26969.962661,0.797489,,,,61.748329,57.458089,3933.562829,5.731325,5.7347,3.5388
std,15576.241973,0.473102,,,,1.428931,2.233612,3988.926733,1.120912,1.1414,0.705205
min,0.0,0.2,,,,43.0,43.0,326.0,0.0,0.0,0.0
25%,13480.0,0.4,,,,61.0,56.0,951.0,4.71,4.72,2.91
50%,26960.0,0.7,,,,61.8,57.0,2401.0,5.7,5.71,3.53
75%,40470.0,1.04,,,,62.5,59.0,5327.0,6.54,6.54,4.03


### Работа с дубликатами

Подсчет количества дубликатов

In [49]:
# Number of rows that exactly repeat an earlier row (the first occurrence is
# not counted as a duplicate).
df.duplicated().sum()

np.int64(135)

In [52]:
# Keep the count in a variable so it can be printed with a readable label.
duplicate_mask = df.duplicated()
num_duplicates = duplicate_mask.sum()
print(f"Количество дубликатов: {num_duplicates}")

Количество дубликатов: 135


Просмотр дублирующихся строк

In [None]:
# General pattern: df[filter condition]

In [51]:
# Boolean-mask selection: show only the rows flagged as repeats of an
# earlier row.
is_repeat = df.duplicated()
df.loc[is_repeat]

Unnamed: 0,carat,cut,color,clarity,depth,table,price,'x','y','z'
1005,0.79,Ideal,G,SI1,62.3,57.0,2898.0,5.90,5.85,3.66
1006,0.79,Ideal,G,SI1,62.3,57.0,2898.0,5.90,5.85,3.66
1007,0.79,Ideal,G,SI1,62.3,57.0,2898.0,5.90,5.85,3.66
1008,0.79,Ideal,G,SI1,62.3,57.0,2898.0,5.90,5.85,3.66
2025,1.52,Good,E,I1,57.3,58.0,3105.0,7.53,7.42,4.28
...,...,...,...,...,...,...,...,...,...,...
47969,0.52,Ideal,D,VS2,61.8,55.0,1919.0,5.19,5.16,3.20
49326,0.51,Ideal,F,VVS2,61.2,56.0,2093.0,5.17,5.19,3.17
49557,0.71,Good,F,SI2,64.1,60.0,2130.0,0.00,0.00,0.00
50079,0.51,Ideal,F,VVS2,61.2,56.0,2203.0,5.19,5.17,3.17


In [55]:
# Same selection stored in a variable; print() renders it as plain text
# instead of the notebook's rich table.
duplicates = df.loc[df.duplicated()]
print(duplicates)

       carat    cut color clarity  depth  table   price   'x'   'y'   'z'
1005    0.79  Ideal     G     SI1   62.3   57.0  2898.0  5.90  5.85  3.66
1006    0.79  Ideal     G     SI1   62.3   57.0  2898.0  5.90  5.85  3.66
1007    0.79  Ideal     G     SI1   62.3   57.0  2898.0  5.90  5.85  3.66
1008    0.79  Ideal     G     SI1   62.3   57.0  2898.0  5.90  5.85  3.66
2025    1.52   Good     E      I1   57.3   58.0  3105.0  7.53  7.42  4.28
...      ...    ...   ...     ...    ...    ...     ...   ...   ...   ...
47969   0.52  Ideal     D     VS2   61.8   55.0  1919.0  5.19  5.16  3.20
49326   0.51  Ideal     F    VVS2   61.2   56.0  2093.0  5.17  5.19  3.17
49557   0.71   Good     F     SI2   64.1   60.0  2130.0  0.00  0.00  0.00
50079   0.51  Ideal     F    VVS2   61.2   56.0  2203.0  5.19  5.17  3.17
52861   0.50   Fair     E     VS2   79.0   73.0  2579.0  5.21  5.18  4.09

[135 rows x 10 columns]


In [56]:
# Remove rows that exactly repeat an earlier row, keeping the first
# occurrence. Rebinding `df` is equivalent here to calling
# df.drop_duplicates(inplace=True).
df = df.drop_duplicates()

df.shape

(53805, 10)

Метод `df.reset_index()` заново нумерует строки (0, 1, 2, …) после группировок, фильтрации или удаления строк. По умолчанию старый индекс сохраняется в отдельной колонке `index`; чтобы не создавать эту колонку, нужно передать `drop=True`.

Полезное:

[pandas.DataFrame.reset_index — pandas 2.3.3 documentation](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.reset_index.html)

[python - What is the use of reset_index() in pandas? - Stack Overflow](https://stackoverflow.com/questions/58909624/what-is-the-use-of-reset-index-in-pandas)

In [60]:
# Renumber rows 0..n-1 after the removal above; drop=True discards the old
# index instead of storing it in a new `index` column.
df = df.reset_index(drop=True)

df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,'x','y','z'
0,0.23,Ideal,E,SI2,61.5,55.0,326.0,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326.0,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327.0,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334.0,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335.0,4.34,4.35,2.75


### Работа с пропусками

In [21]:
# Missing-value count per column, transposed into a single wide row for
# compact display.
df.isnull().sum().to_frame().T

Unnamed: 0,carat,cut,color,clarity,depth,table,price,'x','y','z'
0,498,0,0,0,990,0,0,0,5,0


In [22]:
# Share of missing values per column, expressed as a percentage of all rows
# and shown as a single wide row.
missing_fraction = df.isnull().sum() / len(df)
(missing_fraction * 100).to_frame().T

Unnamed: 0,carat,cut,color,clarity,depth,table,price,'x','y','z'
0,0.925565,0.0,0.0,0.0,1.839978,0.0,0.0,0.0,0.009293,0.0


In [23]:
# New frame without any row that has at least one missing value; `df` itself
# is left unchanged.
df_clean = df.dropna(axis=0, how="any")

In [24]:
# Verify the cleaned frame: every per-column missing-value count should be 0.
df_clean.isnull().sum().to_frame().T

Unnamed: 0,index,carat,cut,color,clarity,depth,table,price,'x','y','z'
0,0,0,0,0,0,0,0,0,0,0,0
