# Лабораторная работа №4
Выполнила: Богданова Ю.Н., гр.6233-010402D

# Преобразование данных в упорядоченную форму (tidy data)

Задание: выяснить, насколько опрятны данные из лабораторной работы №1. Если они недостаточно опрятны, попробовать привести их к опрятному виду. Если данные уже опрятные — привести их к «демонстрационному» виду (когда их удобно воспринимать в табличном виде).

Что такое упорядоченные (tidy) данные?
Хэдли Уикхем (Hadley Wickham) в работе «Tidy Data» (http://vita.had.co.nz/papers/tidy-data.pdf) предлагает три основных принципа, определяющих, является ли набор данных упорядоченным:
1. Каждая переменная формирует столбец.
2. Каждое наблюдение формирует строку.
3. Каждый тип единицы наблюдения формирует таблицу.

In [2]:
import numpy as np
import pandas as pd

## Описание данных

Источник: https://www.kaggle.com/datasets/abisheksudarshan/customer-segmentation  

Customer segmentation is the practice of dividing a customer base into groups of individuals that are similar in specific ways relevant to marketing, such as age, gender, interests and spending habits.  

|#|Attribute name|Attribute Information| Usage in model|
| :-: | :- | -: |:-|
|1|ID| Unique id (category)| not used |
|2|Gender|Gender (Binary) | used as 0/1|
|3|Ever_Married| Marital status of the customer (Binary)| used as 0/1 |
|4|Age| Customer Age (numerical)| used as it is |
|5|Graduated| Is the customer a graduate (Binary)|used as 0/1|
|6|Profession| Profession (Categorical)|used as dummies|
|7|Work_Experience| Work Experience in years (Numerical)|used as it is|
|8|Spending_Score| Spending score (Category)| used as it is |
|9|Family_Size| Amount of members in family (Numerical)|used as it is|  
|10|Var_1| Anonymised Category for the customer (category), can be target|used as dummies|  
|11| __Target__: Segmentation| 4 segments (A, B, C, D)|used as it is|


In [3]:
# Load the customer segmentation dataset from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer='CustomerSegmentation.csv')

In [4]:
# Preview the first five rows of the loaded table.
df.head(n=5)

Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1,Segmentation
0,462809,Male,No,22,No,Healthcare,1.0,Low,4.0,Cat_4,D
1,462643,Female,Yes,38,Yes,Engineer,,Average,3.0,Cat_4,A
2,466315,Female,Yes,67,Yes,Engineer,1.0,Low,1.0,Cat_6,B
3,461735,Male,Yes,67,Yes,Lawyer,0.0,High,2.0,Cat_6,B
4,462669,Female,Yes,40,Yes,Entertainment,,High,6.0,Cat_6,A


In [5]:
# Print the column names, non-null counts, dtypes and memory footprint of the table.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8068 entries, 0 to 8067
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ID               8068 non-null   int64  
 1   Gender           8068 non-null   object 
 2   Ever_Married     7928 non-null   object 
 3   Age              8068 non-null   int64  
 4   Graduated        7990 non-null   object 
 5   Profession       7944 non-null   object 
 6   Work_Experience  7239 non-null   float64
 7   Spending_Score   8068 non-null   object 
 8   Family_Size      7733 non-null   float64
 9   Var_1            7992 non-null   object 
 10  Segmentation     8068 non-null   object 
dtypes: float64(2), int64(2), object(7)
memory usage: 693.5+ KB


In [6]:
# The data contains missing values: count them per column.
df.isna().sum()

ID                   0
Gender               0
Ever_Married       140
Age                  0
Graduated           78
Profession         124
Work_Experience    829
Spending_Score       0
Family_Size        335
Var_1               76
Segmentation         0
dtype: int64

In [7]:
# Drop every row that has at least one missing value.
# The original row labels are kept, so the index is no longer contiguous.
df = df.dropna()

In [10]:
# Total memory used by the cleaned table in bytes, index included;
# deep=True also measures the contents of the object (string) columns.
df.memory_usage(index=True, deep=True).sum()

3114925

In [12]:
# Give the anonymised 'Var_1' column a descriptive name.
df = df.rename(columns={'Var_1': 'Customer_category'})

In [13]:
# Show the column labels after the rename.
df.columns

Index(['ID', 'Gender', 'Ever_Married', 'Age', 'Graduated', 'Profession',
       'Work_Experience', 'Spending_Score', 'Family_Size', 'Customer_category',
       'Segmentation'],
      dtype='object')

In [21]:
# Split the cleaned table into three narrower tables, each keeping the 'ID' column:
# personal attributes, education/work attributes, and category/segment labels.
personal_df = df.loc[:, ['ID', 'Gender', 'Ever_Married', 'Age', 'Spending_Score', 'Family_Size']]
experience_df = df.loc[:, ['ID', 'Graduated', 'Profession', 'Work_Experience']]
target_df = df.loc[:, ['ID', 'Customer_category', 'Segmentation']]

In [44]:
# Combined deep memory usage (in bytes) of the three split tables.
sum(
    table.memory_usage(deep=True).sum()
    for table in (personal_df, experience_df, target_df)
)

3328205

In [24]:
# Preview the first five rows of the personal-attributes table.
personal_df.head(n=5)

Unnamed: 0,ID,Gender,Ever_Married,Age,Spending_Score,Family_Size
0,462809,Male,No,22,Low,4.0
2,466315,Female,Yes,67,Low,1.0
3,461735,Male,Yes,67,High,2.0
5,461319,Male,Yes,56,Average,2.0
6,460156,Male,No,32,Low,3.0


In [25]:
# Preview the first five rows of the education/work table.
experience_df.head(n=5)

Unnamed: 0,ID,Graduated,Profession,Work_Experience
0,462809,No,Healthcare,1.0
2,466315,Yes,Engineer,1.0
3,461735,Yes,Lawyer,0.0
5,461319,No,Artist,0.0
6,460156,Yes,Healthcare,1.0


In [40]:
# Preview the first five rows of the category/segment table.
target_df.head(n=5)

Unnamed: 0,ID,Customer_category,Segmentation
0,462809,Cat_4,D
2,466315,Cat_6,B
3,461735,Cat_6,B
5,461319,Cat_6,C
6,460156,Cat_6,C


In [None]:
# Посмотрим на статистику

In [39]:
# Mean age and mean family size for every
# (Gender, Ever_Married, Spending_Score) group.
#
# The value columns are selected with a list (double brackets). Selecting
# several columns of a group-by with a bare tuple, as in
# groupby(...)['Age', 'Family_Size'], is deprecated since pandas 1.0 and
# raises ValueError in pandas 2.0.
#
# NOTE: astype(int) drops the fractional part of each mean; it does not round.
agg = (
    personal_df
    .groupby(['Gender', 'Ever_Married', 'Spending_Score'])[['Age', 'Family_Size']]
    .agg(['mean'])
    .astype(int)
)
agg

  agg = personal_df.groupby(['Gender', 'Ever_Married', 'Spending_Score'])['Age', 'Family_Size'].agg(['mean']).astype(int)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Age,Family_Size
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,mean,mean
Gender,Ever_Married,Spending_Score,Unnamed: 3_level_2,Unnamed: 4_level_2
Female,No,Low,33,2
Female,Yes,Average,47,3
Female,Yes,High,61,2
Female,Yes,Low,49,1
Male,No,Low,30,3
Male,Yes,Average,46,3
Male,Yes,High,56,3
Male,Yes,Low,52,2


In [26]:
# Mean, maximum and minimum work experience for every (Graduated, Profession) pair.
# The results are cast to int, which drops the fractional part of the mean.
agg1 = (
    experience_df
    .groupby(['Graduated', 'Profession'])['Work_Experience']
    .agg(['mean', 'max', 'min'])
    .astype(int)
)
agg1

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,max,min
Graduated,Profession,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
No,Artist,2,14,0
No,Doctor,2,14,0
No,Engineer,2,14,0
No,Entertainment,2,14,0
No,Executive,2,13,0
No,Healthcare,2,14,0
No,Homemaker,6,14,0
No,Lawyer,0,13,0
No,Marketing,1,11,0
Yes,Artist,2,14,0


In [34]:
# Mean work experience (fractional part dropped) per (Graduated, Profession) pair.
agg2 = (
    experience_df
    .groupby(['Graduated', 'Profession'])['Work_Experience']
    .agg(['mean'])
    .astype(int)
)
# Pivot the 'Graduated' index level into columns: one row per profession,
# one column per graduation status.
agg2.unstack(level='Graduated')

Unnamed: 0_level_0,mean,mean
Graduated,No,Yes
Profession,Unnamed: 1_level_2,Unnamed: 2_level_2
Artist,2,2
Doctor,2,2
Engineer,2,2
Entertainment,2,2
Executive,2,2
Healthcare,2,3
Homemaker,6,6
Lawyer,0,1
Marketing,1,3
