# **Data Cleaning and Validation**

1. Remove duplicates and irrelevant columns.

2. Handle missing values appropriately.

3. Convert data types for columns such as dates and numerical values.

**Import Libraries**

In [None]:
import pandas as pd
import numpy as np
import re

In [None]:
pd.options.display.max_columns = None  # show every column instead of eliding the middle ones with "…"

In [None]:
# Helper functions
def clean_text_soft(s: pd.Series) -> pd.Series:
    """Gently normalise a text column.

    Every value is converted to ``str`` (so e.g. a boolean ``False`` becomes
    ``'False'``), zero-width characters and the BOM are removed, any run of
    whitespace (spaces, tabs, carriage returns, newlines) is collapsed to a
    single space and the ends are trimmed.

    Missing values stay missing: entries that were NaN/None on input, and the
    literal text ``'nan'``, are returned as NaN instead of the strings
    ``'nan'`` / ``'None'``.

    Parameters
    ----------
    s : pd.Series
        Column to clean; may hold non-string objects.

    Returns
    -------
    pd.Series
        Cleaned strings with NaN for missing entries, same index as ``s``.
    """
    # Remember what was missing before astype(str) turns it into 'nan' / 'None'.
    was_missing = s.isna()
    out = s.astype(str)
    # Invisible characters carry no meaning, so they are dropped outright.
    out = out.str.replace(r'[\u200b\u200c\u200d\uFEFF]', '', regex=True)
    # Tabs / CR / LF are whitespace: collapse them with spaces so words stay separated.
    out = out.str.replace(r'\s+', ' ', regex=True).str.strip()
    # Restore missing values; the literal text "nan" is treated as missing as well.
    return out.mask(was_missing | out.eq('nan'))

def to_numeric_smart(s: pd.Series) -> pd.Series:
    """Convert a column holding numbers in mixed textual formats to numeric.

    Text values are stripped of currency signs, letters and spaces; a dot
    followed by exactly three digits is treated as a thousands separator and a
    comma followed by exactly two trailing digits as the decimal separator
    (``'€ 1.234,56'`` -> ``1234.56``). Anything that still cannot be parsed
    becomes NaN.

    Values that are already numbers are passed through untouched. Running them
    through the text heuristics would corrupt them: the float ``0.125`` would
    be read as ``'0.125'`` -> ``125`` and ``1e-05`` would become NaN.

    Parameters
    ----------
    s : pd.Series
        Column to convert; numeric, text, or an object mix of both.

    Returns
    -------
    pd.Series
        Numeric Series (dtype chosen by ``pd.to_numeric``), same index as ``s``.
    """
    # Already-numeric columns need no parsing (bool is excluded: it is not an amount).
    if pd.api.types.is_numeric_dtype(s) and not pd.api.types.is_bool_dtype(s):
        return pd.to_numeric(s, errors='coerce')

    def _is_number(value) -> bool:
        return (isinstance(value, (int, float, np.integer, np.floating))
                and not isinstance(value, (bool, np.bool_)))

    # Per-element flag for object columns that mix real numbers with text.
    numeric_mask = s.map(_is_number).astype(bool)

    text = s.astype(str)
    cleaned = text.str.replace(r'[^\d,\.\-\u2212]', '', regex=True)   # drop currency signs, letters, spaces
    cleaned = cleaned.str.replace('\u2212', '-', regex=False)          # unicode minus -> ASCII minus
    cleaned = cleaned.str.replace(r'(?<=\d)\.(?=\d{3}(?:\D|$))', '', regex=True)  # thousands dots
    cleaned = cleaned.str.replace(r',(?=\d{2}$)', '.', regex=True)                 # decimal comma

    # Real numbers keep their plain str() form, which pd.to_numeric parses exactly.
    final = cleaned.where(~numeric_mask, text)
    return pd.to_numeric(final, errors='coerce')

## **1. Contacts**

In [None]:
# 1.1 Load the Contacts workbook (Id is read as text) and preview the first rows
contacts = pd.read_excel('Contacts.xlsx', dtype=dict(Id=str))
contacts.head(5)

Unnamed: 0,Id,Contact Owner Name,Created Time,Modified Time
0,5805028000000645014,Rachel White,27.06.2023 11:28,22.12.2023 13:34
1,5805028000000872003,Charlie Davis,03.07.2023 11:31,21.05.2024 10:23
2,5805028000000889001,Bob Brown,02.07.2023 22:37,21.12.2023 13:17
3,5805028000000907006,Bob Brown,03.07.2023 05:44,29.12.2023 15:20
4,5805028000000939010,Nina Scott,04.07.2023 10:11,16.04.2024 16:14


In [None]:
contacts.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18548 entries, 0 to 18547
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Id                  18548 non-null  object
 1   Contact Owner Name  18548 non-null  object
 2   Created Time        18548 non-null  object
 3   Modified Time       18548 non-null  object
dtypes: object(4)
memory usage: 579.8+ KB


In [None]:
# Collect the distinct values of every column, then print them column by column
unique_values = {}
for col in contacts.columns:
    unique_values[col] = contacts[col].unique()
for col in unique_values:
    distinct = unique_values[col]
    print(f"\nUnique values in column '{col}':")
    print(distinct)
# Cell result: the number of distinct values per column
contacts.nunique()


Unique values in column 'Id':
['5805028000000645014' '5805028000000872003' '5805028000000889001' ...
 '5805028000056892018' '5805028000056892055' '5805028000056907001']

Unique values in column 'Contact Owner Name':
['Rachel White' 'Charlie Davis' 'Bob Brown' 'Nina Scott' 'Alice Johnson'
 'Ian Miller' 'Jane Smith' 'Julia Nelson' 'George King' 'Quincy Vincent'
 'Diana Evans' 'Kevin Parker' 'Ulysses Adams' 'Victor Barnes'
 'Yara Edwards' 'Paula Underwood' 'Mason Roberts' 'Ben Hall' 'Amy Green'
 'Cara Iverson' 'Oliver Taylor' 'Eva Kent' False 'Zachary Foster'
 'Sam Young' 'Wendy Clark' 'Tina Zhang' 'Derek James']

Unique values in column 'Created Time':
['27.06.2023 11:28' '03.07.2023 11:31' '02.07.2023 22:37' ...
 '21.06.2024 10:21' '21.06.2024 10:22' '21.06.2024 10:56']

Unique values in column 'Modified Time':
['22.12.2023 13:34' '21.05.2024 10:23' '21.12.2023 13:17' ...
 '21.06.2024 12:21' '21.06.2024 12:23' '21.06.2024 12:56']


Unnamed: 0,0
Id,18548
Contact Owner Name,28
Created Time,17921
Modified Time,16580


In [None]:
# 1.2 Count fully duplicated rows, drop them (renumbering the index), then count again
dup_before = contacts.duplicated().sum()
print('Contacts duplicated BEFORE:', dup_before)
contacts = contacts.drop_duplicates().reset_index(drop=True)
dup_after = contacts.duplicated().sum()
print('Contacts duplicated AFTER:', dup_after)

Contacts duplicated BEFORE: 0
Contacts duplicated AFTER: 0


In [None]:
# 1.3 Remove rows in which every column is missing and report how many were removed
initial_rows = contacts.shape[0]
contacts = contacts.dropna(axis=0, how='all')
contacts = contacts.reset_index(drop=True)
print(f'Contacts: removed empty rows: {initial_rows - len(contacts)}')

Contacts: removed empty rows: 0


In [None]:
# 1.4 Parse Created / Modified as "day.month.year hour:minute";
#     errors='coerce' turns any value that does not match the format into NaT
for time_col in ('Created Time', 'Modified Time'):
    contacts[time_col] = pd.to_datetime(contacts[time_col], errors='coerce', format='%d.%m.%Y %H:%M')

In [None]:
contacts.head()

Unnamed: 0,Id,Contact Owner Name,Created Time,Modified Time
0,5805028000000645014,Rachel White,2023-06-27 11:28:00,2023-12-22 13:34:00
1,5805028000000872003,Charlie Davis,2023-07-03 11:31:00,2024-05-21 10:23:00
2,5805028000000889001,Bob Brown,2023-07-02 22:37:00,2023-12-21 13:17:00
3,5805028000000907006,Bob Brown,2023-07-03 05:44:00,2023-12-29 15:20:00
4,5805028000000939010,Nina Scott,2023-07-04 10:11:00,2024-04-16 16:14:00


In [None]:
# 1.5 Tidy the owner names and store them as a category
owner = clean_text_soft(contacts['Contact Owner Name'])
# The raw column contains a boolean False, which the helper stringifies to 'False'
owner = owner.replace({'False': 'Unknown'})
contacts['Contact Owner Name'] = owner.astype('category')

In [None]:
contacts.head()

Unnamed: 0,Id,Contact Owner Name,Created Time,Modified Time
0,5805028000000645014,Rachel White,2023-06-27 11:28:00,2023-12-22 13:34:00
1,5805028000000872003,Charlie Davis,2023-07-03 11:31:00,2024-05-21 10:23:00
2,5805028000000889001,Bob Brown,2023-07-02 22:37:00,2023-12-21 13:17:00
3,5805028000000907006,Bob Brown,2023-07-03 05:44:00,2023-12-29 15:20:00
4,5805028000000939010,Nina Scott,2023-07-04 10:11:00,2024-04-16 16:14:00


In [None]:
# 1.6 Analyzing the number of contacts.
contacts['Contact Owner Name'].value_counts()

Unnamed: 0_level_0,count
Contact Owner Name,Unnamed: 1_level_1
Charlie Davis,2018
Ulysses Adams,1816
Julia Nelson,1769
Paula Underwood,1487
Quincy Vincent,1416
Nina Scott,1150
Ben Hall,1038
Victor Barnes,967
Cara Iverson,880
Rachel White,782


In [None]:
# 1.7 Summary
display(contacts.head(10))


Unnamed: 0,Id,Contact Owner Name,Created Time,Modified Time
0,5805028000000645014,Rachel White,2023-06-27 11:28:00,2023-12-22 13:34:00
1,5805028000000872003,Charlie Davis,2023-07-03 11:31:00,2024-05-21 10:23:00
2,5805028000000889001,Bob Brown,2023-07-02 22:37:00,2023-12-21 13:17:00
3,5805028000000907006,Bob Brown,2023-07-03 05:44:00,2023-12-29 15:20:00
4,5805028000000939010,Nina Scott,2023-07-04 10:11:00,2024-04-16 16:14:00
5,5805028000000942003,Alice Johnson,2023-07-04 12:57:00,2023-07-17 19:43:00
6,5805028000000961001,Bob Brown,2023-07-03 20:17:00,2023-10-05 10:44:00
7,5805028000000964025,Ian Miller,2023-07-04 15:40:00,2024-06-11 18:40:00
8,5805028000000964068,Alice Johnson,2023-07-04 22:03:00,2023-07-17 19:43:00
9,5805028000000968001,Jane Smith,2023-07-03 20:39:00,2024-06-18 10:10:00


In [None]:
contacts.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18548 entries, 0 to 18547
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   Id                  18548 non-null  object        
 1   Contact Owner Name  18548 non-null  category      
 2   Created Time        18548 non-null  datetime64[ns]
 3   Modified Time       18548 non-null  datetime64[ns]
dtypes: category(1), datetime64[ns](2), object(1)
memory usage: 454.2+ KB


In [None]:
# 1.8 Persist the cleaned table: Excel without the index column, plus a pickle copy
contacts.to_excel(excel_writer='contacts_df.xlsx', index=False)
contacts.to_pickle(path="contacts_df.pkl")

### **Analysis and recommendations (Contacts)**

**Структура данных**

Содержит 4 поля: Id, Contact Owner Name, Created Time, Modified Time.

**Качество данных**

- Дубликаты и полностью пустые строки удалены.

- Created Time и Modified Time приведены к типу datetime.

- Contact Owner Name очищено от лишних пробелов и приведено к категориальному типу.

- Некорректные значения (False) заменены на 'Unknown'.

**Рекомендации**

- Таблица готова для дальнейшего анализа

- Можно анализировать распределение контактов по менеджерам и динамику их создания.

## **2. Calls**

In [None]:
# 2.1 Load the Calls workbook (both identifier columns are read as text) and preview it
calls = pd.read_excel('Calls.xlsx', dtype=dict(Id=str, CONTACTID=str))
calls.head(5)

Unnamed: 0,Id,Call Start Time,Call Owner Name,CONTACTID,Call Type,Call Duration (in seconds),Call Status,Dialled Number,Outgoing Call Status,Scheduled in CRM,Tag
0,5805028000000805001,30.06.2023 08:43,John Doe,,Inbound,171.0,Received,,,,
1,5805028000000768006,30.06.2023 08:46,John Doe,,Outbound,28.0,Attended Dialled,,Completed,0.0,
2,5805028000000764027,30.06.2023 08:59,John Doe,,Outbound,24.0,Attended Dialled,,Completed,0.0,
3,5805028000000787003,30.06.2023 09:20,John Doe,5.805028000000645e+18,Outbound,6.0,Attended Dialled,,Completed,0.0,
4,5805028000000768019,30.06.2023 09:30,John Doe,5.805028000000645e+18,Outbound,11.0,Attended Dialled,,Completed,0.0,


In [None]:
calls.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95874 entries, 0 to 95873
Data columns (total 11 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Id                          95874 non-null  object 
 1   Call Start Time             95874 non-null  object 
 2   Call Owner Name             95874 non-null  object 
 3   CONTACTID                   91941 non-null  object 
 4   Call Type                   95874 non-null  object 
 5   Call Duration (in seconds)  95791 non-null  float64
 6   Call Status                 95874 non-null  object 
 7   Dialled Number              0 non-null      float64
 8   Outgoing Call Status        86875 non-null  object 
 9   Scheduled in CRM            86875 non-null  float64
 10  Tag                         0 non-null      float64
dtypes: float64(4), object(7)
memory usage: 8.0+ MB


In [None]:
# 2.2 Drop fully duplicated rows, reporting the count before and after
dup_before = calls.duplicated().sum()
print('Calls duplicated BEFORE:', dup_before)
calls = calls.drop_duplicates().reset_index(drop=True)
dup_after = calls.duplicated().sum()
print('Calls duplicated AFTER:', dup_after)

# Then drop rows where every column is missing
initial_rows = calls.shape[0]
calls = calls.dropna(axis=0, how='all')
calls = calls.reset_index(drop=True)
print(f'Calls: removed empty rows: {initial_rows - len(calls)}')

Calls duplicated BEFORE: 0
Calls duplicated AFTER: 0
Calls: removed empty rows: 0


In [None]:
calls.describe()

Unnamed: 0,Call Duration (in seconds),Dialled Number,Scheduled in CRM,Tag
count,95791.0,0.0,86875.0,0.0
mean,164.977263,,0.001635,
std,401.410826,,0.040397,
min,0.0,,0.0,
25%,4.0,,0.0,
50%,8.0,,0.0,
75%,98.0,,0.0,
max,7625.0,,1.0,


In [None]:
# Collect the distinct values of every column, then print them column by column
unique_values = {}
for col in calls.columns:
    unique_values[col] = calls[col].unique()
for col in unique_values:
    distinct = unique_values[col]
    print(f"\nUnique values in column '{col}':")
    print(distinct)
calls.nunique()


Unique values in column 'Id':
['5805028000000805001' '5805028000000768006' '5805028000000764027' ...
 '5805028000056832495' '5805028000056893619' '5805028000056893631']

Unique values in column 'Call Start Time':
['30.06.2023 08:43' '30.06.2023 08:46' '30.06.2023 08:59' ...
 '21.06.2024 15:29' '21.06.2024 15:30' '21.06.2024 15:31']

Unique values in column 'Call Owner Name':
['John Doe' 'Jane Smith' 'Alice Johnson' 'Bob Brown' 'Charlie Davis'
 'Diana Evans' 'Ethan Harris' 'Fiona Jackson' 'George King' 'Hannah Lee'
 'Ian Miller' 'Julia Nelson' 'Kevin Parker' 'Laura Quinn' 'Mason Roberts'
 'Nina Scott' 'Oliver Taylor' 'Paula Underwood' 'Quincy Vincent'
 'Rachel White' 'Sam Young' 'Tina Zhang' 'Ulysses Adams' 'Victor Barnes'
 'Wendy Clark' 'Xander Dean' 'Yara Edwards' 'Zachary Foster' 'Amy Green'
 'Ben Hall' 'Cara Iverson' 'Derek James' 'Eva Kent']

Unique values in column 'CONTACTID':
[nan '5805028000000645014' '5805028000000872003' ... '5805028000056727001'
 '5805028000056833185' '5805

Unnamed: 0,0
Id,95874
Call Start Time,68445
Call Owner Name,33
CONTACTID,15214
Call Type,3
Call Duration (in seconds),2619
Call Status,11
Dialled Number,0
Outgoing Call Status,4
Scheduled in CRM,2


In [None]:
# 2.3 Drop the two columns that hold no values at all (0 non-null in calls.info())
calls = calls.drop(['Dialled Number', 'Tag'], axis=1)

In [None]:
calls.dtypes

Unnamed: 0,0
Id,object
Call Start Time,object
Call Owner Name,object
CONTACTID,object
Call Type,object
Call Duration (in seconds),float64
Call Status,object
Outgoing Call Status,object
Scheduled in CRM,float64


In [None]:
# 2.4 Parse Call Start Time ("day.month.year hour:minute"); unparseable values become NaT
raw_start = calls['Call Start Time']
calls['Call Start Time'] = pd.to_datetime(raw_start, errors='coerce', format='%d.%m.%Y %H:%M')

In [None]:
# 2.5 Label columns: missing -> 'Unknown', then store as category
label_columns = ('Call Owner Name', 'Call Type', 'Call Status', 'Outgoing Call Status')
for col in label_columns:
    if col not in calls.columns:
        continue
    filled = calls[col].fillna('Unknown')
    calls[col] = filled.astype('category')

In [None]:
# Flag calls that are not linked to any contact (CONTACTID is missing)
calls['_missing_contact'] = calls['CONTACTID'].isnull()

# Print how many calls fall on each side of the flag
missing_summary = calls['_missing_contact'].value_counts()
print("\nCalls — missing CONTACTID:")
print(missing_summary)


Calls — missing CONTACTID:
_missing_contact
False    91941
True      3933
Name: count, dtype: int64


In [None]:
# 2.6 Clean the "Call Duration" column
duration_col = 'Call Duration (in seconds)'

# - convert to numeric
calls[duration_col] = to_numeric_smart(calls[duration_col])

# - statuses for which a missing duration is filled with 0
zero_statuses = {'Unattended Dialled','Missed','Cancelled','Scheduled Unattended','Overdue','Scheduled Unattended Delay'}
# - median of the strictly positive durations (NaN > 0 is False, so missing values are excluded)
med_dur = calls.loc[calls[duration_col] > 0, duration_col].median()

# - fill missing durations column-wise instead of with a row-by-row apply:
#   0 where the status is in zero_statuses, the median everywhere else
no_talk = calls['Call Status'].isin(zero_statuses)
filled = calls[duration_col].mask(calls[duration_col].isna() & no_talk, 0)
if pd.notna(med_dur):
    filled = filled.fillna(med_dur)

# - cast to integer only when every value is finite; otherwise keep floats and say so
#   (previously a failed cast was hidden by errors='ignore')
if np.isfinite(filled).all():
    filled = filled.astype('int64')
else:
    print('Call Duration: non-finite values remain, column left as float')
calls[duration_col] = filled

In [None]:
# 2.7 Map "Scheduled in CRM": 0 -> 'NO', 1 -> 'Yes', missing -> 'Unknown'; store as category
scheduled_labels = {0: 'NO', 1: 'Yes'}
calls['Scheduled in CRM'] = (
    calls['Scheduled in CRM']
    .replace(scheduled_labels)
    .fillna('Unknown')
    .astype('category')
)

In [None]:
# 2.8 Summary
display(calls.head(10))


Unnamed: 0,Id,Call Start Time,Call Owner Name,CONTACTID,Call Type,Call Duration (in seconds),Call Status,Outgoing Call Status,Scheduled in CRM,_missing_contact
0,5805028000000805001,2023-06-30 08:43:00,John Doe,,Inbound,171,Received,Unknown,Unknown,True
1,5805028000000768006,2023-06-30 08:46:00,John Doe,,Outbound,28,Attended Dialled,Completed,NO,True
2,5805028000000764027,2023-06-30 08:59:00,John Doe,,Outbound,24,Attended Dialled,Completed,NO,True
3,5805028000000787003,2023-06-30 09:20:00,John Doe,5.805028000000645e+18,Outbound,6,Attended Dialled,Completed,NO,False
4,5805028000000768019,2023-06-30 09:30:00,John Doe,5.805028000000645e+18,Outbound,11,Attended Dialled,Completed,NO,False
5,5805028000000790004,2023-06-30 12:09:00,John Doe,5.805028000000645e+18,Outbound,12,Attended Dialled,Completed,NO,False
6,5805028000000773022,2023-06-30 14:24:00,John Doe,5.805028000000645e+18,Outbound,4,Attended Dialled,Completed,NO,False
7,5805028000000879006,2023-07-03 13:06:00,Jane Smith,,Outbound,0,Unattended Dialled,Completed,NO,True
8,5805028000000870005,2023-07-03 13:08:00,Jane Smith,,Outbound,40,Attended Dialled,Completed,NO,True
9,5805028000000971050,2023-07-04 12:36:00,Jane Smith,,Outbound,0,Unattended Dialled,Completed,NO,True


In [None]:
calls.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95874 entries, 0 to 95873
Data columns (total 10 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   Id                          95874 non-null  object        
 1   Call Start Time             95874 non-null  datetime64[ns]
 2   Call Owner Name             95874 non-null  category      
 3   CONTACTID                   91941 non-null  object        
 4   Call Type                   95874 non-null  category      
 5   Call Duration (in seconds)  95874 non-null  int64         
 6   Call Status                 95874 non-null  category      
 7   Outgoing Call Status        95874 non-null  category      
 8   Scheduled in CRM            95874 non-null  category      
 9   _missing_contact            95874 non-null  bool          
dtypes: bool(1), category(5), datetime64[ns](1), int64(1), object(2)
memory usage: 3.5+ MB


In [None]:
# Quick report: missing-value counts for key fields, then distinct-value counts for label columns
key_fields = ['CONTACTID','Call Start Time','Outgoing Call Status','Scheduled in CRM']
label_fields = ['Call Owner Name','Call Type','Call Status','Outgoing Call Status','Scheduled in CRM']

print("\nCalls — NaN summary in key fields:")
for col in (name for name in key_fields if name in calls.columns):
    print(f"{col}: NaN = {calls[col].isna().sum()}")

print("\nCalls — category cardinality:")
for col in (name for name in label_fields if name in calls.columns):
    print(f"{col}: {calls[col].nunique()} unique")


Calls — NaN summary in key fields:
CONTACTID: NaN = 3933
Call Start Time: NaN = 0
Outgoing Call Status: NaN = 0
Scheduled in CRM: NaN = 0

Calls — category cardinality:
Call Owner Name: 33 unique
Call Type: 3 unique
Call Status: 11 unique
Outgoing Call Status: 5 unique
Scheduled in CRM: 3 unique


In [None]:
# 2.9 Persist the cleaned table: Excel without the index column, plus a pickle copy
calls.to_excel(excel_writer='calls_df.xlsx', index=False)
calls.to_pickle(path="calls_df.pkl")

### **Analysis and recommendations (Calls)**

**Структура данных**

Ключевые поля: Id, CONTACTID, Call Start Time, Call Owner Name, Call Type, Call Status, Call Duration (in seconds), Outgoing Call Status, Scheduled in CRM.

**Качество данных**

- Дубликаты и пустые строки удалены.

- Call Start Time приведено к datetime.

- Все категориальные поля (Call Owner Name, Call Type, Call Status, Outgoing Call Status, Scheduled in CRM) приведены к типу category, пропуски заменены на 'Unknown'.

- Scheduled in CRM: нормализация 0/1 → NO/Yes.

- Call Duration (in seconds): числа очищены и приведены к int; пропуски заменены:

  - 0 для статусов, где длительность не имеет смысла (Missed, Cancelled и др.),

  - медиана положительных длительностей — для остальных звонков.

- Добавлен флаг _missing_contact для звонков без CONTACTID.

**Рекомендации**

- Таблица пригодна для анализа динамики звонков по времени и распределения по типам/статусам.

- При сквозной аналитике использовать только строки с заполненным CONTACTID.

- Для join с Contacts/Deals использовать только строки, где `_missing_contact == False` (CONTACTID заполнен). При анализе нагрузки по менеджерам учитывать и звонки без CONTACTID.

## **3. Spend**

In [None]:
# 3.1 Load and preview dataset
# Spend has no 'Id' column (see its column list), so no dtype override is needed here.
spend = pd.read_excel('Spend.xlsx')
spend.head()

Unnamed: 0,Date,Source,Campaign,Impressions,Spend,Clicks,AdGroup,Ad
0,2023-07-03,Google Ads,gen_analyst_DE,6,0.0,0,,
1,2023-07-03,Google Ads,performancemax_eng_DE,4,0.01,1,,
2,2023-07-03,Facebook Ads,,0,0.0,0,,
3,2023-07-03,Google Ads,,0,0.0,0,,
4,2023-07-03,CRM,,0,0.0,0,,


In [None]:
spend.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20779 entries, 0 to 20778
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   Date         20779 non-null  datetime64[ns]
 1   Source       20779 non-null  object        
 2   Campaign     14785 non-null  object        
 3   Impressions  20779 non-null  int64         
 4   Spend        20779 non-null  float64       
 5   Clicks       20779 non-null  int64         
 6   AdGroup      13951 non-null  object        
 7   Ad           13951 non-null  object        
dtypes: datetime64[ns](1), float64(1), int64(2), object(4)
memory usage: 1.3+ MB


In [None]:
# 3.2 Drop fully duplicated rows, reporting the count before and after
dup_before = spend.duplicated().sum()
print('Spend duplicated BEFORE:', dup_before)
spend = spend.drop_duplicates().reset_index(drop=True)
dup_after = spend.duplicated().sum()
print('Spend duplicated AFTER:', dup_after)

# Then drop rows where every column is missing
initial_rows = spend.shape[0]
spend = spend.dropna(axis=0, how='all')
spend = spend.reset_index(drop=True)
print(f'Spend: removed empty rows: {initial_rows - len(spend)}')

Spend duplicated BEFORE: 917
Spend duplicated AFTER: 0
Spend: removed empty rows: 0


In [None]:
# Collect the distinct values of every column, then print them column by column
unique_values = {}
for col in spend.columns:
    unique_values[col] = spend[col].unique()
for col in unique_values:
    distinct = unique_values[col]
    print(f"\nUnique values in column '{col}':")
    print(distinct)
spend.nunique()


Unique values in column 'Date':
<DatetimeArray>
['2023-07-03 00:00:00', '2023-07-04 00:00:00', '2023-07-05 00:00:00',
 '2023-07-06 00:00:00', '2023-07-07 00:00:00', '2023-07-08 00:00:00',
 '2023-07-09 00:00:00', '2023-07-10 00:00:00', '2023-07-11 00:00:00',
 '2023-07-12 00:00:00',
 ...
 '2024-06-12 00:00:00', '2024-06-13 00:00:00', '2024-06-14 00:00:00',
 '2024-06-15 00:00:00', '2024-06-16 00:00:00', '2024-06-17 00:00:00',
 '2024-06-18 00:00:00', '2024-06-19 00:00:00', '2024-06-20 00:00:00',
 '2024-06-21 00:00:00']
Length: 355, dtype: datetime64[ns]

Unique values in column 'Source':
['Google Ads' 'Facebook Ads' 'CRM' 'Bloggers' 'Youtube Ads' 'SMM'
 'Tiktok Ads' 'Organic' 'Telegram posts' 'Webinar' 'Offline' 'Partnership'
 'Test' 'Radio']

Unique values in column 'Campaign':
['gen_analyst_DE' 'performancemax_eng_DE' nan '03.07.23women'
 '02.07.23wide_DE' '12.07.2023wide_DE' '05.07.23interests_DE'
 '04.07.23recentlymoved_DE' '07.07.23LAL_DE' '10.07.23wide_com_DE'
 '15.07.23b_DE' 'youtu

Unnamed: 0,0
Date,355
Source,14
Campaign,51
Impressions,4003
Spend,2859
Clicks,552
AdGroup,24
Ad,176


In [None]:
# 3.3 Ensure Date is datetime (spend.info() above already reports datetime64[ns]);
#     anything unparseable would become NaT
spend_dates = pd.to_datetime(spend['Date'], dayfirst=True, errors='coerce')
spend['Date'] = spend_dates

In [None]:
# 3.4 Label columns -> category; the three that contain gaps get 'Unknown' first
for col in ('Campaign', 'AdGroup', 'Ad'):
    spend[col] = spend[col].fillna('Unknown').astype('category')
spend['Source'] = spend['Source'].astype('category')

In [None]:
# 3.5 Run the numeric columns that are present through the shared number parser
numeric_cols = [name for name in ('Impressions', 'Clicks', 'Spend') if name in spend.columns]
for col in numeric_cols:
    spend[col] = spend[col].pipe(to_numeric_smart)

In [None]:
# 3.6 Summary
display(spend.head(10))


Unnamed: 0,Date,Source,Campaign,Impressions,Spend,Clicks,AdGroup,Ad
0,2023-07-03,Google Ads,gen_analyst_DE,6,0.0,0,Unknown,Unknown
1,2023-07-03,Google Ads,performancemax_eng_DE,4,0.01,1,Unknown,Unknown
2,2023-07-03,Facebook Ads,Unknown,0,0.0,0,Unknown,Unknown
3,2023-07-03,Google Ads,Unknown,0,0.0,0,Unknown,Unknown
4,2023-07-03,CRM,Unknown,0,0.0,0,Unknown,Unknown
5,2023-07-03,Facebook Ads,03.07.23women,187,3.3,6,women,b3
6,2023-07-03,Facebook Ads,03.07.23women,4,0.02,1,women,b1
7,2023-07-03,Bloggers,Unknown,0,0.0,0,Unknown,Unknown
8,2023-07-03,Youtube Ads,Unknown,0,0.0,0,Unknown,Unknown
9,2023-07-03,Facebook Ads,02.07.23wide_DE,61,0.58,0,wide,b4


In [None]:
spend.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19862 entries, 0 to 19861
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   Date         19862 non-null  datetime64[ns]
 1   Source       19862 non-null  category      
 2   Campaign     19862 non-null  category      
 3   Impressions  19862 non-null  int64         
 4   Spend        19862 non-null  float64       
 5   Clicks       19862 non-null  int64         
 6   AdGroup      19862 non-null  category      
 7   Ad           19862 non-null  category      
dtypes: category(4), datetime64[ns](1), float64(1), int64(2)
memory usage: 727.1 KB


In [None]:
# 3.7 Persist the cleaned table: Excel without the index column, plus a pickle copy
spend.to_excel(excel_writer='spend_df.xlsx', index=False)
spend.to_pickle(path="spend_df.pkl")

### **Analysis and recommendations (Spend)**

**Структура данных**

Ключевые поля: Date, Source, Campaign, Impressions, Clicks, Spend, а также AdGroup и Ad.

**Качество данных**

- Дубликаты и пустые строки удалены.

- Date приведено к datetime.

- Категориальные (Campaign, Source, AdGroup, Ad) очищены и приведены к типу category; пропуски заменены на 'Unknown'.

- Числовые поля (Impressions, Clicks, Spend) очищены от лишних символов и приведены к числам.

**Рекомендации**

- Таблица готова для анализа эффективности кампаний, источников и креативов.

- Категория 'Unknown' даёт возможность видеть долю неполных данных.

## **4. Deals**

In [None]:
# 4.1 Load the Deals workbook (Id and Contact Name are read as text) and preview it
deals = pd.read_excel('Deals.xlsx', dtype=dict.fromkeys(('Id', 'Contact Name'), str))
deals.head(5)

Unnamed: 0,Id,Deal Owner Name,Closing Date,Quality,Stage,Lost Reason,Page,Campaign,SLA,Content,Term,Source,Payment Type,Product,Education Type,Created Time,Course duration,Months of study,Initial Amount Paid,Offer Total Amount,Contact Name,City,Level of Deutsch
0,5805028000056864695,Ben Hall,,,New Lead,,/eng/test,03.07.23women,,v16,women,Facebook Ads,,,,21.06.2024 15:30,,,,,5805028000056849495,,
1,5805028000056859489,Ulysses Adams,,,New Lead,,/at-eng,,,,,Organic,,Web Developer,Morning,21.06.2024 15:23,6.0,,0.0,2000.0,5805028000056834471,,
2,5805028000056832357,Ulysses Adams,21.06.2024,D - Non Target,Lost,Non target,/at-eng,engwien_AT,00:26:43,b1-at,21_06_2024,Telegram posts,,,,21.06.2024 14:45,,,,,5805028000056854421,,
3,5805028000056824246,Eva Kent,21.06.2024,E - Non Qualified,Lost,Invalid number,/eng,04.07.23recentlymoved_DE,01:00:04,bloggersvideo14com,recentlymoved,Facebook Ads,,,,21.06.2024 13:32,,,,,5805028000056889351,,
4,5805028000056873292,Ben Hall,21.06.2024,D - Non Target,Lost,Non target,/eng,discovery_DE,00:53:12,website,,Google Ads,,,,21.06.2024 13:21,,,,,5805028000056876176,,


In [None]:
deals.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21595 entries, 0 to 21594
Data columns (total 23 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Id                   21593 non-null  object 
 1   Deal Owner Name      21564 non-null  object 
 2   Closing Date         14645 non-null  object 
 3   Quality              19340 non-null  object 
 4   Stage                21593 non-null  object 
 5   Lost Reason          16124 non-null  object 
 6   Page                 21593 non-null  object 
 7   Campaign             16067 non-null  object 
 8   SLA                  15533 non-null  object 
 9   Content              14147 non-null  object 
 10  Term                 12454 non-null  object 
 11  Source               21593 non-null  object 
 12  Payment Type         496 non-null    object 
 13  Product              3592 non-null   object 
 14  Education Type       3300 non-null   object 
 15  Created Time         21593 non-null 

In [None]:
# 4.2 Drop fully duplicated rows, reporting the count before and after
dup_before = deals.duplicated().sum()
print('Deals duplicated BEFORE:', dup_before)
deals = deals.drop_duplicates().reset_index(drop=True)
dup_after = deals.duplicated().sum()
print('Deals duplicated AFTER:', dup_after)

# Then drop rows where every column is missing
initial_rows = deals.shape[0]
deals = deals.dropna(axis=0, how='all')
deals = deals.reset_index(drop=True)
print(f'Deals: removed empty rows: {initial_rows - len(deals)}')

Deals duplicated BEFORE: 0
Deals duplicated AFTER: 0
Deals: removed empty rows: 1


In [None]:
deals.isna().sum()

Unnamed: 0,0
Id,1
Deal Owner Name,30
Closing Date,6949
Quality,2254
Stage,1
Lost Reason,5470
Page,1
Campaign,5527
SLA,6061
Content,7447


In [None]:
# 4.3 Drop unused columns
# DataFrame.drop accepts only errors='raise' or errors='ignore'; 'ignore' makes the
# cell safe to re-run after 'Page' has already been removed.
deals = deals.drop(columns=['Page'], errors='ignore')

display(deals.head())

Unnamed: 0,Id,Deal Owner Name,Closing Date,Quality,Stage,Lost Reason,Campaign,SLA,Content,Term,Source,Payment Type,Product,Education Type,Created Time,Course duration,Months of study,Initial Amount Paid,Offer Total Amount,Contact Name,City,Level of Deutsch
0,5805028000056864695,Ben Hall,,,New Lead,,03.07.23women,,v16,women,Facebook Ads,,,,21.06.2024 15:30,,,,,5805028000056849495,,
1,5805028000056859489,Ulysses Adams,,,New Lead,,,,,,Organic,,Web Developer,Morning,21.06.2024 15:23,6.0,,0.0,2000.0,5805028000056834471,,
2,5805028000056832357,Ulysses Adams,21.06.2024,D - Non Target,Lost,Non target,engwien_AT,00:26:43,b1-at,21_06_2024,Telegram posts,,,,21.06.2024 14:45,,,,,5805028000056854421,,
3,5805028000056824246,Eva Kent,21.06.2024,E - Non Qualified,Lost,Invalid number,04.07.23recentlymoved_DE,01:00:04,bloggersvideo14com,recentlymoved,Facebook Ads,,,,21.06.2024 13:32,,,,,5805028000056889351,,
4,5805028000056873292,Ben Hall,21.06.2024,D - Non Target,Lost,Non target,discovery_DE,00:53:12,website,,Google Ads,,,,21.06.2024 13:21,,,,,5805028000056876176,,


In [None]:
# 4.4 Label columns: missing -> 'Unknown', tidy the text, store as category
deal_label_columns = ['Quality','Stage','Lost Reason','Campaign','Product', 'Education Type','City','Level of Deutsch','Source', 'Payment Type','Deal Owner Name', 'Term', 'Content']
for col in deal_label_columns:
    if col not in deals.columns:
        continue
    tidy = clean_text_soft(deals[col].fillna('Unknown'))
    deals[col] = tidy.astype('category')

In [None]:
# 4.5 Remove rows whose Education Type is the spreadsheet error marker '#REF!'
deals = deals[deals['Education Type'] != '#REF!'].copy()
# When the column is categorical, also drop the now-empty '#REF!' label so it does not
# show up as a zero-count category in value_counts() / groupby()
if isinstance(deals['Education Type'].dtype, pd.CategoricalDtype):
    deals['Education Type'] = deals['Education Type'].cat.remove_unused_categories()

In [None]:
# 4.6 Parse the two date columns, each with its own format; unparseable values become NaT
date_formats = {'Created Time': '%d.%m.%Y %H:%M', 'Closing Date': '%d.%m.%Y'}
for col, fmt in date_formats.items():
    deals[col] = pd.to_datetime(deals[col], errors='coerce', format=fmt)

# Open-deal flag: True where Closing Date is missing (or failed to parse)
deals['_open_deal'] = deals['Closing Date'].isnull()
open_counts = deals['_open_deal'].value_counts()
print("\nDeals — open vs closed:")
print(open_counts)


Deals — open vs closed:
_open_deal
False    14645
True      6948
Name: count, dtype: int64


In [None]:
# 4.7 Course duration / Months of study: parse to numbers, missing -> 0, store as nullable Int8
for col in ('Course duration', 'Months of study'):
    if col not in deals.columns:
        continue
    parsed = to_numeric_smart(deals[col])
    deals[col] = parsed.fillna(0).astype('Int8')

In [None]:
# 4.8 Amount fields: parse to numbers and store as float64 (missing stays NaN)
for col in ('Initial Amount Paid', 'Offer Total Amount'):
    if col not in deals.columns:
        continue
    parsed = to_numeric_smart(deals[col])
    deals[col] = parsed.astype('float64')

In [None]:
deals

Unnamed: 0,Id,Deal Owner Name,Closing Date,Quality,Stage,Lost Reason,Campaign,SLA,Content,Term,Source,Payment Type,Product,Education Type,Created Time,Course duration,Months of study,Initial Amount Paid,Offer Total Amount,Contact Name,City,Level of Deutsch,_open_deal
0,5805028000056864695,Ben Hall,NaT,Unknown,New Lead,Unknown,03.07.23women,,v16,women,Facebook Ads,Unknown,Unknown,Unknown,2024-06-21 15:30:00,0,0,,,5805028000056849495,Unknown,Unknown,True
1,5805028000056859489,Ulysses Adams,NaT,Unknown,New Lead,Unknown,Unknown,,Unknown,Unknown,Organic,Unknown,Web Developer,Morning,2024-06-21 15:23:00,6,0,0.0,2000.0,5805028000056834471,Unknown,Unknown,True
2,5805028000056832357,Ulysses Adams,2024-06-21,D - Non Target,Lost,Non target,engwien_AT,00:26:43,b1-at,21_06_2024,Telegram posts,Unknown,Unknown,Unknown,2024-06-21 14:45:00,0,0,,,5805028000056854421,Unknown,Unknown,False
3,5805028000056824246,Eva Kent,2024-06-21,E - Non Qualified,Lost,Invalid number,04.07.23recentlymoved_DE,01:00:04,bloggersvideo14com,recentlymoved,Facebook Ads,Unknown,Unknown,Unknown,2024-06-21 13:32:00,0,0,,,5805028000056889351,Unknown,Unknown,False
4,5805028000056873292,Ben Hall,2024-06-21,D - Non Target,Lost,Non target,discovery_DE,00:53:12,website,Unknown,Google Ads,Unknown,Unknown,Unknown,2024-06-21 13:21:00,0,0,,,5805028000056876176,Unknown,Unknown,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21588,5805028000000970006,Jane Smith,2023-07-04,E - Non Qualified,Lost,Duplicate,03.07.23women,,b3,women,Facebook Ads,Unknown,Unknown,Unknown,2023-07-04 07:10:00,0,0,,,5805028000000979006,Unknown,Unknown,False
21589,5805028000000948010,Jane Smith,2023-08-29,B - Medium,Lost,needs time to think,03.07.23women,,b3,women,Facebook Ads,Unknown,Unknown,Unknown,2023-07-04 07:10:00,0,0,,,5805028000000979006,Unknown,Unknown,False
21590,5805028000000945016,Jane Smith,2023-08-29,A - High,Lost,Changed Decision,02.07.23wide_DE,"56 days, 19:01:59",b3,wide,Facebook Ads,Unknown,Unknown,Unknown,2023-07-03 20:39:00,0,0,,,5805028000000968001,Unknown,Unknown,False
21591,5805028000000927004,Bob Brown,2023-07-09,D - Non Target,Lost,Does not speak English,03.07.23women,,b3,women,Facebook Ads,Unknown,Unknown,Unknown,2023-07-03 20:17:00,0,0,,,5805028000000961001,Unknown,Unknown,False


In [None]:
# 4.8.1 Payment/Offer Categories
def classify_payment(x):
    """Map a single ``Initial Amount Paid`` value to a payment category label.

    Args:
        x: One amount; may be missing (``NaN`` / ``None``).

    Returns:
        ``"Unknown"`` when the value is missing, ``"No Payment"`` when it
        equals 0, ``"Demo Access"`` when it is below 10, and
        ``"Regular Payment"`` for everything from 10 upwards.
    """
    if pd.isna(x):
        return "Unknown"  # no data recorded
    if x == 0:
        return "No Payment"
    # Any non-zero value under 10 (negative values included) is treated as a
    # symbolic payment for demo access.
    return "Demo Access" if x < 10 else "Regular Payment"

# Label every deal with the category derived from its initial payment amount.
deals['Payment Category'] = (
    deals['Initial Amount Paid']
    .apply(classify_payment)
)

# Classify Offer Total Amount
def classify_offer(x):
    """Map a single ``Offer Total Amount`` value to an offer category label.

    Args:
        x: One amount; may be missing (``NaN`` / ``None``).

    Returns:
        ``"Unknown"`` when the value is missing, ``"No Offer"`` when it
        equals 0, ``"Demo Access"`` when it is below 10, and
        ``"Regular Offer"`` for everything from 10 upwards.
    """
    if pd.isna(x):
        return "Unknown"
    if x == 0:
        return "No Offer"
    # Any non-zero value under 10 (negative values included) falls here.
    if x < 10:
        return "Demo Access"
    return "Regular Offer"

# Label every deal with the category derived from its total offer amount.
deals['Offer Category'] = (
    deals['Offer Total Amount']
    .apply(classify_offer)
)


- Создаёт новые категориальные признаки на основе полей `Initial Amount Paid` и `Offer Total Amount`.
- Классификация строится по следующим правилам:
  - **Unknown** → значение отсутствует (`NaN`), данные не внесены в CRM.
  - **No Payment / No Offer** → значение равно `0`, то есть клиент ничего не заплатил или не получил предложение.
  - **Demo Access** → символическая сумма (меньше 10), например 1, 9 — это демо-доступ или тестовая оплата.
  - **Regular Payment / Regular Offer** → значения от 10 и выше (`>= 10`), то есть стандартные платежи и офферы.  

**Зачем:**
- Это позволяет разделить клиентов на разные категории и анализировать:
  - долю сделок без оплат,
  - долю демо-студентов,
  - долю полноценных платёжных клиентов,
  - долю сделок, где в CRM отсутствуют данные.  
- Такая категоризация делает данные более интерпретируемыми и полезными для анализа воронки и качества лидов.  

**Итог:**
- В таблице появились новые столбцы:  
  - `Payment Category`  
  - `Offer Category`  
- Теперь можно строить графики и отчёты не только по суммам, но и по категориям


In [None]:
# 4.9 Replace zero amounts on 'Payment Done' deals with the group median
# For rows where Stage == 'Payment Done' and the amount is exactly 0, substitute
# the median of the positive amounts of that same group (the median is robust
# to outliers). NaN amounts are deliberately left untouched.
# NOTE(review): 'Payment Category' / 'Offer Category' are derived in 4.8.1,
# i.e. before this step, so they keep describing the original zeros — confirm
# that this is intended.
def _replace_zero_with_group_median(frame, group_mask, column):
    """Overwrite zeros of `column` inside `group_mask` with the group median.

    Args:
        frame: DataFrame that is modified in place.
        group_mask: Boolean Series selecting the rows of the group.
        column: Name of the numeric amount column.

    Returns:
        The median of the positive values of `column` within the group
        (NaN when the group has no positive value).
    """
    median = frame.loc[group_mask & (frame[column] > 0), column].median()
    # With no positive amount in the group the median is NaN; writing it would
    # turn the zeros into NaN and erase the "erroneous zero" signal, so skip.
    if pd.notna(median):
        frame.loc[group_mask & (frame[column] == 0), column] = median
    return median


if 'Stage' in deals.columns:
    mask_paid = deals['Stage'].astype(str).str.strip().eq('Payment Done')

    # Initial Amount Paid: replace only zeros; keep NaN values
    if 'Initial Amount Paid' in deals.columns:
        med_init = _replace_zero_with_group_median(deals, mask_paid, 'Initial Amount Paid')

    # Offer Total Amount: same rule
    if 'Offer Total Amount' in deals.columns:
        med_offer = _replace_zero_with_group_median(deals, mask_paid, 'Offer Total Amount')

- Проверяет все сделки на стадии **"Payment Done"** (оплата проведена).
- Если в таких сделках сумма платежа или сумма предложения (`Initial Amount Paid`, `Offer Total Amount`) равна **0**, то это считается ошибкой данных (ведь у оплаченной сделки не может быть нуля).
- В этих случаях мы подставляем **медианное значение** по всем корректным сделкам с оплатой.  
   Медиана выбрана, потому что она устойчива к выбросам и лучше отражает «типичный» платёж, чем среднее.  

**Важно:**
- Значения `NaN` **оставляем** без изменений.  
  Это даёт возможность в аналитике показать:
  - сколько сделок имеют корректные суммы,
  - сколько записей ошибочно содержат 0,
  - сколько сделок вообще не содержат данных об оплате.  

**Почему так:**
- Если заменить `NaN` тоже, то потеряем возможность показать проблемы CRM (например, когда сделка помечена как оплаченная, но суммы просто не внесены).   

**Итог:**
- Ошибки «0 у Payment Done» исправлены → данные стали чище.  
- `NaN` оставлены → можно проанализировать и качество работы CRM.  


In [None]:
# 4.10 SLA normalize -> timedelta and hours
if 'SLA' in deals.columns:
    # Missing SLA values become a zero duration; every value is then rendered
    # as text so that mixed representations go through a single parser, and the
    # literal "null" is treated as missing.
    deals['SLA'] = deals['SLA'].fillna(pd.Timedelta(seconds=0)).astype(str).replace({"null": np.nan})
    # Unparseable strings are coerced to NaT instead of raising.
    deals['SLA'] = pd.to_timedelta(deals['SLA'], errors='coerce')
    # Numeric companion column for aggregations (hours as float).
    deals['_SLA_hours'] = deals['SLA'].dt.total_seconds() / 3600
    # 'SLA' is intentionally NOT rebuilt from '_SLA_hours': the float round trip
    # introduced nanosecond errors (e.g. 01:00:04 -> 01:00:03.999999999).

In [None]:
 # 4.11 Normalize City
# Map free-form addresses and placeholders to a plain city name.
replace_city = {
    # The town in this address is Hildburghausen; Thüringen is the federal state.
    'Karl-Liebknecht str. 24, Hildburghausen, Thüringen': 'Hildburghausen',
    'Halle (Saale)': 'Halle',
    'Vor Ebersbach 1, 77761 Schiltach': 'Schiltach',
    'Poland , Gdansk , Al. Grunwaldzka 7, ap. 1a': 'Gdansk',
    '-': 'Unknown',
    # astype(str) renders a missing value as the literal 'nan'; fold it into
    # the 'Unknown' bucket instead of creating a bogus city.
    'nan': 'Unknown',
}
deals['City'] = deals['City'].astype(str).replace(replace_city).astype('category')

In [None]:
# 4.12 Normalize Level of Deutsch
if 'Level of Deutsch' in deals.columns:
    # Cyrillic look-alike letters (А, В, С, Б in either case) -> Latin A, B, C, B
    trans = str.maketrans('АаВвСсБб', 'AABBCCBB')

    lvl = (
        deals['Level of Deutsch']
        .astype(str)
        .str.strip()
        .str.upper()                                  # b2 -> B2
        .apply(lambda text: text.translate(trans))
        # Strip everything except the characters a level code may contain
        # (spaces, symbols, words such as "LEVEL", "+", ...).
        .str.replace(r'[^ABC12]', '', regex=True)
    )

    # Keep a value only when what is left is exactly one letter A|B|C
    # followed by one digit 1|2; anything else becomes missing here.
    valid = lvl.str.extract(r'^([ABC][12])$', expand=False)

    # Missing / unrecognised levels are labelled 'Unknown'; store as category.
    deals['Level of Deutsch'] = valid.fillna('Unknown').astype('category')

    # (optional) inspect the distribution
    print("Level of Deutsch distribution:")
    print(deals['Level of Deutsch'].value_counts(dropna=False))


Level of Deutsch distribution:
Level of Deutsch
Unknown    20591
B1           694
B2           151
A2           110
C1            25
A1            19
C2             3
Name: count, dtype: int64


In [None]:
# 4.13 Summary: preview the first ten rows of the cleaned Deals table
display(deals.head(n=10))


Unnamed: 0,Id,Deal Owner Name,Closing Date,Quality,Stage,Lost Reason,Campaign,SLA,Content,Term,Source,Payment Type,Product,Education Type,Created Time,Course duration,Months of study,Initial Amount Paid,Offer Total Amount,Contact Name,City,Level of Deutsch,_open_deal,Payment Category,Offer Category,_SLA_hours
0,5805028000056864695,Ben Hall,NaT,Unknown,New Lead,Unknown,03.07.23women,0 days 00:00:00,v16,women,Facebook Ads,Unknown,Unknown,Unknown,2024-06-21 15:30:00,0,0,,,5805028000056849495,Unknown,Unknown,True,Unknown,Unknown,0.0
1,5805028000056859489,Ulysses Adams,NaT,Unknown,New Lead,Unknown,Unknown,0 days 00:00:00,Unknown,Unknown,Organic,Unknown,Web Developer,Morning,2024-06-21 15:23:00,6,0,0.0,2000.0,5805028000056834471,Unknown,Unknown,True,No Payment,Regular Offer,0.0
2,5805028000056832357,Ulysses Adams,2024-06-21,D - Non Target,Lost,Non target,engwien_AT,0 days 00:26:43,b1-at,21_06_2024,Telegram posts,Unknown,Unknown,Unknown,2024-06-21 14:45:00,0,0,,,5805028000056854421,Unknown,Unknown,False,Unknown,Unknown,0.445278
3,5805028000056824246,Eva Kent,2024-06-21,E - Non Qualified,Lost,Invalid number,04.07.23recentlymoved_DE,0 days 01:00:03.999999999,bloggersvideo14com,recentlymoved,Facebook Ads,Unknown,Unknown,Unknown,2024-06-21 13:32:00,0,0,,,5805028000056889351,Unknown,Unknown,False,Unknown,Unknown,1.001111
4,5805028000056873292,Ben Hall,2024-06-21,D - Non Target,Lost,Non target,discovery_DE,0 days 00:53:12.000000001,website,Unknown,Google Ads,Unknown,Unknown,Unknown,2024-06-21 13:21:00,0,0,,,5805028000056876176,Unknown,Unknown,False,Unknown,Unknown,0.886667
5,5805028000056828429,Paula Underwood,NaT,Unknown,Need a consultation,Unknown,youtube_shorts_DE,0 days 01:33:10,bloggersvideo2june,Com_august,Youtube Ads,Unknown,Unknown,Unknown,2024-06-21 13:02:00,0,0,,,5805028000056833279,Unknown,Unknown,True,Unknown,Unknown,1.552778
6,5805028000056893379,Ulysses Adams,NaT,Unknown,Need To Call,Unknown,Unknown,0 days 00:00:00,Unknown,Unknown,Organic,Unknown,Unknown,Unknown,2024-06-21 12:52:00,0,0,,,5805028000056832215,Unknown,Unknown,True,Unknown,Unknown,0.0
7,5805028000056849262,Eva Kent,NaT,Unknown,Need a consultation,Unknown,brand_search_eng_DE,0 days 02:12:29.000000001,152789402780_{region_name}_695563281558,it career hub,Google Ads,Unknown,Unknown,Unknown,2024-06-21 12:44:00,0,0,,,5805028000056833242,Unknown,Unknown,True,Unknown,Unknown,2.208056
8,5805028000056845137,Rachel White,2024-06-21,E - Non Qualified,Lost,Duplicate,Unknown,0 days 00:00:00,Unknown,Unknown,Organic,Unknown,Unknown,Unknown,2024-06-21 12:40:00,0,0,,,5805028000056849237,Unknown,Unknown,False,Unknown,Unknown,0.0
9,5805028000056864442,Victor Barnes,2024-06-21,D - Non Target,Lost,Inadequate,1406start,0 days 00:10:08,Unknown,Unknown,CRM,Unknown,Unknown,Unknown,2024-06-21 12:39:00,0,0,,,5805028000039274091,Unknown,Unknown,False,Unknown,Unknown,0.168889


<class 'pandas.core.frame.DataFrame'>
Index: 21593 entries, 0 to 21592
Data columns (total 26 columns):
 #   Column               Non-Null Count  Dtype          
---  ------               --------------  -----          
 0   Id                   21593 non-null  object         
 1   Deal Owner Name      21593 non-null  category       
 2   Closing Date         14645 non-null  datetime64[ns] 
 3   Quality              21593 non-null  category       
 4   Stage                21593 non-null  category       
 5   Lost Reason          21593 non-null  category       
 6   Campaign             21593 non-null  category       
 7   SLA                  21593 non-null  timedelta64[ns]
 8   Content              21593 non-null  category       
 9   Term                 21593 non-null  category       
 10  Source               21593 non-null  category       
 11  Payment Type         21593 non-null  category       
 12  Product              21593 non-null  category       
 13  Education Type       

In [None]:
# Print column names, non-null counts and dtypes of the cleaned Deals table.
deals.info()

<class 'pandas.core.frame.DataFrame'>
Index: 21593 entries, 0 to 21592
Data columns (total 26 columns):
 #   Column               Non-Null Count  Dtype          
---  ------               --------------  -----          
 0   Id                   21593 non-null  object         
 1   Deal Owner Name      21593 non-null  category       
 2   Closing Date         14645 non-null  datetime64[ns] 
 3   Quality              21593 non-null  category       
 4   Stage                21593 non-null  category       
 5   Lost Reason          21593 non-null  category       
 6   Campaign             21593 non-null  category       
 7   SLA                  21593 non-null  timedelta64[ns]
 8   Content              21593 non-null  category       
 9   Term                 21593 non-null  category       
 10  Source               21593 non-null  category       
 11  Payment Type         21593 non-null  category       
 12  Product              21593 non-null  category       
 13  Education Type       

In [None]:
# 4.14 Save the cleaned Deals table
# Written twice: an Excel workbook (row index omitted) and a pickle file.
deals.to_excel(excel_writer='deals_df.xlsx', index=False)
deals.to_pickle(path="deals_df.pkl")

In [None]:
# Confirmation message shown once the export cells have run.
print("✅ All DataFrames saved both to Excel (.xlsx) and Pickle (.pkl)")

✅ All DataFrames saved both to Excel (.xlsx) and Pickle (.pkl)


###**Analysis and recommendations (Deals)**

**Структура данных**

- Таблица Deals содержит ключевые бизнес-поля: ID сделки, владельца, стадии (Stage), качество (Quality), суммы (Initial Amount Paid, Offer Total Amount), даты (Created Time, Closing Date), SLA, уровень языка, продукт и пр.

**Дубликаты и пустые строки**

- Все дубликаты и полностью пустые строки удалены → в выборке каждая сделка представлена единожды.

**Даты**

- Created Time всегда заполнено → корректная база для анализа динамики.

- Closing Date может быть NaT (сделка ещё открыта). Это норма для CRM: такие записи помечены флагом _open_deal.

**Числовые поля**

- Course duration, Months of study приведены к целым числам.

- Суммы (Initial Amount Paid, Offer Total Amount) приведены к float, очищены от символов валюты и форматирования.

**Категориальные поля**

- Пропуски заменены на 'Unknown' (например, в Source, Payment Type, Deal Owner Name).

- Это позволяет сохранить все строки и корректно анализировать распределения.

- В City исправлены длинные адреса на названия городов

- Level of Deutsch нормализован: кириллица переведена в латиницу, регистр приведён к верхнему, лишние символы удалены. Допустимые уровни → A1/A2/B1/B2/C1/C2, всё остальное → 'Unknown'.

**SLA**

Поле SLA приведено к формату Timedelta, рассчитаны часы (_SLA_hours). Теперь можно анализировать время реакции менеджеров.

**Общие выводы по качеству данных**

- Таблица готова к использованию в дальнейших аналитических шагах (описательная статистика, временные ряды, анализ воронки).

- Наличие флага _open_deal и столбца _SLA_hours позволит гибко разделять активные и завершённые сделки и анализировать операционную эффективность.

- Категория 'Unknown' помогает явно учитывать пропуски, не теряя строки.

*В итоге структура связей такая:*

**Spend → Deals → Contacts → Calls**

**Spend** даёт рекламные затраты,

**Deals** показывает сделки (с привязкой к кампаниям и контактам),

**Contacts** связывает всё с конкретными людьми,

**Calls** фиксирует звонки с этими людьми.

**Почему не удаляем строки без CONTACTID?**

Чтобы не терять данные для операционных метрик по звонкам (нагрузка на менеджеров, распределение статусов). Для сквозной аналитики используем фильтр CONTACTID.notna().

**Почему в Closing Date оставляем NaT?**

Потому что открытые сделки не закрыты — это норма. Подставлять фиктивные даты нельзя: ломает длительности и тренды.

**Зачем делать категории (astype('category'))?**

Быстрее группировки и агрегации, меньше расход памяти, чёткая типизация категориальных признаков.

**Почему медиана, а не среднее для сумм?**

Медиана устойчивее к выбросам (одно большое значение не уводит результат).

###**Почему заполняем пропуски именно 'Unknown'?**

**1. Категориальные поля** (Stage, Quality, Source, Payment Type, City и т.п.)

- Если оставить NaN, то при группировке Pandas, Excel и BI-системы либо создают отдельную «пустую» категорию, либо вовсе исключают такие строки из расчётов.

- Заполнив 'Unknown', мы сохраняем строку в выборке и явно говорим: «Информация отсутствует».

- Так проще анализировать: можно всегда видеть, какая доля данных не заполнена.

**2. Унификация**

- Вместо того чтобы часть пропусков была NaN, часть None, часть пустая строка '' → всё сведено в одно значение 'Unknown'.

- Это упрощает код (нет необходимости каждый раз проверять разные типы пропусков).

**3. Корректность джойнов**

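- В SQL-джойнах NULL не сопоставляется ни с чем → такие строки теряются (в `pandas.merge` NaN-ключи, наоборот, совпадают друг с другом, что тоже даёт неочевидный результат).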

- 'Unknown' сопоставляется с 'Unknown' → в merge строки сохраняются.

**4. Прозрачность для бизнеса**

- В отчёте вместо пустого поля стоит 'Unknown' — так он читается лучше.

- Можно сразу объяснить: «Вот процент данных, где не указано».

**Важный нюанс**

- Для ключевых полей (например, Id, CONTACTID) мы не ставили 'Unknown', а оставляли NaN + добавляли флаг.
Потому что Unknown как ID может ввести в заблуждение при джойне.

- 'Unknown' — это стратегия именно для категориальных признаков, которые используются в аналитике (Stage, Source, City и т.п.).

**Логика**

- В **категориальных колонках** мы ставили 'Unknown', чтобы:

  - строки не терялись в анализе,

  - группировки работали корректно,

  - было видно долю пропусков.

- В **ключевых колонках** (Id, CONTACTID, Contact Name) мы NaN не трогали, чтобы не ломать джойны.

Напоминание для сквозной аналитики:

Deals.Term ⇔ Spend.AdGroup

Deals.Content ⇔ Spend.Ad