### **Обработка данных**

In [1]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import pickle

from matplotlib.ticker import FormatStrFormatter
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import accuracy_score, multilabel_confusion_matrix, mean_absolute_error
from sklearn.model_selection import train_test_split, cross_validate, cross_val_score 
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier



*Функции*

In [2]:
def print_stats(df):
    """Print how many rows of ``df`` are fully populated and their share.

    Args:
        df: pandas DataFrame to inspect.

    Prints two lines: the number of rows without any missing value, and that
    number as a percentage of all rows rounded to two decimals. For an empty
    frame the percentage is reported as 0.0 instead of raising
    ZeroDivisionError.
    """
    total_rows = len(df)
    # Drop incomplete rows once and reuse the count for both printed lines.
    complete_rows = len(df.dropna())
    # Same arithmetic as before (count / one percent of rows) so rounding is unchanged.
    percent_complete = round(complete_rows / (total_rows / 100), 2) if total_rows else 0.0
    print(f"Количество полностью заполненных объектов: {complete_rows}")
    print(f"Процент полностью заполненных объектов: {percent_complete}")

In [3]:
def print_missing_values(df):
    """Print the percentage of missing values per column, largest first.

    Args:
        df: pandas DataFrame to inspect.

    Each column's percentage is its count of missing cells divided by one
    percent of the row count; the resulting Series is printed sorted in
    descending order.
    """
    one_percent_of_rows = len(df) / 100
    missing_per_column = df.isna().sum()
    ranked_share = (missing_per_column / one_percent_of_rows).sort_values(ascending=False)
    print(f"Процент пропущенных значений:\n{ranked_share}")

In [4]:
def print_uniq_and_top(df, filled_column, bind_column):
    """Print the distinct values and the most frequent value of ``bind_column``.

    Args:
        df: DataFrame to inspect. It is used exactly as passed in; no filtering
            by ``filled_column`` happens here, so callers pre-filter the rows.
        filled_column: column name that appears only in the printed labels.
        bind_column: column whose unique values and top value are reported.

    The top value is read from ``value_counts()`` so the function also works
    for numeric columns, where ``describe()`` has no 'top' entry. When the
    column holds no non-null values, NaN is printed as the top value.
    """
    column_values = df[bind_column]
    frequencies = column_values.value_counts()
    # value_counts() is sorted by frequency, so the first index label is the top value;
    # the count check guards against all-zero counts of an unused categorical.
    if len(frequencies) and frequencies.iloc[0] > 0:
        most_frequent = frequencies.index[0]
    else:
        most_frequent = float("nan")
    print("Уникальные значения ", bind_column, " для строк с заполненным ", filled_column, ":", column_values.unique())
    print("Самое часто встречаемое значение ", bind_column, " для строк с заполненным ", filled_column, ":", most_frequent)

#### *Data Preparation*

In [63]:
# Load the raw sessions table from the pickle file.
# NOTE: pickle.load can execute arbitrary code, so the file must come from a trusted source.
# No placeholder value is assigned first: if the load fails, df_out_pkl stays undefined
# instead of silently being the integer 0.
with open("data/ga_sessions.pkl", 'rb') as f:
    df_out_pkl = pickle.load(f)
df_out_pkl.head()

Unnamed: 0,session_id,client_id,visit_date,visit_time,visit_number,utm_source,utm_medium,utm_campaign,utm_adcontent,utm_keyword,device_category,device_os,device_brand,device_model,device_screen_resolution,device_browser,geo_country,geo_city
0,9055434745589932991.1637753792.1637753792,2108382700.1637757,2021-11-24,14:36:32,1,ZpYIoDJMcFzVoPFsHGJL,banner,LEoPHuyFvzoNfnzGgfcd,vCIpmpaGBnIQhyYNkXqp,puhZPIYqKXeFPaUviSjo,mobile,Android,Huawei,,360x720,Chrome,Russia,Zlatoust
1,905544597018549464.1636867290.1636867290,210838531.16368672,2021-11-14,08:21:30,1,MvfHsxITijuriZxsqZqt,cpm,FTjNLDyTrXaWYgZymFkV,xhoenQgDQsgfEPYNPwKO,IGUCNvHlhfHpROGclCit,mobile,Android,Samsung,,385x854,Samsung Internet,Russia,Moscow
2,9055446045651783499.1640648526.1640648526,2108385331.164065,2021-12-28,02:42:06,1,ZpYIoDJMcFzVoPFsHGJL,banner,LEoPHuyFvzoNfnzGgfcd,vCIpmpaGBnIQhyYNkXqp,puhZPIYqKXeFPaUviSjo,mobile,Android,Huawei,,360x720,Chrome,Russia,Krasnoyarsk
3,9055447046360770272.1622255328.1622255328,2108385564.1622252,2021-05-29,05:00:00,1,kjsLglQLzykiRbcDiGcD,cpc,,NOBKLgtuvqYWkXQHeYWM,,mobile,,Xiaomi,,393x786,Chrome,Russia,Moscow
4,9055447046360770272.1622255345.1622255345,2108385564.1622252,2021-05-29,05:00:00,2,kjsLglQLzykiRbcDiGcD,cpc,,,,mobile,,Xiaomi,,393x786,Chrome,Russia,Moscow


In [64]:
print_stats(df_out_pkl)

Количество полностью заполненных объектов: 14940
Процент полностью заполненных объектов: 0.8


In [65]:
print_missing_values(df_out_pkl)

Процент пропущенных значений:
device_model                99.121633
utm_keyword                 58.174009
device_os                   57.533002
utm_adcontent               18.043410
utm_campaign                11.806346
device_brand                 6.380394
utm_source                   0.005215
geo_country                  0.000000
device_browser               0.000000
device_screen_resolution     0.000000
session_id                   0.000000
device_category              0.000000
client_id                    0.000000
utm_medium                   0.000000
visit_number                 0.000000
visit_time                   0.000000
visit_date                   0.000000
geo_city                     0.000000
dtype: float64


##### Обработка пропущенных значений device_model

In [66]:
df_out_pkl['device_model'].describe()

count                    16338
unique                     104
top       AuMdmADEIoPXiWpTsBEj
freq                      9778
Name: device_model, dtype: object

In [67]:
# Most frequent device_model value, taken from the 'top' entry of the column summary.
top_device_model = df_out_pkl['device_model'].describe().loc['top']

Вывод пустых значений device_model

In [68]:
df_out_pkl[df_out_pkl['device_model'].isna()]

Unnamed: 0,session_id,client_id,visit_date,visit_time,visit_number,utm_source,utm_medium,utm_campaign,utm_adcontent,utm_keyword,device_category,device_os,device_brand,device_model,device_screen_resolution,device_browser,geo_country,geo_city
0,9055434745589932991.1637753792.1637753792,2108382700.1637753791,2021-11-24,14:36:32,1,ZpYIoDJMcFzVoPFsHGJL,banner,LEoPHuyFvzoNfnzGgfcd,vCIpmpaGBnIQhyYNkXqp,puhZPIYqKXeFPaUviSjo,mobile,Android,Huawei,,360x720,Chrome,Russia,Zlatoust
1,905544597018549464.1636867290.1636867290,210838531.1636867288,2021-11-14,08:21:30,1,MvfHsxITijuriZxsqZqt,cpm,FTjNLDyTrXaWYgZymFkV,xhoenQgDQsgfEPYNPwKO,IGUCNvHlhfHpROGclCit,mobile,Android,Samsung,,385x854,Samsung Internet,Russia,Moscow
2,9055446045651783499.1640648526.1640648526,2108385331.1640648523,2021-12-28,02:42:06,1,ZpYIoDJMcFzVoPFsHGJL,banner,LEoPHuyFvzoNfnzGgfcd,vCIpmpaGBnIQhyYNkXqp,puhZPIYqKXeFPaUviSjo,mobile,Android,Huawei,,360x720,Chrome,Russia,Krasnoyarsk
3,9055447046360770272.1622255328.1622255328,2108385564.1622255328,2021-05-29,05:00:00,1,kjsLglQLzykiRbcDiGcD,cpc,,NOBKLgtuvqYWkXQHeYWM,,mobile,,Xiaomi,,393x786,Chrome,Russia,Moscow
4,9055447046360770272.1622255345.1622255345,2108385564.1622255328,2021-05-29,05:00:00,2,kjsLglQLzykiRbcDiGcD,cpc,,,,mobile,,Xiaomi,,393x786,Chrome,Russia,Moscow
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1860037,9055415581448263752.1640159305.1640159305,2108378238.1640159304,2021-12-22,10:48:25,1,BHcvLfOaCWvWTykYqHVe,cpc,,,VlqBmecIOXWjCWUmQkLd,desktop,Windows,,,1920x1080,Chrome,Russia,Moscow
1860038,9055421130527858185.1622007305.1622007305,2108379530.1622007305,2021-05-26,08:00:00,1,fDLlAcSmythWSCVMvqvL,(none),LTuZkdKfxRGVceoWkVyg,JNHcPlZPxEMWDnRiyoBf,,mobile,,Apple,,390x844,Safari,Russia,Stavropol
1860039,9055422955903931195.1636979515.1636979515,2108379955.1636979515,2021-11-15,15:31:55,1,fDLlAcSmythWSCVMvqvL,(none),LTuZkdKfxRGVceoWkVyg,JNHcPlZPxEMWDnRiyoBf,puhZPIYqKXeFPaUviSjo,mobile,iOS,Apple,,375x667,Safari,Russia,Moscow
1860040,905543020766873816.1638189404.1638189404,210838164.1638189272,2021-11-29,15:36:44,1,ZpYIoDJMcFzVoPFsHGJL,banner,LEoPHuyFvzoNfnzGgfcd,JNHcPlZPxEMWDnRiyoBf,puhZPIYqKXeFPaUviSjo,mobile,Android,Xiaomi,,393x851,Chrome,Russia,Chelyabinsk


Выводим заполненные значения

In [69]:
# Subset of rows in which device_model is present.
df_dm_fill = df_out_pkl.loc[df_out_pkl['device_model'].notna()]
df_dm_fill

Unnamed: 0,session_id,client_id,visit_date,visit_time,visit_number,utm_source,utm_medium,utm_campaign,utm_adcontent,utm_keyword,device_category,device_os,device_brand,device_model,device_screen_resolution,device_browser,geo_country,geo_city
165,9056202067269505745.1640092368.1640092368,2108561356.1640092369,2021-12-21,16:12:48,1,ZpYIoDJMcFzVoPFsHGJL,push,sbJRYgVfvcnqKJNDDYIr,JNHcPlZPxEMWDnRiyoBf,puhZPIYqKXeFPaUviSjo,mobile,Android,(not set),AuMdmADEIoPXiWpTsBEj,384x854,Chrome,Russia,Moscow
208,9056422519349747445.1638964982.1638964982,2108612684.1638964981,2021-12-08,15:03:02,1,MvfHsxITijuriZxsqZqt,cpm,FTjNLDyTrXaWYgZymFkV,xhoenQgDQsgfEPYNPwKO,RrhnkuoaqckNtJpAZDzH,mobile,Android,(not set),AuMdmADEIoPXiWpTsBEj,320x640,Chrome,Russia,Saint Petersburg
209,9056422519349747445.1638966940.1638966940,2108612684.1638964981,2021-12-08,15:35:40,2,MvfHsxITijuriZxsqZqt,cpm,FTjNLDyTrXaWYgZymFkV,xhoenQgDQsgfEPYNPwKO,RrhnkuoaqckNtJpAZDzH,mobile,Android,(not set),AuMdmADEIoPXiWpTsBEj,320x640,Chrome,Russia,Saint Petersburg
224,9056468728897608385.1635659079.1635659079,2108623443.1633688257,2021-10-31,08:44:39,3,ZpYIoDJMcFzVoPFsHGJL,banner,gecBYcKZCPMcVYdSSzKP,JNHcPlZPxEMWDnRiyoBf,puhZPIYqKXeFPaUviSjo,mobile,Android,(not set),AuMdmADEIoPXiWpTsBEj,360x820,Chrome,Russia,Rostov-on-Don
303,9056769488278983527.1635193702.1635193702,2108693469.1635193703,2021-10-25,23:28:22,1,MvfHsxITijuriZxsqZqt,cpm,FTjNLDyTrXaWYgZymFkV,xhoenQgDQsgfEPYNPwKO,RkpOyeEPgcMBSDuHLQcj,mobile,Android,(not set),AuMdmADEIoPXiWpTsBEj,320x640,Chrome,Russia,Moscow
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1859012,9051046216723789689.1635186553.1635186553,2107360916.1635186553,2021-10-25,21:29:13,1,MvfHsxITijuriZxsqZqt,cpm,FTjNLDyTrXaWYgZymFkV,xhoenQgDQsgfEPYNPwKO,EFePHapVShTKxBNclrhX,mobile,Android,(not set),AuMdmADEIoPXiWpTsBEj,320x570,Chrome,Russia,Kommunar
1859362,9052676375860711226.1639943995.1639943995,2107740467.1639943994,2021-12-19,22:59:55,1,ZpYIoDJMcFzVoPFsHGJL,banner,LEoPHuyFvzoNfnzGgfcd,vCIpmpaGBnIQhyYNkXqp,puhZPIYqKXeFPaUviSjo,mobile,Android,(not set),AuMdmADEIoPXiWpTsBEj,393x873,Chrome,Russia,Izhevsk
1859535,9053425229175288131.1636659526.1636659526,2107914823.1636659523,2021-11-11,22:38:46,1,faqsogjxCvbseFqupueU,banner,dZqEgyoxhtbeLFMtnnVR,JNHcPlZPxEMWDnRiyoBf,puhZPIYqKXeFPaUviSjo,mobile,Android,(not set),AuMdmADEIoPXiWpTsBEj,385x769,Chrome,Russia,Samara
1859612,9053730012943175107.1640761072.1640761072,2107985786.1640320451,2021-12-29,09:57:52,2,ZpYIoDJMcFzVoPFsHGJL,banner,LEoPHuyFvzoNfnzGgfcd,vCIpmpaGBnIQhyYNkXqp,puhZPIYqKXeFPaUviSjo,mobile,Android,(not set),AuMdmADEIoPXiWpTsBEj,360x760,Chrome,Russia,Vladivostok


In [70]:
# Report unique and most frequent values of the related device columns
# for the rows that have a device_model.
for related_column in ('device_category', 'device_brand', 'device_os', 'device_browser'):
    print_uniq_and_top(df_dm_fill, 'device_model', related_column)

Уникальные значения  device_category  для строк с заполненным  device_model : ['mobile' 'tablet' 'desktop']
Самое часто встречаемое значение  device_category  для строк с заполненным  device_model : mobile
Уникальные значения  device_brand  для строк с заполненным  device_model : ['(not set)']
Самое часто встречаемое значение  device_brand  для строк с заполненным  device_model : (not set)


Модель заполнена только в строках, где бренд не определён (device_brand = '(not set)'), и чаще всего это значение AuMdmADEIoPXiWpTsBEj. Оно встречается у устройств всех категорий, в том числе мобильных, поэтому заполним им пустые значения device_model.

In [71]:
# Put the most frequent model value into every row where device_model is missing.
df_out_pkl.loc[df_out_pkl['device_model'].isna(), 'device_model'] = top_device_model

In [72]:
print_missing_values(df_out_pkl)

Процент пропущенных значений:
utm_keyword                 58.174009
device_os                   57.533002
utm_adcontent               18.043410
utm_campaign                11.806346
device_brand                 6.380394
utm_source                   0.005215
geo_country                  0.000000
device_browser               0.000000
device_screen_resolution     0.000000
device_model                 0.000000
session_id                   0.000000
device_category              0.000000
client_id                    0.000000
utm_medium                   0.000000
visit_number                 0.000000
visit_time                   0.000000
visit_date                   0.000000
geo_city                     0.000000
dtype: float64


##### Обработка пропущенных значений utm_keyword

In [73]:
df_out_pkl['utm_keyword'].describe()

count                   777981
unique                    1219
top       puhZPIYqKXeFPaUviSjo
freq                    506819
Name: utm_keyword, dtype: object

Выводим строки с самым частым значением utm_keyword

In [74]:
# Most frequent utm_keyword and the rows that carry exactly that value.
top_key_word = df_out_pkl['utm_keyword'].describe().loc['top']
df_ukw_fill = df_out_pkl.loc[df_out_pkl['utm_keyword'] == top_key_word]
df_ukw_fill

Unnamed: 0,session_id,client_id,visit_date,visit_time,visit_number,utm_source,utm_medium,utm_campaign,utm_adcontent,utm_keyword,device_category,device_os,device_brand,device_model,device_screen_resolution,device_browser,geo_country,geo_city
0,9055434745589932991.1637753792.1637753792,2108382700.1637753791,2021-11-24,14:36:32,1,ZpYIoDJMcFzVoPFsHGJL,banner,LEoPHuyFvzoNfnzGgfcd,vCIpmpaGBnIQhyYNkXqp,puhZPIYqKXeFPaUviSjo,mobile,Android,Huawei,AuMdmADEIoPXiWpTsBEj,360x720,Chrome,Russia,Zlatoust
2,9055446045651783499.1640648526.1640648526,2108385331.1640648523,2021-12-28,02:42:06,1,ZpYIoDJMcFzVoPFsHGJL,banner,LEoPHuyFvzoNfnzGgfcd,vCIpmpaGBnIQhyYNkXqp,puhZPIYqKXeFPaUviSjo,mobile,Android,Huawei,AuMdmADEIoPXiWpTsBEj,360x720,Chrome,Russia,Krasnoyarsk
8,9055462349345527315.1638536723.1638536723,2108389127.1638536723,2021-12-03,16:05:23,1,ZpYIoDJMcFzVoPFsHGJL,banner,LEoPHuyFvzoNfnzGgfcd,vCIpmpaGBnIQhyYNkXqp,puhZPIYqKXeFPaUviSjo,mobile,iOS,Apple,AuMdmADEIoPXiWpTsBEj,390x844,Safari,Russia,Moscow
13,9055469620715506713.1635878177.1635878177,2108390820.1628883993,2021-11-02,21:36:17,3,gVRrcxiDQubJiljoTbGm,referral,LTuZkdKfxRGVceoWkVyg,JNHcPlZPxEMWDnRiyoBf,puhZPIYqKXeFPaUviSjo,mobile,Android,Samsung,AuMdmADEIoPXiWpTsBEj,412x869,Android Webview,Russia,Sochi
16,9055487268745225369.1637983385.1637983385,2108394929.1637983385,2021-11-27,06:23:05,1,ZpYIoDJMcFzVoPFsHGJL,banner,LEoPHuyFvzoNfnzGgfcd,vCIpmpaGBnIQhyYNkXqp,puhZPIYqKXeFPaUviSjo,mobile,iOS,Apple,AuMdmADEIoPXiWpTsBEj,320x568,Safari,Russia,Saint Petersburg
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1860026,9055355469082180480.1636350848.1636350848,2108364242.1636350848,2021-11-08,08:54:08,1,fDLlAcSmythWSCVMvqvL,(none),LTuZkdKfxRGVceoWkVyg,JNHcPlZPxEMWDnRiyoBf,puhZPIYqKXeFPaUviSjo,mobile,Android,Xiaomi,AuMdmADEIoPXiWpTsBEj,393x873,Chrome,Russia,Moscow
1860033,9055394342833425189.1638599463.1638599463,2108373293.1638599461,2021-12-04,09:31:03,1,ZpYIoDJMcFzVoPFsHGJL,banner,LEoPHuyFvzoNfnzGgfcd,vCIpmpaGBnIQhyYNkXqp,puhZPIYqKXeFPaUviSjo,mobile,Android,Samsung,AuMdmADEIoPXiWpTsBEj,384x854,Chrome,Russia,Birsk
1860036,9055401700113249881.1639446112.1639446112,2108375006.1639446105,2021-12-14,04:41:52,1,ZpYIoDJMcFzVoPFsHGJL,banner,TmThBvoCcwkCZZUWACYq,JNHcPlZPxEMWDnRiyoBf,puhZPIYqKXeFPaUviSjo,mobile,Android,ZTE,AuMdmADEIoPXiWpTsBEj,360x640,Chrome,Russia,Blagoveshchensk
1860039,9055422955903931195.1636979515.1636979515,2108379955.1636979515,2021-11-15,15:31:55,1,fDLlAcSmythWSCVMvqvL,(none),LTuZkdKfxRGVceoWkVyg,JNHcPlZPxEMWDnRiyoBf,puhZPIYqKXeFPaUviSjo,mobile,iOS,Apple,AuMdmADEIoPXiWpTsBEj,375x667,Safari,Russia,Moscow


Вывод пустых значений utm_keyword

In [75]:
df_out_pkl[df_out_pkl['utm_keyword'].isna()]

Unnamed: 0,session_id,client_id,visit_date,visit_time,visit_number,utm_source,utm_medium,utm_campaign,utm_adcontent,utm_keyword,device_category,device_os,device_brand,device_model,device_screen_resolution,device_browser,geo_country,geo_city
3,9055447046360770272.1622255328.1622255328,2108385564.1622255328,2021-05-29,05:00:00,1,kjsLglQLzykiRbcDiGcD,cpc,,NOBKLgtuvqYWkXQHeYWM,,mobile,,Xiaomi,AuMdmADEIoPXiWpTsBEj,393x786,Chrome,Russia,Moscow
4,9055447046360770272.1622255345.1622255345,2108385564.1622255328,2021-05-29,05:00:00,2,kjsLglQLzykiRbcDiGcD,cpc,,,,mobile,,Xiaomi,AuMdmADEIoPXiWpTsBEj,393x786,Chrome,Russia,Moscow
5,9055447192389856083.1622453074.1622453074,2108385598.1622453075,2021-05-31,12:00:00,1,kjsLglQLzykiRbcDiGcD,organic,LTuZkdKfxRGVceoWkVyg,JNHcPlZPxEMWDnRiyoBf,,mobile,,Apple,AuMdmADEIoPXiWpTsBEj,375x812,Safari,Russia,Saint Petersburg
7,9055461992850812764.1626107740.1626107740,2108389044.1626107740,2021-07-12,19:00:00,1,ZpYIoDJMcFzVoPFsHGJL,banner,LEoPHuyFvzoNfnzGgfcd,JNHcPlZPxEMWDnRiyoBf,,mobile,,Samsung,AuMdmADEIoPXiWpTsBEj,360x640,Chrome,Russia,Saint Petersburg
9,9055466554104774132.1624800757.1624800757,2108390106.1624800756,2021-06-27,16:00:00,1,ZpYIoDJMcFzVoPFsHGJL,banner,LEoPHuyFvzoNfnzGgfcd,JNHcPlZPxEMWDnRiyoBf,,mobile,,Samsung,AuMdmADEIoPXiWpTsBEj,412x915,Chrome,Russia,Moscow
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1860030,9055382948278467242.1631877802.1631877802,2108370640.1631877802,2021-09-17,14:00:00,1,MvfHsxITijuriZxsqZqt,cpm,FTjNLDyTrXaWYgZymFkV,PkybGvWbaqORmxjNunqZ,,mobile,,Xiaomi,AuMdmADEIoPXiWpTsBEj,393x851,Chrome,Russia,Saint Petersburg
1860032,9055394269810294140.1629912447.1629912447,2108373276.1629912444,2021-08-25,20:00:00,1,bByPQxmDaMXgpHeypKSM,referral,LTuZkdKfxRGVceoWkVyg,JNHcPlZPxEMWDnRiyoBf,,mobile,,Samsung,AuMdmADEIoPXiWpTsBEj,360x800,Android Webview,Russia,Saint Petersburg
1860034,9055397194683347295.1630237022.1630237022,2108373957.1630237023,2021-08-29,14:00:00,1,ISrKoXQCxqqYvAZICvjs,blogger_stories,zfwIehuEfWYdYrEZgRLo,JNHcPlZPxEMWDnRiyoBf,,mobile,,Apple,AuMdmADEIoPXiWpTsBEj,414x896,Safari,Russia,Zheleznodorozhny
1860035,9055398929844789828.1624891784.1624891784,2108374361.1624891972,2021-06-28,17:00:00,1,kjsLglQLzykiRbcDiGcD,cpc,vXsFkagGabkcWKlgLzSg,,,mobile,,Samsung,AuMdmADEIoPXiWpTsBEj,320x676,Chrome,Russia,Naro-Fominsk


In [76]:
print_uniq_and_top(df_ukw_fill, 'utm_keyword', 'utm_adcontent')

Уникальные значения  utm_adcontent  для строк с заполненным  utm_keyword : ['vCIpmpaGBnIQhyYNkXqp' 'JNHcPlZPxEMWDnRiyoBf' 'DZYjhfIUfdqhfuTNUmjn'
 'FkiRXDLOWtzVfvhEkhNo' 'ESUnXCsdWADovskBLvBO' 'SOkCdPxfUcZUzzOdgGES'
 'SAVVWaMghGnnvPOqMOIt' 'LxluDbGsLnaemhTtGuvB' nan 'qhEmhjPXvwgEHdBikgEQ'
 'EteMoEECGsaJeMnuvAZD' 'fxKLUhFToKQtGIyvjZXQ' 'LBoFGHDbSeBOgvTnNlmS'
 'DaehHXyBdjcdSRnPiAQn' 'nNqUcgFgcqQbTVSvgaHr' 'IyvBPOpVqcFCBjbgvbvx'
 'rqvMZiqGRTZpxvRSUTzX' 'xZYEHLyYdGXkJENJpTtu' 'nsxJgFVqhmchGMaUusie'
 'guyNoEvzgofQvvwExGOq' 'TGEEFuUxpSnXADfXkUsH' 'ZIMljraejFHmkkHvoNxk'
 'WAeycgIqKXoOMXPzDUDX' 'uovjRGXgBwVqoPWweONb' 'qukbsiXCRCiIMciUjStT'
 'SitoRrEOjouuWzzGooUa' 'imVqaQNUOBSidkTeZIuJ' 'FXpnPQVvfePoCAKRMpRV'
 'GpVVpqYEqQSmYZrOPfSZ' 'JKvVAMEfeoNMSMFzAAfE' 'OCpaogbJxWDpWXDqfzPq'
 'ODLcVlzKCdodjHQNkIoL' 'vVNkKrQAgRGFuwKyFazn' 'KkmURVtLRkaVcbcQEdxf'
 'AIONnJpjXjEluFHEjOyg' 'FTbuAcijicBuwxycfGPB' 'NnaLweGiAhdtkuktFipk'
 'HhkEQvDgekUQWMWUpfqW' 'JJRVNKFvKSInZxhrcjHK' 'GVUsmTlgLwSaIkbKDGtP'
 'NacUSAyeX

In [77]:
print_uniq_and_top(df_ukw_fill, 'utm_keyword', 'utm_campaign')

Уникальные значения  utm_campaign  для строк с заполненным  utm_keyword : ['LEoPHuyFvzoNfnzGgfcd' 'LTuZkdKfxRGVceoWkVyg' 'gecBYcKZCPMcVYdSSzKP'
 'zxoiLxhuSIFrCeTLQVWZ' 'TmThBvoCcwkCZZUWACYq' nan 'foFTSdUvNqqkPzZvgiqt'
 'bxOTvPtyGSdUrbwoXCPO' 'dZqEgyoxhtbeLFMtnnVR' 'UvuMsOSDBWQGOIbDbXfV'
 'sbJRYgVfvcnqKJNDDYIr' 'XGYOaJEasWTwAKNdCGVX' 'QdLfySaGXolfTBSNVfHn'
 'ascPqxFuFewWWZSVMpkh' 'SgIUDYUKnyWHVowUOqid' 'kVOrIKZFrEYGvixPclal'
 'jqlUOdZBNZYfInQVcZlS' 'LUlvACDKkkOkiSuiwaBs' 'vFcAhRxLfOWKhvxjELkx'
 'bgTYkDHjOsJzMUtoGhiQ' 'DZlFqIVHUBIDaQoarvIZ' 'WlbWUObZWvsimzdFdLYw'
 'kwdmElMUPDZaLQdgjcsI' 'PTQlxxEuqjyfVHcNKQQW' 'KgicpPxiEQfzPlPwQZJq'
 'nmfptFmSirEqNzAzqbXA' 'TIRYvHSoLonAvRZefPmz' 'KCcrgoFqYxCpSjdRyJjZ'
 'VBmazutCflYumtDHrQYe' 'EiQppLFrUZrUsjXVulLg' 'tnAqgCNATsNXcJwptHrh'
 'RhRtRKaMduWUvXxkhSyj' 'QOFLjxQSwjjdcmUyBzfz' 'nGFPxtyrBsOYBtJhrWEk'
 'UUEGZcMsxvxRpygjNVWe' 'ESphyUeLTPINiYALHWrO' 'iYBYglGljMDRQyqHRiPH'
 'YCKgTzTDywjcWyQudGch' 'UjApcvnaHtkydRkrLYuv' 'YDIkQmcjRkpdxGbLLtNN'
 'TxKUcPpthB

In [78]:
print_uniq_and_top(df_ukw_fill, 'utm_keyword', 'utm_medium')

Уникальные значения  utm_medium  для строк с заполненным  utm_keyword : ['banner' 'referral' '(none)' 'smm' 'blogger_channel' 'email' 'app'
 'vk_smm' 'push' 'partner' 'cpm' 'cpc' 'smartbanner' 'organic' 'outlook'
 'clicks' 'landing' 'blogger_stories' 'post' 'tg' 'qr' '(not set)'
 'fb_smm' 'nkp' 'google_cpc' 'blogger_header' 'users_msk' 'ok_smm'
 'Sbol_catalog' 'sms' 'landing_interests' 'yandex_cpc' 'web_polka'
 'linktest' 'cpa' 'medium' 'promo_sbol' 'desktop' 'dom_click' 'main_polka'
 'link' 'stories' 'promo_sber']
Самое часто встречаемое значение  utm_medium  для строк с заполненным  utm_keyword : banner


In [79]:
print_uniq_and_top(df_ukw_fill, 'utm_keyword', 'utm_source')

Уникальные значения  utm_source  для строк с заполненным  utm_keyword : ['ZpYIoDJMcFzVoPFsHGJL' 'gVRrcxiDQubJiljoTbGm' 'fDLlAcSmythWSCVMvqvL'
 'ISrKoXQCxqqYvAZICvjs' 'IZEXUFLARCUMynmHNBGo' 'jaSOmLICuBzCFqHfBdRg'
 'bByPQxmDaMXgpHeypKSM' 'nSReTmyFtbSjlPrTKoaX' 'dGlVSdmIlgWDyOPjfwwy'
 'GpAkIXsclxDGyILfNlrR' 'oZCzWSykfixnjMPDNjSU' 'dyicZQGoeASogoSafjEh'
 'QxAxdyPLuQMEcrdZWdWb' 'aXQzDWsJuGXeBXexNHjc' 'RmEBuqrriAfAVsLQQmhk'
 'kjsLglQLzykiRbcDiGcD' 'eimRuUrNhZLAYcwRrNXu' 'BHcvLfOaCWvWTykYqHVe'
 'YlsczTIyBSwTLNtuDkCd' 'KgicpPxiEQfzPlPwQZJq' 'DnEUulZAecfGPvdtZBYS'
 'GmILPdZyuAVJCPsUBHeN' 'XiUifkjKLLnomcDRhswp' 'YclHumxPxSxgzHfvCaeF'
 'oCqKpnSZJeYOVZTgTmKR' 'iNFgfQPqHPBuvGCYtrQE' 'ngkgBNjlzLYBofkljaBo'
 'faqsogjxCvbseFqupueU' 'nrKihqcWGIzDsOqljdAv' 'xEbgdGZJlqXAaRmeJQdW'
 'LlBOVIARRTjfgnQNjJre' 'TTtiRKFZIaQpIWggfCoF' 'hTjLvqNxGggkGnxSCaTm'
 'LigOnUObPodLDexszDtn' 'MvfHsxITijuriZxsqZqt' 'IRGUHqwEMepMjgCYBVRn'
 'QzPMrfYhYSLYYPtPaBxI' 'vNNYHvZtTVtJICHsjBBL' 'CFeqZLBNQdYHxJrTOHjY'
 'zwpKjjsMoRVCdipn

In [80]:
# utm_keyword summary for sessions whose utm_medium is not 'banner'
# ('banner' is the most frequent medium in the rows inspected above).
df_out_pkl.loc[df_out_pkl['utm_medium'] != 'banner', 'utm_keyword'].describe()

count                   457403
unique                    1219
top       puhZPIYqKXeFPaUviSjo
freq                    186241
Name: utm_keyword, dtype: object

In [81]:
# utm_keyword summary for sessions with any utm_source other than 'ZpYIoDJMcFzVoPFsHGJL'
# (presumably the most frequent source; the recorded output above is truncated — confirm).
df_out_pkl.loc[df_out_pkl['utm_source'] != 'ZpYIoDJMcFzVoPFsHGJL', 'utm_keyword'].describe()

count                   430706
unique                    1219
top       puhZPIYqKXeFPaUviSjo
freq                    159544
Name: utm_keyword, dtype: object

In [82]:
# utm_keyword summary for sessions with any utm_campaign other than 'LEoPHuyFvzoNfnzGgfcd'
# (presumably the most frequent campaign; the recorded output above is truncated — confirm).
df_out_pkl.loc[df_out_pkl['utm_campaign'] != 'LEoPHuyFvzoNfnzGgfcd', 'utm_keyword'].describe()

count                   546808
unique                    1219
top       puhZPIYqKXeFPaUviSjo
freq                    275646
Name: utm_keyword, dtype: object

Таким образом, от связанных колонок самое часто встречаемое значение не зависит, что позволяет заполнить им пустые значения utm_keyword

In [83]:
# Put the most frequent keyword into every row where utm_keyword is missing.
df_out_pkl.loc[df_out_pkl['utm_keyword'].isna(), 'utm_keyword'] = top_key_word

In [84]:
print_missing_values(df_out_pkl)

Процент пропущенных значений:
device_os                   57.533002
utm_adcontent               18.043410
utm_campaign                11.806346
device_brand                 6.380394
utm_source                   0.005215
session_id                   0.000000
geo_country                  0.000000
device_browser               0.000000
device_screen_resolution     0.000000
device_model                 0.000000
utm_keyword                  0.000000
device_category              0.000000
client_id                    0.000000
utm_medium                   0.000000
visit_number                 0.000000
visit_time                   0.000000
visit_date                   0.000000
geo_city                     0.000000
dtype: float64


##### Обработка пропущенных значений device_os

In [85]:
df_out_pkl['device_os'].describe()

count      789904
unique         13
top       Android
freq       464054
Name: device_os, dtype: object

In [129]:
# Most frequent device_os, plus the subset of rows in which device_os is known.
top_device_os = df_out_pkl['device_os'].describe().loc['top']
df_do_fill = df_out_pkl.loc[df_out_pkl['device_os'].notna()]
df_do_fill

Unnamed: 0,session_id,client_id,visit_date,visit_time,visit_number,utm_source,utm_medium,utm_campaign,utm_adcontent,utm_keyword,device_category,device_os,device_brand,device_model,device_screen_resolution,device_browser,geo_country,geo_city
0,9055434745589932991.1637753792.1637753792,2108382700.1637753791,2021-11-24,14:36:32,1,ZpYIoDJMcFzVoPFsHGJL,banner,LEoPHuyFvzoNfnzGgfcd,vCIpmpaGBnIQhyYNkXqp,puhZPIYqKXeFPaUviSjo,mobile,Android,Huawei,AuMdmADEIoPXiWpTsBEj,360x720,Chrome,Russia,Zlatoust
1,905544597018549464.1636867290.1636867290,210838531.1636867288,2021-11-14,08:21:30,1,MvfHsxITijuriZxsqZqt,cpm,FTjNLDyTrXaWYgZymFkV,xhoenQgDQsgfEPYNPwKO,IGUCNvHlhfHpROGclCit,mobile,Android,Samsung,AuMdmADEIoPXiWpTsBEj,385x854,Samsung Internet,Russia,Moscow
2,9055446045651783499.1640648526.1640648526,2108385331.1640648523,2021-12-28,02:42:06,1,ZpYIoDJMcFzVoPFsHGJL,banner,LEoPHuyFvzoNfnzGgfcd,vCIpmpaGBnIQhyYNkXqp,puhZPIYqKXeFPaUviSjo,mobile,Android,Huawei,AuMdmADEIoPXiWpTsBEj,360x720,Chrome,Russia,Krasnoyarsk
3,9055447046360770272.1622255328.1622255328,2108385564.1622255328,2021-05-29,05:00:00,1,kjsLglQLzykiRbcDiGcD,cpc,,NOBKLgtuvqYWkXQHeYWM,puhZPIYqKXeFPaUviSjo,mobile,Android,Xiaomi,AuMdmADEIoPXiWpTsBEj,393x786,Chrome,Russia,Moscow
4,9055447046360770272.1622255345.1622255345,2108385564.1622255328,2021-05-29,05:00:00,2,kjsLglQLzykiRbcDiGcD,cpc,,,puhZPIYqKXeFPaUviSjo,mobile,Android,Xiaomi,AuMdmADEIoPXiWpTsBEj,393x786,Chrome,Russia,Moscow
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1860036,9055401700113249881.1639446112.1639446112,2108375006.1639446105,2021-12-14,04:41:52,1,ZpYIoDJMcFzVoPFsHGJL,banner,TmThBvoCcwkCZZUWACYq,JNHcPlZPxEMWDnRiyoBf,puhZPIYqKXeFPaUviSjo,mobile,Android,ZTE,AuMdmADEIoPXiWpTsBEj,360x640,Chrome,Russia,Blagoveshchensk
1860037,9055415581448263752.1640159305.1640159305,2108378238.1640159304,2021-12-22,10:48:25,1,BHcvLfOaCWvWTykYqHVe,cpc,,,VlqBmecIOXWjCWUmQkLd,desktop,Windows,,AuMdmADEIoPXiWpTsBEj,1920x1080,Chrome,Russia,Moscow
1860039,9055422955903931195.1636979515.1636979515,2108379955.1636979515,2021-11-15,15:31:55,1,fDLlAcSmythWSCVMvqvL,(none),LTuZkdKfxRGVceoWkVyg,JNHcPlZPxEMWDnRiyoBf,puhZPIYqKXeFPaUviSjo,mobile,iOS,Apple,AuMdmADEIoPXiWpTsBEj,375x667,Safari,Russia,Moscow
1860040,905543020766873816.1638189404.1638189404,210838164.1638189272,2021-11-29,15:36:44,1,ZpYIoDJMcFzVoPFsHGJL,banner,LEoPHuyFvzoNfnzGgfcd,JNHcPlZPxEMWDnRiyoBf,puhZPIYqKXeFPaUviSjo,mobile,Android,Xiaomi,AuMdmADEIoPXiWpTsBEj,393x851,Chrome,Russia,Chelyabinsk


In [90]:
df_out_pkl[df_out_pkl['device_os'].isna()]

Unnamed: 0,session_id,client_id,visit_date,visit_time,visit_number,utm_source,utm_medium,utm_campaign,utm_adcontent,utm_keyword,device_category,device_os,device_brand,device_model,device_screen_resolution,device_browser,geo_country,geo_city
3,9055447046360770272.1622255328.1622255328,2108385564.1622255328,2021-05-29,05:00:00,1,kjsLglQLzykiRbcDiGcD,cpc,,NOBKLgtuvqYWkXQHeYWM,puhZPIYqKXeFPaUviSjo,mobile,,Xiaomi,AuMdmADEIoPXiWpTsBEj,393x786,Chrome,Russia,Moscow
4,9055447046360770272.1622255345.1622255345,2108385564.1622255328,2021-05-29,05:00:00,2,kjsLglQLzykiRbcDiGcD,cpc,,,puhZPIYqKXeFPaUviSjo,mobile,,Xiaomi,AuMdmADEIoPXiWpTsBEj,393x786,Chrome,Russia,Moscow
5,9055447192389856083.1622453074.1622453074,2108385598.1622453075,2021-05-31,12:00:00,1,kjsLglQLzykiRbcDiGcD,organic,LTuZkdKfxRGVceoWkVyg,JNHcPlZPxEMWDnRiyoBf,puhZPIYqKXeFPaUviSjo,mobile,,Apple,AuMdmADEIoPXiWpTsBEj,375x812,Safari,Russia,Saint Petersburg
7,9055461992850812764.1626107740.1626107740,2108389044.1626107740,2021-07-12,19:00:00,1,ZpYIoDJMcFzVoPFsHGJL,banner,LEoPHuyFvzoNfnzGgfcd,JNHcPlZPxEMWDnRiyoBf,puhZPIYqKXeFPaUviSjo,mobile,,Samsung,AuMdmADEIoPXiWpTsBEj,360x640,Chrome,Russia,Saint Petersburg
9,9055466554104774132.1624800757.1624800757,2108390106.1624800756,2021-06-27,16:00:00,1,ZpYIoDJMcFzVoPFsHGJL,banner,LEoPHuyFvzoNfnzGgfcd,JNHcPlZPxEMWDnRiyoBf,puhZPIYqKXeFPaUviSjo,mobile,,Samsung,AuMdmADEIoPXiWpTsBEj,412x915,Chrome,Russia,Moscow
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1860030,9055382948278467242.1631877802.1631877802,2108370640.1631877802,2021-09-17,14:00:00,1,MvfHsxITijuriZxsqZqt,cpm,FTjNLDyTrXaWYgZymFkV,PkybGvWbaqORmxjNunqZ,puhZPIYqKXeFPaUviSjo,mobile,,Xiaomi,AuMdmADEIoPXiWpTsBEj,393x851,Chrome,Russia,Saint Petersburg
1860032,9055394269810294140.1629912447.1629912447,2108373276.1629912444,2021-08-25,20:00:00,1,bByPQxmDaMXgpHeypKSM,referral,LTuZkdKfxRGVceoWkVyg,JNHcPlZPxEMWDnRiyoBf,puhZPIYqKXeFPaUviSjo,mobile,,Samsung,AuMdmADEIoPXiWpTsBEj,360x800,Android Webview,Russia,Saint Petersburg
1860034,9055397194683347295.1630237022.1630237022,2108373957.1630237023,2021-08-29,14:00:00,1,ISrKoXQCxqqYvAZICvjs,blogger_stories,zfwIehuEfWYdYrEZgRLo,JNHcPlZPxEMWDnRiyoBf,puhZPIYqKXeFPaUviSjo,mobile,,Apple,AuMdmADEIoPXiWpTsBEj,414x896,Safari,Russia,Zheleznodorozhny
1860035,9055398929844789828.1624891784.1624891784,2108374361.1624891972,2021-06-28,17:00:00,1,kjsLglQLzykiRbcDiGcD,cpc,vXsFkagGabkcWKlgLzSg,,puhZPIYqKXeFPaUviSjo,mobile,,Samsung,AuMdmADEIoPXiWpTsBEj,320x676,Chrome,Russia,Naro-Fominsk


Во многих случаях device_os зависит от device_brand, и в этих записях мы точно можем заполнить отсутствующие значения

In [87]:
df_out_pkl.loc[(df_out_pkl['device_brand'] == 'Apple'), 'device_os'].describe()

count     207098
unique         1
top          iOS
freq      207098
Name: device_os, dtype: object

In [88]:
df_out_pkl.loc[(df_out_pkl['device_brand'] == 'Apple'), 'device_os'].unique()

array([None, 'iOS'], dtype=object)

In [39]:
df_out_pkl.loc[(df_out_pkl['device_brand'] == 'Apple'), 'device_category'].unique()

array(['mobile', 'tablet', 'desktop'], dtype=object)

In [44]:
# Apple phones and tablets with an unknown OS are labelled iOS; desktop rows are
# left out of the mask on purpose (Apple also appears with the desktop category).
df_out_pkl.loc[
    (df_out_pkl['device_brand'] == 'Apple')
    & df_out_pkl['device_category'].isin(['mobile', 'tablet'])
    & df_out_pkl['device_os'].isna(),
    'device_os',
] = 'iOS'

In [47]:
df_out_pkl.loc[(df_out_pkl['device_brand'] != 'Apple'), 'device_os'].unique()

array(['Android', None, 'Windows', 'Linux', 'Macintosh', '(not set)',
       'Chrome OS', 'iOS', 'BlackBerry', 'Tizen', 'Firefox OS', 'Samsung',
       'Windows Phone', 'Nokia'], dtype=object)

In [95]:
# Distinct device_os values among the non-Apple rows of df_do_fill.
# NOTE(review): the recorded output lists only 'Android', although non-Apple rows of
# df_out_pkl also carry Windows, Linux and others. With df_do_fill defined as "all rows
# with a known OS" this result would not be reproduced; df_do_fill was presumably built
# with a different filter when this cell ran. Confirm with Restart & Run All.
df_do_fill.loc[(df_do_fill['device_brand'] != 'Apple'), 'device_os'].unique()

array(['Android'], dtype=object)

In [106]:
# Non-Apple brands that were actually observed with the Android OS.
# The OS condition is explicit so the list does not depend on how df_do_fill was
# filtered: without it, every non-Apple brand with any known OS (Windows Phone,
# Tizen, ...) would end up in the list and later be labelled as Android.
list_brands = df_do_fill.loc[
    (df_do_fill['device_os'] == 'Android') & (df_do_fill['device_brand'] != 'Apple'),
    'device_brand',
].unique()
list_brands

array(['Huawei', 'Samsung', 'Lenovo', 'Xiaomi', 'Meizu', 'OnePlus',
       'Realme', 'OPPO', '(not set)', 'Philips', 'Vivo', 'Nokia',
       'Alcatel', 'LG', 'BQ', 'Tecno', 'Asus', 'itel', 'Infinix', 'ZTE',
       'Wiko', 'Google', 'Sony', 'Wileyfox', 'Blackview', 'Cubot',
       'DOOGEE', 'DEXP', 'Motorola', 'TP-Link', 'Hisense', 'Acer',
       'Oukitel', 'LeEco', 'Prestigio', 'POCO', 'Vsmart', 'HTC',
       'Ulefone', 'CAT', 'Leagoo', 'InFocus', 'Inoi', 'BlackBerry',
       'Micromax', 'Umidigi', 'Sharp', 'Jiake', 'ZOJI', 'Yuntab',
       'Mozilla', 'Neffos', 'Highscreen', 'Karbonn', 'TCL', 'BLU',
       'Haier', 'Vertex', 'Coolpad', 'HOMTOM', 'LeTV', 'A1',
       'General Mobile', 'Gome', 'Egreat', 'Mito', 'SenseIT', 'Archos',
       'Keecoo', 'Vernee', 'Panasonic', 'InnJoo', 'Iris', 'Black Fox',
       'Lava', 'myPhone', 'Nomu', 'AGM', 'Nuu', 'UGOOS', 'Alldocube',
       'MTC', 'Komu', 'Qbex', 'Symphony', 'Wigor', 'Oysters', 'Fly',
       'Gionee', 'Artel', 'Ananda', 'Smartisan', '

In [103]:
# Phones and tablets of the brands in list_brands with an unknown OS are labelled Android.
df_out_pkl.loc[
    df_out_pkl['device_brand'].isin(list_brands)
    & df_out_pkl['device_category'].isin(['mobile', 'tablet'])
    & df_out_pkl['device_os'].isna(),
    'device_os',
] = 'Android'

In [117]:
df_out_pkl.loc[(df_out_pkl['device_category'] == 'desktop'), 'device_os'].unique()

array(['Windows', None, 'Linux', 'Macintosh', '(not set)', 'iOS',
       'Chrome OS', 'Android', 'Tizen'], dtype=object)

In [108]:
df_out_pkl.loc[(df_out_pkl['device_category'] == 'desktop'), 'device_os'].describe()

count      118640
unique          8
top       Windows
freq        88291
Name: device_os, dtype: object

In [118]:
df_out_pkl.loc[((df_out_pkl['device_category'] == 'desktop') & (df_out_pkl['device_os'].isna())), 'device_brand'].unique()

array(['', 'Xiaomi', 'Samsung', 'Huawei', 'Nokia', '(not set)', 'Asus',
       'Beelink', 'OPPO', 'OnePlus', 'Philips', 'Realme'], dtype=object)

In [122]:
# Desktop sessions with an empty brand, the Safari browser and an unknown OS are
# labelled Macintosh. The desktop condition is explicit because the empty-brand
# evidence was collected on desktop rows only; a non-desktop Safari session
# should not be turned into a Macintosh.
df_out_pkl.loc[
    (df_out_pkl['device_category'] == 'desktop')
    & (df_out_pkl['device_brand'] == '')
    & (df_out_pkl['device_browser'] == 'Safari')
    & df_out_pkl['device_os'].isna(),
    'device_os',
] = 'Macintosh'

In [126]:
df_out_pkl.loc[(df_out_pkl['device_brand'] == '(not set)'), 'device_os'].unique()

array(['Android', 'iOS', '(not set)', 'Tizen', None, 'Windows Phone'],
      dtype=object)

In [128]:
df_out_pkl.loc[((df_out_pkl['device_brand'] == '(not set)') & (df_out_pkl['device_os'].isna())), 'device_browser'].unique()

array(['Samsung Internet', 'Chrome'], dtype=object)

In [123]:
print_missing_values(df_out_pkl)

Процент пропущенных значений:
device_os                   30.382701
utm_adcontent               18.043410
utm_campaign                11.806346
device_brand                 6.380394
utm_source                   0.005215
session_id                   0.000000
geo_country                  0.000000
device_browser               0.000000
device_screen_resolution     0.000000
device_model                 0.000000
utm_keyword                  0.000000
device_category              0.000000
client_id                    0.000000
utm_medium                   0.000000
visit_number                 0.000000
visit_time                   0.000000
visit_date                   0.000000
geo_city                     0.000000
dtype: float64
