## **Обработка данных**

In [1]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import pickle

from matplotlib.ticker import FormatStrFormatter
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import accuracy_score, multilabel_confusion_matrix, mean_absolute_error
from sklearn.model_selection import train_test_split, cross_validate, cross_val_score 
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier



### *Функции*

In [2]:
# выводит статистику по заполненности набора данных
def print_stats(df):
    print(f"Количество полностью заполненных объектов: {len(df.dropna())}")
    print(f"Процент полностью заполненных объектов: {round(len(df.dropna()) / (len(df) / 100), 2)}")

In [3]:
# выводит процент пропущенных значений
def print_missing_values(df):
    percent_missing = (df.isna().sum() / (len(df) / 100)).sort_values(ascending=False)
    print(f"Процент пропущенных значений:\n{percent_missing}")

In [4]:
# выводит уникальные и часто встречаемые значения для группы колонок
def print_uniq_and_top(df, filled_column, bind_column):
    print("Уникальные значения ", bind_column, " для строк с заполненным ", filled_column, ":", df[bind_column].unique())
    print("Самое часто встречаемое значение ", bind_column, " для строк с заполненным ", filled_column, ":", df[bind_column].describe()['top'])

In [5]:
# принимает датафрейм df с filling_col, где есть пропуски, и filled_col, по которой эти пропуски будут заполняться соответствующим самым часто встречаемым значением среди строк с заполенной filling_col
def change_nans(df, filled_col, filling_col):
    df_fill = df[~(df[filling_col].isna())]
    list_cols = list(df_fill[filled_col].unique())
    for col in list_cols:
        df.loc[((df[filled_col].isin(list_cols)) & (df[filling_col].isna())), filling_col] = df_fill[df_fill[filled_col] == col][filling_col].describe()['top']
    return df

### Data Preparation

In [6]:
df_out_pkl = 0
with open("data/ga_sessions.pkl", 'rb') as f:
    df_out_pkl = pickle.load(f)

In [7]:
print_stats(df_out_pkl)

Количество полностью заполненных объектов: 14940
Процент полностью заполненных объектов: 0.8


In [8]:
print_missing_values(df_out_pkl)

Процент пропущенных значений:
device_model                99.121633
utm_keyword                 58.174009
device_os                   57.533002
utm_adcontent               18.043410
utm_campaign                11.806346
device_brand                 6.380394
utm_source                   0.005215
geo_country                  0.000000
device_browser               0.000000
device_screen_resolution     0.000000
session_id                   0.000000
device_category              0.000000
client_id                    0.000000
utm_medium                   0.000000
visit_number                 0.000000
visit_time                   0.000000
visit_date                   0.000000
geo_city                     0.000000
dtype: float64


##### Обработка пропусков utm_source

In [9]:
df_out_pkl.loc[(df_out_pkl['utm_source'].isna())].shape

(97, 18)

In [10]:
df_out_pkl.loc[(df_out_pkl['utm_source'].isna())]

Unnamed: 0,session_id,client_id,visit_date,visit_time,visit_number,utm_source,utm_medium,utm_campaign,utm_adcontent,utm_keyword,device_category,device_os,device_brand,device_model,device_screen_resolution,device_browser,geo_country,geo_city
77652,1100240274910044288.1640728207.1640728207,256169651.1637310592,2021-12-29,00:50:07,2,,(not set),LTuZkdKfxRGVceoWkVyg,JNHcPlZPxEMWDnRiyoBf,puhZPIYqKXeFPaUviSjo,mobile,Android,OnePlus,,412x892,Chrome,Russia,Saint Petersburg
122841,1303431342277938317.1637250145.1637250145,303478758.1637239949,2021-11-18,18:42:25,3,,Sbol_catalog,cccMlyVfjXspfaCSrMsO,JNHcPlZPxEMWDnRiyoBf,puhZPIYqKXeFPaUviSjo,mobile,iOS,Apple,,375x667,Safari,Russia,Balashikha
136220,1364914954463538089.1637595059.1637595059,317794027.1634397097,2021-11-22,18:30:59,8,,(not set),LTuZkdKfxRGVceoWkVyg,JNHcPlZPxEMWDnRiyoBf,puhZPIYqKXeFPaUviSjo,mobile,Android,Xiaomi,,393x851,YaBrowser,Russia,Moscow
136221,1364914954463538089.1638478637.1638478637,317794027.1634397097,2021-12-02,23:57:17,9,,(not set),LTuZkdKfxRGVceoWkVyg,JNHcPlZPxEMWDnRiyoBf,puhZPIYqKXeFPaUviSjo,mobile,Android,Xiaomi,,393x851,YaBrowser,Russia,Moscow
139706,1380043242326371104.1637518268.1637518268,321316356.1636477728,2021-11-21,21:11:08,4,,(not set),LTuZkdKfxRGVceoWkVyg,JNHcPlZPxEMWDnRiyoBf,puhZPIYqKXeFPaUviSjo,mobile,Android,Samsung,,360x740,Chrome,Russia,Izhevsk
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1629120,8025307339522195714.1638678933.1638678933,1868537473.1636712706,2021-12-05,07:35:33,13,,(not set),LTuZkdKfxRGVceoWkVyg,JNHcPlZPxEMWDnRiyoBf,puhZPIYqKXeFPaUviSjo,mobile,Android,Huawei,,360x780,Chrome,Russia,Sochi
1629121,8025307339522195714.1638688249.1638688249,1868537473.1636712706,2021-12-05,10:10:49,14,,(not set),LTuZkdKfxRGVceoWkVyg,JNHcPlZPxEMWDnRiyoBf,puhZPIYqKXeFPaUviSjo,mobile,Android,Huawei,,360x780,Chrome,Russia,Krasnodar
1629122,8025307339522195714.1638954236.1638954236,1868537473.1636712706,2021-12-08,12:03:56,15,,(not set),LTuZkdKfxRGVceoWkVyg,JNHcPlZPxEMWDnRiyoBf,puhZPIYqKXeFPaUviSjo,mobile,Android,Huawei,,360x780,Chrome,Ukraine,Pereval's'k
1732460,848873006059532776.1637459623.1637459623,197643648.1637396968,2021-11-21,04:53:43,2,,(not set),LTuZkdKfxRGVceoWkVyg,JNHcPlZPxEMWDnRiyoBf,puhZPIYqKXeFPaUviSjo,mobile,Android,Xiaomi,,360x800,Chrome,Russia,Novosibirsk


In [11]:
df_out_pkl.loc[(df_out_pkl['utm_source'].isna())]['utm_medium'].unique()

array(['(not set)', 'Sbol_catalog', 'web_polka', 'CPM', 'promo_sbol'],
      dtype=object)

In [12]:
df_out_pkl.loc[(df_out_pkl['utm_source'].isna())]['utm_medium'].describe()

count            97
unique            5
top       (not set)
freq             44
Name: utm_medium, dtype: object

In [13]:
df_out_pkl.loc[(df_out_pkl['utm_medium'] == '(not set)')]

Unnamed: 0,session_id,client_id,visit_date,visit_time,visit_number,utm_source,utm_medium,utm_campaign,utm_adcontent,utm_keyword,device_category,device_os,device_brand,device_model,device_screen_resolution,device_browser,geo_country,geo_city
14028,9117228372811440986.1636426594.1636426594,2122770150.1636426586,2021-11-09,05:56:34,1,iNFgfQPqHPBuvGCYtrQE,(not set),LTuZkdKfxRGVceoWkVyg,JNHcPlZPxEMWDnRiyoBf,puhZPIYqKXeFPaUviSjo,mobile,Android,Huawei,,360x720,Chrome,Russia,Saint Petersburg
17049,9130968089155116617.1636081226.1636081226,2125969177.1636081225,2021-11-05,06:00:26,1,iNFgfQPqHPBuvGCYtrQE,(not set),LTuZkdKfxRGVceoWkVyg,JNHcPlZPxEMWDnRiyoBf,puhZPIYqKXeFPaUviSjo,mobile,iOS,Apple,,414x896,Safari,Russia,(not set)
20218,9144903415488255785.1637263046.1637263046,2129213748.1634670377,2021-11-18,22:17:26,3,WeIwsqEbpZGZwhcQktNS,(not set),LTuZkdKfxRGVceoWkVyg,JNHcPlZPxEMWDnRiyoBf,puhZPIYqKXeFPaUviSjo,mobile,Android,OnePlus,,360x800,Chrome,Russia,Moscow
20219,9144903415488255785.1637295862.1637295862,2129213748.1634670377,2021-11-19,07:24:22,4,WeIwsqEbpZGZwhcQktNS,(not set),LTuZkdKfxRGVceoWkVyg,JNHcPlZPxEMWDnRiyoBf,puhZPIYqKXeFPaUviSjo,mobile,Android,OnePlus,,360x800,Chrome,Russia,Moscow
23827,9160652639657579968.1637324018.1637324018,2132880650.1636357568,2021-11-19,15:13:38,3,VCREhgqUPSUkmfOTvGiW,(not set),LTuZkdKfxRGVceoWkVyg,JNHcPlZPxEMWDnRiyoBf,puhZPIYqKXeFPaUviSjo,desktop,Windows,,,1920x1080,Edge,Russia,Moscow
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1841970,8974027286922426989.1638693485.1638693485,2089428549.1638693485,2021-12-05,11:38:05,1,WeIwsqEbpZGZwhcQktNS,(not set),LTuZkdKfxRGVceoWkVyg,JNHcPlZPxEMWDnRiyoBf,puhZPIYqKXeFPaUviSjo,mobile,iOS,Apple,,414x896,Safari,Russia,Bor
1844085,8983191243891913373.1637645316.1637645316,2091562199.1637069469,2021-11-23,08:28:36,3,VCREhgqUPSUkmfOTvGiW,(not set),LTuZkdKfxRGVceoWkVyg,JNHcPlZPxEMWDnRiyoBf,puhZPIYqKXeFPaUviSjo,desktop,Windows,,,1280x1024,Opera,Russia,Saratov
1851183,9015064428116524598.1629830709.1629830709,2098983253.1629830710,2021-08-24,21:00:00,1,WeIwsqEbpZGZwhcQktNS,(not set),LTuZkdKfxRGVceoWkVyg,JNHcPlZPxEMWDnRiyoBf,,desktop,,,,1440x900,Safari,Russia,Moscow
1854707,9031657626237725688.1624828920.1624828920,2102846658.1624828920,2021-06-28,00:00:00,1,iNFgfQPqHPBuvGCYtrQE,(not set),LTuZkdKfxRGVceoWkVyg,JNHcPlZPxEMWDnRiyoBf,,mobile,,Apple,,375x812,Safari,Russia,Moscow


In [14]:
df_out_pkl = change_nans(df_out_pkl, 'utm_medium', 'utm_source')

In [15]:
print_missing_values(df_out_pkl)

Процент пропущенных значений:
device_model                99.121633
utm_keyword                 58.174009
device_os                   57.533002
utm_adcontent               18.043410
utm_campaign                11.806346
device_brand                 6.380394
utm_source                   0.002849
geo_country                  0.000000
device_browser               0.000000
device_screen_resolution     0.000000
session_id                   0.000000
device_category              0.000000
client_id                    0.000000
utm_medium                   0.000000
visit_number                 0.000000
visit_time                   0.000000
visit_date                   0.000000
geo_city                     0.000000
dtype: float64


In [16]:
df_out_pkl['utm_source'].describe()

count                  1859989
unique                     293
top       ZpYIoDJMcFzVoPFsHGJL
freq                    578334
Name: utm_source, dtype: object

In [17]:
df_out_pkl['utm_source'] = df_out_pkl['utm_source'].fillna(df_out_pkl['utm_source'].describe()['top'])

In [18]:
print_missing_values(df_out_pkl)

Процент пропущенных значений:
device_model                99.121633
utm_keyword                 58.174009
device_os                   57.533002
utm_adcontent               18.043410
utm_campaign                11.806346
device_brand                 6.380394
device_category              0.000000
geo_country                  0.000000
device_browser               0.000000
device_screen_resolution     0.000000
session_id                   0.000000
client_id                    0.000000
utm_medium                   0.000000
utm_source                   0.000000
visit_number                 0.000000
visit_time                   0.000000
visit_date                   0.000000
geo_city                     0.000000
dtype: float64


##### Обработка device_brand

In [19]:
df_out_pkl.loc[(((df_out_pkl['device_browser'] == 'Safari') | (df_out_pkl['device_browser'] == '[FBAN')) & ((df_out_pkl['device_brand'].isna()) | (df_out_pkl['device_brand'] == '(not set)') | (df_out_pkl['device_brand'] == ''))), 'device_brand']  = 'Apple'

In [20]:
df_out_pkl.loc[(((df_out_pkl['device_brand'].isna()) | (df_out_pkl['device_brand'] == '(not set)') | (df_out_pkl['device_brand'] == '')) & (df_out_pkl['device_browser'] == 'Samsung Internet')), 'device_brand'] = 'Samsung'

In [21]:
df_out_pkl.loc[(((df_out_pkl['device_brand'].isna()) | (df_out_pkl['device_brand'] == '(not set)') | (df_out_pkl['device_brand'] == '')) & (df_out_pkl['device_category'] == 'desktop') & (df_out_pkl['device_os'] == 'Macintosh')), 'device_brand'] = 'Apple'

In [22]:
print_missing_values(df_out_pkl)

Процент пропущенных значений:
device_model                99.121633
utm_keyword                 58.174009
device_os                   57.533002
utm_adcontent               18.043410
utm_campaign                11.806346
device_brand                 5.038327
device_category              0.000000
geo_country                  0.000000
device_browser               0.000000
device_screen_resolution     0.000000
session_id                   0.000000
client_id                    0.000000
utm_medium                   0.000000
utm_source                   0.000000
visit_number                 0.000000
visit_time                   0.000000
visit_date                   0.000000
geo_city                     0.000000
dtype: float64


In [23]:
df_out_pkl.loc[((df_out_pkl['device_brand'].isna()) & (df_out_pkl['device_os'] == 'Macintosh')), 'device_brand'] = 'Apple'

In [24]:
df_out_pkl.loc[(df_out_pkl['device_brand'].isna())]['device_os'].unique()

array(['Windows', 'Linux', '(not set)', 'Chrome OS'], dtype=object)

In [25]:
df_out_pkl.loc[((df_out_pkl['device_brand'].isna()) & (df_out_pkl['device_category'] == 'desktop')), 'device_brand'] = '(not set)'

In [26]:
df_out_pkl[(df_out_pkl['device_brand'] == '(not set)')]['device_browser'].unique()

array(['Chrome', 'YaBrowser', 'Firefox', 'Opera', 'Edge', 'helloworld',
       'Android Webview', 'Mozilla Compatible Agent', 'Mozilla',
       'Instagram 213.0.0.29.120 Android',
       'Instagram 216.1.0.21.137 Android', 'Internet Explorer',
       'MRCHROME', 'Opera Mini', 'com.vk.vkclient', 'Puffin', 'Maxthon',
       'UC Browser', '(not set)', 'Coc Coc', 'Android',
       'Instagram 212.0.0.38.119 Android', 'Android Browser',
       'Android Runtime', 'Instagram 206.1.0.34.121 Android', 'SeaMonkey',
       'Threads 202.0.0.23.119'], dtype=object)

Приложение Threads закрыто в 2021-м году, поэтому данные, где оно используется, не пригодятся нам в анализе. Удаляем их

In [27]:
df_out_pkl = df_out_pkl[~(df_out_pkl['device_browser'].str.contains('Threads'))]

In [28]:
df_out_pkl[(df_out_pkl['device_brand'].isna())]

Unnamed: 0,session_id,client_id,visit_date,visit_time,visit_number,utm_source,utm_medium,utm_campaign,utm_adcontent,utm_keyword,device_category,device_os,device_brand,device_model,device_screen_resolution,device_browser,geo_country,geo_city
119227,128743428747935717.1638907878.1638907878,29975415.1638907877,2021-12-07,23:11:18,1,kjsLglQLzykiRbcDiGcD,cpc,XHNUiSKKGTEpSAlaVMFQ,JNHcPlZPxEMWDnRiyoBf,hAmNSZmQkKQKAjZEGlgb,mobile,Linux,,,393x851,Chrome,Russia,(not set)
121378,1296701003971072834.1637258051.1637258051,301911729.1637258050,2021-11-18,20:54:11,1,kjsLglQLzykiRbcDiGcD,cpc,UjApcvnaHtkydRkrLYuv,JNHcPlZPxEMWDnRiyoBf,puhZPIYqKXeFPaUviSjo,mobile,Linux,,,851x393,Chrome,Russia,Moscow
137577,1371041858455442706.1636723488.1636723488,319220558.1634571538,2021-11-12,16:24:48,6,kjsLglQLzykiRbcDiGcD,cpc,nSReTmyFtbSjlPrTKoaX,JNHcPlZPxEMWDnRiyoBf,pSXvqeeLXicMCzTYwAMy,mobile,Linux,,,424x942,Chrome,Russia,Nizhny Novgorod
265642,1946178047144129078.1638953526.1638953526,453129887.1638953526,2021-12-08,11:52:06,1,kjsLglQLzykiRbcDiGcD,cpc,WlbWUObZWvsimzdFdLYw,JNHcPlZPxEMWDnRiyoBf,puhZPIYqKXeFPaUviSjo,mobile,Windows,,,1920x1080,Edge,Russia,Moscow
276129,1993060007474941190.1637141768.1637141768,464045444.1637141766,2021-11-17,12:36:08,1,kjsLglQLzykiRbcDiGcD,cpc,UjApcvnaHtkydRkrLYuv,JNHcPlZPxEMWDnRiyoBf,puhZPIYqKXeFPaUviSjo,mobile,Linux,,,360x720,Chrome,Russia,Moscow
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1671089,8213511827182896805.1639303366.1639303366,1912357245.1639237285,2021-12-12,13:02:46,2,kjsLglQLzykiRbcDiGcD,cpc,UjApcvnaHtkydRkrLYuv,JNHcPlZPxEMWDnRiyoBf,puhZPIYqKXeFPaUviSjo,mobile,Linux,,,393x830,Chrome,Russia,Saint Petersburg
1704802,8365118374521985765.1635375847.1635375847,1947655895.1635375845,2021-10-28,02:04:07,1,kjsLglQLzykiRbcDiGcD,cpc,xhbLYKykcxmylecTsBop,JNHcPlZPxEMWDnRiyoBf,tzIeQUUHahhBuEkJqDVp,mobile,Linux,,,1600x1200,Chrome,India,Gurgaon
1716387,8416888273925634168.1635580025.1635580025,1959709514.1635580024,2021-10-30,10:47:05,1,kjsLglQLzykiRbcDiGcD,cpc,xhbLYKykcxmylecTsBop,JNHcPlZPxEMWDnRiyoBf,tzIeQUUHahhBuEkJqDVp,tablet,Linux,,,1600x1200,Chrome,India,Gurgaon
1724652,8453794644430050718.1636119335.1636119335,1968302448.1633310110,2021-11-05,16:35:35,2,kjsLglQLzykiRbcDiGcD,cpc,,,tzIeQUUHahhBuEkJqDVp,mobile,Linux,,,412x915,Chrome,Russia,Saint Petersburg


In [29]:
df_out_pkl[(df_out_pkl['device_brand'].isna())]['device_os'].unique()

array(['Linux', 'Windows'], dtype=object)

In [30]:
df_out_pkl[(df_out_pkl['device_brand'].isna())]['device_os'].describe()

count        67
unique        2
top       Linux
freq         51
Name: device_os, dtype: object

In [31]:
df_out_pkl.loc[(df_out_pkl['device_brand'].isna()), 'device_brand'] = '(not set)'

In [32]:
print_missing_values(df_out_pkl)

Процент пропущенных значений:
device_model                99.121632
utm_keyword                 58.173986
device_os                   57.532979
utm_adcontent               18.043419
utm_campaign                11.806353
device_category              0.000000
geo_country                  0.000000
device_browser               0.000000
device_screen_resolution     0.000000
device_brand                 0.000000
session_id                   0.000000
client_id                    0.000000
utm_medium                   0.000000
utm_source                   0.000000
visit_number                 0.000000
visit_time                   0.000000
visit_date                   0.000000
geo_city                     0.000000
dtype: float64


##### Обработка пропущенных значений device_model

In [33]:
df_out_pkl['device_model'].describe()

count                    16338
unique                     104
top       AuMdmADEIoPXiWpTsBEj
freq                      9778
Name: device_model, dtype: object

In [34]:
top_device_model = df_out_pkl['device_model'].describe()['top']

Вывод пустых значений device_model

In [35]:
df_out_pkl[df_out_pkl['device_model'].isna()]

Unnamed: 0,session_id,client_id,visit_date,visit_time,visit_number,utm_source,utm_medium,utm_campaign,utm_adcontent,utm_keyword,device_category,device_os,device_brand,device_model,device_screen_resolution,device_browser,geo_country,geo_city
0,9055434745589932991.1637753792.1637753792,2108382700.1637753791,2021-11-24,14:36:32,1,ZpYIoDJMcFzVoPFsHGJL,banner,LEoPHuyFvzoNfnzGgfcd,vCIpmpaGBnIQhyYNkXqp,puhZPIYqKXeFPaUviSjo,mobile,Android,Huawei,,360x720,Chrome,Russia,Zlatoust
1,905544597018549464.1636867290.1636867290,210838531.1636867288,2021-11-14,08:21:30,1,MvfHsxITijuriZxsqZqt,cpm,FTjNLDyTrXaWYgZymFkV,xhoenQgDQsgfEPYNPwKO,IGUCNvHlhfHpROGclCit,mobile,Android,Samsung,,385x854,Samsung Internet,Russia,Moscow
2,9055446045651783499.1640648526.1640648526,2108385331.1640648523,2021-12-28,02:42:06,1,ZpYIoDJMcFzVoPFsHGJL,banner,LEoPHuyFvzoNfnzGgfcd,vCIpmpaGBnIQhyYNkXqp,puhZPIYqKXeFPaUviSjo,mobile,Android,Huawei,,360x720,Chrome,Russia,Krasnoyarsk
3,9055447046360770272.1622255328.1622255328,2108385564.1622255328,2021-05-29,05:00:00,1,kjsLglQLzykiRbcDiGcD,cpc,,NOBKLgtuvqYWkXQHeYWM,,mobile,,Xiaomi,,393x786,Chrome,Russia,Moscow
4,9055447046360770272.1622255345.1622255345,2108385564.1622255328,2021-05-29,05:00:00,2,kjsLglQLzykiRbcDiGcD,cpc,,,,mobile,,Xiaomi,,393x786,Chrome,Russia,Moscow
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1860037,9055415581448263752.1640159305.1640159305,2108378238.1640159304,2021-12-22,10:48:25,1,BHcvLfOaCWvWTykYqHVe,cpc,,,VlqBmecIOXWjCWUmQkLd,desktop,Windows,(not set),,1920x1080,Chrome,Russia,Moscow
1860038,9055421130527858185.1622007305.1622007305,2108379530.1622007305,2021-05-26,08:00:00,1,fDLlAcSmythWSCVMvqvL,(none),LTuZkdKfxRGVceoWkVyg,JNHcPlZPxEMWDnRiyoBf,,mobile,,Apple,,390x844,Safari,Russia,Stavropol
1860039,9055422955903931195.1636979515.1636979515,2108379955.1636979515,2021-11-15,15:31:55,1,fDLlAcSmythWSCVMvqvL,(none),LTuZkdKfxRGVceoWkVyg,JNHcPlZPxEMWDnRiyoBf,puhZPIYqKXeFPaUviSjo,mobile,iOS,Apple,,375x667,Safari,Russia,Moscow
1860040,905543020766873816.1638189404.1638189404,210838164.1638189272,2021-11-29,15:36:44,1,ZpYIoDJMcFzVoPFsHGJL,banner,LEoPHuyFvzoNfnzGgfcd,JNHcPlZPxEMWDnRiyoBf,puhZPIYqKXeFPaUviSjo,mobile,Android,Xiaomi,,393x851,Chrome,Russia,Chelyabinsk


Заменяем значения на основе уже заполненных данных в колонке device_brand

In [36]:
df_out_pkl = change_nans(df_out_pkl, 'device_brand', 'device_model')


In [37]:
print_missing_values(df_out_pkl)

Процент пропущенных значений:
utm_keyword                 58.173986
device_os                   57.532979
device_model                25.774056
utm_adcontent               18.043419
utm_campaign                11.806353
device_category              0.000000
geo_country                  0.000000
device_browser               0.000000
device_screen_resolution     0.000000
device_brand                 0.000000
session_id                   0.000000
client_id                    0.000000
utm_medium                   0.000000
utm_source                   0.000000
visit_number                 0.000000
visit_time                   0.000000
visit_date                   0.000000
geo_city                     0.000000
dtype: float64


In [38]:
print_uniq_and_top(df_out_pkl[~(df_out_pkl['device_model'].isna())], 'device_model', 'device_category')
print_uniq_and_top(df_out_pkl[~(df_out_pkl['device_model'].isna())], 'device_model', 'device_brand')
print_uniq_and_top(df_out_pkl[~(df_out_pkl['device_model'].isna())], 'device_model', 'device_os')
print_uniq_and_top(df_out_pkl[~(df_out_pkl['device_model'].isna())], 'device_model', 'device_browser')

Уникальные значения  device_category  для строк с заполненным  device_model : ['mobile' 'desktop' 'tablet']
Самое часто встречаемое значение  device_category  для строк с заполненным  device_model : mobile
Уникальные значения  device_brand  для строк с заполненным  device_model : ['Samsung' 'Xiaomi' 'Apple' '(not set)' 'Vivo' 'Meizu' 'OnePlus' 'BQ'
 'Nokia' 'ZTE' 'Oukitel' 'Motorola' 'HTC' 'Symphony' 'Inoi' 'Coolpad'
 'Kata' 'Nuu' 'Advan' 'TurboPad' 'Dynamic']
Самое часто встречаемое значение  device_brand  для строк с заполненным  device_model : Apple
Уникальные значения  device_os  для строк с заполненным  device_model : ['Android' None 'iOS' 'Windows' 'Linux' 'Macintosh' '(not set)'
 'Chrome OS' 'Tizen' 'Samsung' 'Windows Phone' 'Nokia']
Самое часто встречаемое значение  device_os  для строк с заполненным  device_model : Android
Уникальные значения  device_browser  для строк с заполненным  device_model : ['Samsung Internet' 'Chrome' 'Safari' 'Android Webview' 'Safari (in-app)'
 'YaB

Когда бренд не определяется, ему ставится значение модели AuMdmADEIoPXiWpTsBEj, которое соответствует любому устройству, в том числе мобильному, поэтому заполним им оставшиеся пустые значения device_model

In [39]:
df_out_pkl['device_model'] = df_out_pkl['device_model'].fillna(top_device_model)

In [40]:
print_missing_values(df_out_pkl)

Процент пропущенных значений:
utm_keyword                 58.173986
device_os                   57.532979
utm_adcontent               18.043419
utm_campaign                11.806353
device_category              0.000000
geo_country                  0.000000
device_browser               0.000000
device_screen_resolution     0.000000
device_model                 0.000000
device_brand                 0.000000
session_id                   0.000000
client_id                    0.000000
utm_medium                   0.000000
utm_source                   0.000000
visit_number                 0.000000
visit_time                   0.000000
visit_date                   0.000000
geo_city                     0.000000
dtype: float64


##### Обработка пропущенных значений utm_keyword

In [41]:
df_out_pkl['utm_keyword'].describe()

count                   777981
unique                    1219
top       puhZPIYqKXeFPaUviSjo
freq                    506819
Name: utm_keyword, dtype: object

In [42]:
df_out_pkl = change_nans(df_out_pkl, 'utm_source', 'utm_keyword')
df_out_pkl = change_nans(df_out_pkl, 'utm_medium', 'utm_keyword')
print_missing_values(df_out_pkl)

Процент пропущенных значений:
device_os                   57.532979
utm_adcontent               18.043419
utm_campaign                11.806353
utm_keyword                  0.000161
device_category              0.000000
geo_country                  0.000000
device_browser               0.000000
device_screen_resolution     0.000000
device_model                 0.000000
device_brand                 0.000000
session_id                   0.000000
client_id                    0.000000
utm_medium                   0.000000
utm_source                   0.000000
visit_number                 0.000000
visit_time                   0.000000
visit_date                   0.000000
geo_city                     0.000000
dtype: float64


Выводим заполненные значения

In [43]:
top_key_word = df_out_pkl['utm_keyword'].describe()['top']
df_ukw_fill = df_out_pkl[~(df_out_pkl['utm_keyword'].isna())]
df_ukw_fill.head() 

Unnamed: 0,session_id,client_id,visit_date,visit_time,visit_number,utm_source,utm_medium,utm_campaign,utm_adcontent,utm_keyword,device_category,device_os,device_brand,device_model,device_screen_resolution,device_browser,geo_country,geo_city
0,9055434745589932991.1637753792.1637753792,2108382700.1637757,2021-11-24,14:36:32,1,ZpYIoDJMcFzVoPFsHGJL,banner,LEoPHuyFvzoNfnzGgfcd,vCIpmpaGBnIQhyYNkXqp,puhZPIYqKXeFPaUviSjo,mobile,Android,Huawei,AuMdmADEIoPXiWpTsBEj,360x720,Chrome,Russia,Zlatoust
1,905544597018549464.1636867290.1636867290,210838531.16368672,2021-11-14,08:21:30,1,MvfHsxITijuriZxsqZqt,cpm,FTjNLDyTrXaWYgZymFkV,xhoenQgDQsgfEPYNPwKO,IGUCNvHlhfHpROGclCit,mobile,Android,Samsung,qBRdfuuhOnnqwSqNiPOv,385x854,Samsung Internet,Russia,Moscow
2,9055446045651783499.1640648526.1640648526,2108385331.164065,2021-12-28,02:42:06,1,ZpYIoDJMcFzVoPFsHGJL,banner,LEoPHuyFvzoNfnzGgfcd,vCIpmpaGBnIQhyYNkXqp,puhZPIYqKXeFPaUviSjo,mobile,Android,Huawei,AuMdmADEIoPXiWpTsBEj,360x720,Chrome,Russia,Krasnoyarsk
3,9055447046360770272.1622255328.1622255328,2108385564.1622252,2021-05-29,05:00:00,1,kjsLglQLzykiRbcDiGcD,cpc,,NOBKLgtuvqYWkXQHeYWM,puhZPIYqKXeFPaUviSjo,mobile,,Xiaomi,qBRdfuuhOnnqwSqNiPOv,393x786,Chrome,Russia,Moscow
4,9055447046360770272.1622255345.1622255345,2108385564.1622252,2021-05-29,05:00:00,2,kjsLglQLzykiRbcDiGcD,cpc,,,puhZPIYqKXeFPaUviSjo,mobile,,Xiaomi,qBRdfuuhOnnqwSqNiPOv,393x786,Chrome,Russia,Moscow


Вывод пустых значений

In [44]:
df_out_pkl[df_out_pkl['utm_keyword'].isna()]

Unnamed: 0,session_id,client_id,visit_date,visit_time,visit_number,utm_source,utm_medium,utm_campaign,utm_adcontent,utm_keyword,device_category,device_os,device_brand,device_model,device_screen_resolution,device_browser,geo_country,geo_city
348176,2314621322702223639.1630927126.1630927126,538914772.1630927,2021-09-06,14:00:00,1,eBckTUycdWUDbWqXENLu,tablet,VbwhIlTuTQfiffXsGgWV,JNHcPlZPxEMWDnRiyoBf,,desktop,,,AuMdmADEIoPXiWpTsBEj,1536x864,Edge,Russia,Moscow
711650,3936481293849346139.1625047645.1625047645,916533473.1625048,2021-06-30,13:00:00,5,kKtSojgDlfomwthXhPjz,last,SbYAsCvXapXBOIxEKBZs,qHPyQVqWZtIwxZvzhkMv,,desktop,,,AuMdmADEIoPXiWpTsBEj,1440x900,Chrome,Russia,Moscow
1421278,7101726389746577562.1625075866.1625075866,1653499526.1625075,2021-06-30,20:00:00,1,tasNSXvMCFllwbrJviyg,qrcodevideo,dITetAFbdxWiRaArKHRu,izFaiVJBdrbEvBjAmFlH,,mobile,,Apple,qBRdfuuhOnnqwSqNiPOv,375x812,Safari,Russia,Moscow


In [45]:
print_uniq_and_top(df_ukw_fill, 'utm_keyword', 'utm_campaign')

Уникальные значения  utm_campaign  для строк с заполненным  utm_keyword : ['LEoPHuyFvzoNfnzGgfcd' 'FTjNLDyTrXaWYgZymFkV' nan 'LTuZkdKfxRGVceoWkVyg'
 'gecBYcKZCPMcVYdSSzKP' 'eimRuUrNhZLAYcwRrNXu' 'LwJZxKuWmvOhPsaCeRjG'
 'okTXSMadDkjvntEHzIjp' 'MXqmDyetMTICSSitTjWV' 'PXQWdUxeUoXfoKzTBGpY'
 'zxoiLxhuSIFrCeTLQVWZ' 'nSReTmyFtbSjlPrTKoaX' 'TmThBvoCcwkCZZUWACYq'
 'BVKxkCOHKUOvkpbrLMgZ' 'QEejXfOCtOMFLZqIPApp' 'kVOrIKZFrEYGvixPclal'
 'XHNUiSKKGTEpSAlaVMFQ' 'foFTSdUvNqqkPzZvgiqt' 'SgIUDYUKnyWHVowUOqid'
 'BAZCuyHZnaPrMGOMrcCQ' 'bxOTvPtyGSdUrbwoXCPO' 'DXVdsSTQphSYVmRchYKt'
 'GWZGdHKPgmZPNPwkJshU' 'JwYIveaHVpeeRZloQCfF' 'VBmazutCflYumtDHrQYe'
 'dZqEgyoxhtbeLFMtnnVR' 'UvuMsOSDBWQGOIbDbXfV' 'RoDitORHdzGfGhNCyEMy'
 'sbJRYgVfvcnqKJNDDYIr' 'XGYOaJEasWTwAKNdCGVX' 'dMIPlIFgKzafYgowsqtp'
 'EvhrtRzIJnQYHziPiLzV' 'AqudXfUnmXWSDWVGYaXr' 'RxecHElWobBxIeAkqFXV'
 'hkvDVxqLOzGjGaoNiNzN' 'QdLfySaGXolfTBSNVfHn' 'ULAUPJGgNiZYQgwZwZGR'
 'JajANoFxoqXfKRNBUhzx' 'UEtHtwAEXfprDUERwqqj' 'IKQsApKuPmZqqmhieEgf'
 'KgicpPxiEQ

In [46]:
print_uniq_and_top(df_ukw_fill, 'utm_keyword', 'utm_medium')

Уникальные значения  utm_medium  для строк с заполненным  utm_keyword : ['banner' 'cpm' 'cpc' 'organic' '(none)' 'referral' 'smm' 'stories'
 'blogger_channel' 'blogger_stories' 'email' 'app' 'vk_smm' 'cpv' 'push'
 'partner' 'tg' 'cpa' 'post' 'smartbanner' 'info_text' 'outlook' 'clicks'
 'landing' 'blogger_header' 'qr' '(not set)' 'fb_smm' 'nkp' 'google_cpc'
 'sms' 'article' 'users_msk' 'static' 'ok_smm' 'cbaafe' 'Sbol_catalog'
 'landing_interests' 'yandex_cpc' 'web_polka' 'linktest' 'sber_app' 'CPM'
 'medium' 'promo_sbol' 'reach' 'desktop' 'dom_click' 'social' 'catalogue'
 'main_polka' 'link' 'promo_sber']
Самое часто встречаемое значение  utm_medium  для строк с заполненным  utm_keyword : banner


In [47]:
print_uniq_and_top(df_ukw_fill, 'utm_keyword', 'utm_source')

Уникальные значения  utm_source  для строк с заполненным  utm_keyword : ['ZpYIoDJMcFzVoPFsHGJL' 'MvfHsxITijuriZxsqZqt' 'kjsLglQLzykiRbcDiGcD'
 'TxKUcPpthBDPieTGmVhx' 'fDLlAcSmythWSCVMvqvL' 'gVRrcxiDQubJiljoTbGm'
 'SzZERoLMmrEUEhDaYcyN' 'BHcvLfOaCWvWTykYqHVe' 'klTrhUaShgnjIbaPmqjc'
 'ghoaGAksqhKomdFrxgyJ' 'vFcAhRxLfOWKhvxjELkx' 'QxAxdyPLuQMEcrdZWdWb'
 'nSReTmyFtbSjlPrTKoaX' 'ISrKoXQCxqqYvAZICvjs' 'hTjLvqNxGggkGnxSCaTm'
 'RmEBuqrriAfAVsLQQmhk' 'IZEXUFLARCUMynmHNBGo' 'bByPQxmDaMXgpHeypKSM'
 'jaSOmLICuBzCFqHfBdRg' 'PlbkrSYoHuZBWfYjYnfw' 'dGlVSdmIlgWDyOPjfwwy'
 'gDBGzjFKYabGgSPZvrDH' 'NGNkCWwKgYFmiCCeZVxg' 'GpAkIXsclxDGyILfNlrR'
 'oZCzWSykfixnjMPDNjSU' 'dyicZQGoeASogoSafjEh' 'fgymSoTvjKPEgaIJqsiH'
 'aXQzDWsJuGXeBXexNHjc' 'fbFKcMumlScApQMqFIqp' 'eLzNJHzPelJpEyBwMrKo'
 'ZHCJROlbqnkXTqIuVxnm' 'iNFgfQPqHPBuvGCYtrQE' 'nmfptFmSirEqNzAzqbXA'
 'maiZOsuEAMdeoRVsYoFk' 'HbolMJUevblAbkHClEQa' 'cAqxcRdSSFAyCPUxQHqy'
 'geDcueAOghDzHkGMmdOq' 'DnEUulZAecfGPvdtZBYS' 'HFaOtpcChAlcMuxEAlpu'
 'FTAuYVNoYYxgvKMp

In [48]:
df_out_pkl[(df_out_pkl['utm_medium'] != 'banner')]['utm_keyword'].describe()

count                  1307766
unique                    1219
top       puhZPIYqKXeFPaUviSjo
freq                   1036604
Name: utm_keyword, dtype: object

In [49]:
df_out_pkl[(df_out_pkl['utm_source'] != 'ZpYIoDJMcFzVoPFsHGJL')]['utm_keyword'].describe()

count                  1281651
unique                    1219
top       puhZPIYqKXeFPaUviSjo
freq                   1010529
Name: utm_keyword, dtype: object

In [50]:
df_out_pkl[(df_out_pkl['utm_campaign'] != 'LEoPHuyFvzoNfnzGgfcd')]['utm_keyword'].describe()

count                  1535994
unique                    1219
top       puhZPIYqKXeFPaUviSjo
freq                   1264832
Name: utm_keyword, dtype: object

In [51]:
df_ukw_fill['utm_keyword'].unique() 

array(['puhZPIYqKXeFPaUviSjo', 'IGUCNvHlhfHpROGclCit',
       'PwscUHjoUJDrtfWESIHj', ..., 'aCaBoYaQJPVffhjBQnut',
       'RaeBwzCLChMDgYYukNOw', 'fcXWTQaKfxbkBkBnzLhK'], dtype=object)

In [52]:
df_ukw_fill['utm_keyword'].describe() 

count                  1860038
unique                    1219
top       puhZPIYqKXeFPaUviSjo
freq                   1588876
Name: utm_keyword, dtype: object

Таким образом, от связанных колонок самое часто встречаемое значение не зависит, что позволяет заполнить им пустые значения utm_keyword

In [53]:
df_out_pkl['utm_keyword'] = df_out_pkl['utm_keyword'].fillna(top_key_word)

In [54]:
print_missing_values(df_out_pkl)

Процент пропущенных значений:
device_os                   57.532979
utm_adcontent               18.043419
utm_campaign                11.806353
session_id                   0.000000
device_category              0.000000
geo_country                  0.000000
device_browser               0.000000
device_screen_resolution     0.000000
device_model                 0.000000
device_brand                 0.000000
utm_keyword                  0.000000
client_id                    0.000000
utm_medium                   0.000000
utm_source                   0.000000
visit_number                 0.000000
visit_time                   0.000000
visit_date                   0.000000
geo_city                     0.000000
dtype: float64


##### Обработка пропущенных значений device_os

In [55]:
df_out_pkl['device_os'].describe()

count      789904
unique         13
top       Android
freq       464054
Name: device_os, dtype: object

Запоминаем отдельно самые часто встречающиеся значения для каждой категории

In [56]:
top_device_os_m = df_out_pkl.loc[(df_out_pkl['device_category'] == 'mobile'), 'device_os'].describe()['top']
top_device_os_t = df_out_pkl.loc[(df_out_pkl['device_category'] == 'tablet'), 'device_os'].describe()['top']
top_device_os_d = df_out_pkl.loc[(df_out_pkl['device_category'] == 'desktop'), 'device_os'].describe()['top']

In [57]:
df_do_fill = df_out_pkl[~(df_out_pkl['device_os'].isna())]
df_do_fill 

Unnamed: 0,session_id,client_id,visit_date,visit_time,visit_number,utm_source,utm_medium,utm_campaign,utm_adcontent,utm_keyword,device_category,device_os,device_brand,device_model,device_screen_resolution,device_browser,geo_country,geo_city
0,9055434745589932991.1637753792.1637753792,2108382700.1637753791,2021-11-24,14:36:32,1,ZpYIoDJMcFzVoPFsHGJL,banner,LEoPHuyFvzoNfnzGgfcd,vCIpmpaGBnIQhyYNkXqp,puhZPIYqKXeFPaUviSjo,mobile,Android,Huawei,AuMdmADEIoPXiWpTsBEj,360x720,Chrome,Russia,Zlatoust
1,905544597018549464.1636867290.1636867290,210838531.1636867288,2021-11-14,08:21:30,1,MvfHsxITijuriZxsqZqt,cpm,FTjNLDyTrXaWYgZymFkV,xhoenQgDQsgfEPYNPwKO,IGUCNvHlhfHpROGclCit,mobile,Android,Samsung,qBRdfuuhOnnqwSqNiPOv,385x854,Samsung Internet,Russia,Moscow
2,9055446045651783499.1640648526.1640648526,2108385331.1640648523,2021-12-28,02:42:06,1,ZpYIoDJMcFzVoPFsHGJL,banner,LEoPHuyFvzoNfnzGgfcd,vCIpmpaGBnIQhyYNkXqp,puhZPIYqKXeFPaUviSjo,mobile,Android,Huawei,AuMdmADEIoPXiWpTsBEj,360x720,Chrome,Russia,Krasnoyarsk
6,9055455318486370642.1640843788.1640843788,2108387490.1640843602,2021-12-30,08:56:28,1,TxKUcPpthBDPieTGmVhx,cpc,FTjNLDyTrXaWYgZymFkV,LcGIUNPUAmXtQJaDfFBR,PwscUHjoUJDrtfWESIHj,tablet,Android,Lenovo,AuMdmADEIoPXiWpTsBEj,602x1029,YaBrowser,Russia,Saint Petersburg
8,9055462349345527315.1638536723.1638536723,2108389127.1638536723,2021-12-03,16:05:23,1,ZpYIoDJMcFzVoPFsHGJL,banner,LEoPHuyFvzoNfnzGgfcd,vCIpmpaGBnIQhyYNkXqp,puhZPIYqKXeFPaUviSjo,mobile,iOS,Apple,qBRdfuuhOnnqwSqNiPOv,390x844,Safari,Russia,Moscow
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1860036,9055401700113249881.1639446112.1639446112,2108375006.1639446105,2021-12-14,04:41:52,1,ZpYIoDJMcFzVoPFsHGJL,banner,TmThBvoCcwkCZZUWACYq,JNHcPlZPxEMWDnRiyoBf,puhZPIYqKXeFPaUviSjo,mobile,Android,ZTE,qBRdfuuhOnnqwSqNiPOv,360x640,Chrome,Russia,Blagoveshchensk
1860037,9055415581448263752.1640159305.1640159305,2108378238.1640159304,2021-12-22,10:48:25,1,BHcvLfOaCWvWTykYqHVe,cpc,,,VlqBmecIOXWjCWUmQkLd,desktop,Windows,(not set),qBRdfuuhOnnqwSqNiPOv,1920x1080,Chrome,Russia,Moscow
1860039,9055422955903931195.1636979515.1636979515,2108379955.1636979515,2021-11-15,15:31:55,1,fDLlAcSmythWSCVMvqvL,(none),LTuZkdKfxRGVceoWkVyg,JNHcPlZPxEMWDnRiyoBf,puhZPIYqKXeFPaUviSjo,mobile,iOS,Apple,qBRdfuuhOnnqwSqNiPOv,375x667,Safari,Russia,Moscow
1860040,905543020766873816.1638189404.1638189404,210838164.1638189272,2021-11-29,15:36:44,1,ZpYIoDJMcFzVoPFsHGJL,banner,LEoPHuyFvzoNfnzGgfcd,JNHcPlZPxEMWDnRiyoBf,puhZPIYqKXeFPaUviSjo,mobile,Android,Xiaomi,qBRdfuuhOnnqwSqNiPOv,393x851,Chrome,Russia,Chelyabinsk


In [58]:
df_out_pkl[df_out_pkl['device_os'].isna()]

Unnamed: 0,session_id,client_id,visit_date,visit_time,visit_number,utm_source,utm_medium,utm_campaign,utm_adcontent,utm_keyword,device_category,device_os,device_brand,device_model,device_screen_resolution,device_browser,geo_country,geo_city
3,9055447046360770272.1622255328.1622255328,2108385564.1622255328,2021-05-29,05:00:00,1,kjsLglQLzykiRbcDiGcD,cpc,,NOBKLgtuvqYWkXQHeYWM,puhZPIYqKXeFPaUviSjo,mobile,,Xiaomi,qBRdfuuhOnnqwSqNiPOv,393x786,Chrome,Russia,Moscow
4,9055447046360770272.1622255345.1622255345,2108385564.1622255328,2021-05-29,05:00:00,2,kjsLglQLzykiRbcDiGcD,cpc,,,puhZPIYqKXeFPaUviSjo,mobile,,Xiaomi,qBRdfuuhOnnqwSqNiPOv,393x786,Chrome,Russia,Moscow
5,9055447192389856083.1622453074.1622453074,2108385598.1622453075,2021-05-31,12:00:00,1,kjsLglQLzykiRbcDiGcD,organic,LTuZkdKfxRGVceoWkVyg,JNHcPlZPxEMWDnRiyoBf,puhZPIYqKXeFPaUviSjo,mobile,,Apple,qBRdfuuhOnnqwSqNiPOv,375x812,Safari,Russia,Saint Petersburg
7,9055461992850812764.1626107740.1626107740,2108389044.1626107740,2021-07-12,19:00:00,1,ZpYIoDJMcFzVoPFsHGJL,banner,LEoPHuyFvzoNfnzGgfcd,JNHcPlZPxEMWDnRiyoBf,puhZPIYqKXeFPaUviSjo,mobile,,Samsung,qBRdfuuhOnnqwSqNiPOv,360x640,Chrome,Russia,Saint Petersburg
9,9055466554104774132.1624800757.1624800757,2108390106.1624800756,2021-06-27,16:00:00,1,ZpYIoDJMcFzVoPFsHGJL,banner,LEoPHuyFvzoNfnzGgfcd,JNHcPlZPxEMWDnRiyoBf,puhZPIYqKXeFPaUviSjo,mobile,,Samsung,qBRdfuuhOnnqwSqNiPOv,412x915,Chrome,Russia,Moscow
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1860030,9055382948278467242.1631877802.1631877802,2108370640.1631877802,2021-09-17,14:00:00,1,MvfHsxITijuriZxsqZqt,cpm,FTjNLDyTrXaWYgZymFkV,PkybGvWbaqORmxjNunqZ,puhZPIYqKXeFPaUviSjo,mobile,,Xiaomi,qBRdfuuhOnnqwSqNiPOv,393x851,Chrome,Russia,Saint Petersburg
1860032,9055394269810294140.1629912447.1629912447,2108373276.1629912444,2021-08-25,20:00:00,1,bByPQxmDaMXgpHeypKSM,referral,LTuZkdKfxRGVceoWkVyg,JNHcPlZPxEMWDnRiyoBf,puhZPIYqKXeFPaUviSjo,mobile,,Samsung,qBRdfuuhOnnqwSqNiPOv,360x800,Android Webview,Russia,Saint Petersburg
1860034,9055397194683347295.1630237022.1630237022,2108373957.1630237023,2021-08-29,14:00:00,1,ISrKoXQCxqqYvAZICvjs,blogger_stories,zfwIehuEfWYdYrEZgRLo,JNHcPlZPxEMWDnRiyoBf,puhZPIYqKXeFPaUviSjo,mobile,,Apple,qBRdfuuhOnnqwSqNiPOv,414x896,Safari,Russia,Zheleznodorozhny
1860035,9055398929844789828.1624891784.1624891784,2108374361.1624891972,2021-06-28,17:00:00,1,kjsLglQLzykiRbcDiGcD,cpc,vXsFkagGabkcWKlgLzSg,,puhZPIYqKXeFPaUviSjo,mobile,,Samsung,qBRdfuuhOnnqwSqNiPOv,320x676,Chrome,Russia,Naro-Fominsk


Во многих случаях случаях device_os зависит от device_brand, и в этих записях мы точно можем заполнить отсутствующие значения

In [59]:
df_out_pkl.loc[(df_out_pkl['device_brand'] == 'Apple'), 'device_os'].describe()

count     231958
unique         5
top          iOS
freq      207104
Name: device_os, dtype: object

In [60]:
df_out_pkl.loc[(df_out_pkl['device_brand'] == 'Apple'), 'device_os'].unique()

array([None, 'iOS', 'Macintosh', 'Linux', '(not set)', 'Windows Phone'],
      dtype=object)

In [61]:
df_out_pkl.loc[(df_out_pkl['device_brand'] == 'Apple'), 'device_category'].unique()

array(['mobile', 'tablet', 'desktop'], dtype=object)

In [62]:
df_out_pkl.loc[(df_out_pkl['device_brand'] == 'Apple'), 'device_category'].describe()

count     603640
unique         3
top       mobile
freq      544829
Name: device_category, dtype: object

In [63]:
df_out_pkl.loc[((df_out_pkl['device_brand'] == 'Apple') & ((df_out_pkl['device_category'] == 'mobile') | (df_out_pkl['device_category'] == 'tablet')) & (df_out_pkl['device_os'].isna())), 'device_os']  = 'iOS'

In [64]:
df_out_pkl.loc[(df_out_pkl['device_brand'] == 'BlackBerry'), 'device_os'].unique()

array([None, 'Android', 'BlackBerry'], dtype=object)

In [65]:
df_out_pkl.loc[((df_out_pkl['device_brand'] == 'BlackBerry') & (df_out_pkl['device_os'] == 'BlackBerry'))].shape

(27, 18)

In [66]:
df_out_pkl.loc[(df_out_pkl['device_brand'] == 'BlackBerry'), 'device_os'].describe()

count          92
unique          2
top       Android
freq           65
Name: device_os, dtype: object

BlackBerry OS - достаточно редкая в текущее время система, и при анализе она не окажет существенного влияния на результат, поэтому заполняем device_os для этого бренда значением самой часто встречаемой ОС  

In [67]:
df_out_pkl.loc[(df_out_pkl['device_brand'] == 'BlackBerry'), 'device_os'] = top_device_os_m

In [68]:
df_out_pkl.loc[(df_out_pkl['device_brand'] != 'Apple'), 'device_os'].unique()

array(['Android', None, 'Windows', 'Linux', '(not set)', 'Chrome OS',
       'Tizen', 'Firefox OS', 'Samsung', 'Windows Phone', 'Nokia'],
      dtype=object)

In [69]:
df_do_fill.loc[(df_do_fill['device_brand'] != 'Apple'), 'device_os'].unique()

array(['Android', 'Windows', 'Linux', '(not set)', 'Chrome OS',
       'BlackBerry', 'Tizen', 'Firefox OS', 'Samsung', 'Windows Phone',
       'Nokia'], dtype=object)

In [70]:
print(f"Самые часто встречаемые мобильные ОС: {top_device_os_m} \nПланшетные ОС: {top_device_os_t}")

Самые часто встречаемые мобильные ОС: Android 
Планшетные ОС: Android


In [71]:
list_brands = df_do_fill.loc[(((df_do_fill['device_os'] == top_device_os_m) | (df_do_fill['device_os'] == top_device_os_t)) & ((df_do_fill['device_category'] == 'mobile') | (df_do_fill['device_category'] == 'tablet'))), 'device_brand'].unique()
list_brands

array(['Huawei', 'Samsung', 'Lenovo', 'Xiaomi', 'Meizu', 'OnePlus',
       'Realme', 'OPPO', '(not set)', 'Philips', 'Vivo', 'Nokia',
       'Alcatel', 'LG', 'BQ', 'Tecno', 'Asus', 'itel', 'Infinix', 'ZTE',
       'Wiko', 'Google', 'Sony', 'Wileyfox', 'Blackview', 'Cubot',
       'DOOGEE', 'DEXP', 'Motorola', 'TP-Link', 'Hisense', 'Acer',
       'Oukitel', 'LeEco', 'Prestigio', 'POCO', 'Vsmart', 'HTC',
       'Ulefone', 'CAT', 'Leagoo', 'InFocus', 'Inoi', 'BlackBerry',
       'Micromax', 'Umidigi', 'Sharp', 'Jiake', 'ZOJI', 'Yuntab',
       'Mozilla', 'Neffos', 'Highscreen', 'Karbonn', 'TCL', 'BLU',
       'Haier', 'Vertex', 'Coolpad', 'HOMTOM', 'LeTV', 'A1',
       'General Mobile', 'Gome', 'Egreat', 'Mito', 'SenseIT', 'Archos',
       'Keecoo', 'Vernee', 'Panasonic', 'InnJoo', 'Iris', 'Black Fox',
       'Lava', 'myPhone', 'Nomu', 'AGM', 'Nuu', 'UGOOS', 'Alldocube',
       'MTC', 'Komu', 'Qbex', 'Symphony', 'Wigor', 'Oysters', 'Fly',
       'Gionee', 'Artel', 'Ananda', 'Smartisan', '

In [72]:
df_out_pkl.loc[((df_out_pkl['device_brand'].isin(list(list_brands))) & (df_out_pkl['device_category'] == 'mobile') & (df_out_pkl['device_os'].isna())), 'device_os'] = top_device_os_m
df_out_pkl.loc[((df_out_pkl['device_brand'].isin(list(list_brands))) & (df_out_pkl['device_category'] == 'tablet') & (df_out_pkl['device_os'].isna())), 'device_os'] = top_device_os_t

Теперь проверяем характеристики device_browser и device_category

In [73]:
df_out_pkl.loc[(df_out_pkl['device_os'].isna()), 'device_browser'].unique()

array(['YaBrowser', 'Chrome', 'Safari', 'Firefox', 'Opera', 'Edge',
       'Instagram 208.0.0.32.135 Android',
       'Instagram 209.0.0.21.119 Android', '(not set)',
       'Mozilla Compatible Agent', 'Coc Coc', 'Samsung Internet',
       'Android', '[FBAN', 'Puffin', 'Internet Explorer', 'MRCHROME',
       'Instagram 199.1.0.34.119 Android', 'UC Browser', 'SeaMonkey',
       'Instagram 194.0.0.36.172 Android',
       'Instagram 202.0.0.37.123 Android', 'Mozilla',
       'Instagram 192.0.0.35.123 Android', 'Maxthon', 'Android Webview',
       'Instagram 158.0.0.30.123 Android', 'Konqueror'], dtype=object)

Встроенные браузеры так же позволяют однозначно определить систему

In [74]:
df_out_pkl.loc[((df_out_pkl['device_browser'] == 'Safari') & (df_out_pkl['device_os'].isna()) & (df_out_pkl['device_category'] == 'desktop')), 'device_os'] = 'Macintosh'

In [75]:
df_out_pkl.loc[((df_out_pkl['device_browser'] == 'Safari') & (df_out_pkl['device_os'].isna()) & (df_out_pkl['device_category'] != 'desktop')), 'device_os'] = 'iOS'

In [76]:
df_out_pkl.loc[((df_out_pkl['device_os'].isna()) & (df_out_pkl['device_browser'] == 'Samsung Internet')), 'device_os'] = 'Android'

In [77]:
df_out_pkl.loc[((df_out_pkl['device_os'].isna()) & ((df_out_pkl['device_browser'] == 'Edge') | (df_out_pkl['device_browser'] == 'Internet Explorer'))), 'device_os'] = 'Windows'

In [78]:
df_out_pkl.loc[((df_out_pkl['device_os'].isna()) & (df_out_pkl['device_browser'].str.contains('Android', na=False))), 'device_os'] = 'Android' 

In [79]:
df_out_pkl.loc[(df_out_pkl['device_browser'] == 'Mozilla Compatible Agent'), 'device_category'].unique()

array(['desktop'], dtype=object)

In [80]:
df_out_pkl.loc[((df_out_pkl['device_browser'] == 'Mozilla Compatible Agent') & (df_out_pkl['device_os'].isna())), 'device_category'].unique()

array(['desktop'], dtype=object)

In [81]:
df_out_pkl.loc[(df_out_pkl['device_browser'] == 'Mozilla Compatible Agent'), 'device_os'].describe()

count           302
unique            3
top       (not set)
freq            260
Name: device_os, dtype: object

In [82]:
df_out_pkl.loc[(df_out_pkl['device_browser'] == 'Mozilla Compatible Agent'), 'device_brand'].describe()

count           362
unique            3
top       (not set)
freq            275
Name: device_brand, dtype: object

In [83]:
df_out_pkl.loc[(df_out_pkl['device_browser'] == 'Mozilla Compatible Agent')]

Unnamed: 0,session_id,client_id,visit_date,visit_time,visit_number,utm_source,utm_medium,utm_campaign,utm_adcontent,utm_keyword,device_category,device_os,device_brand,device_model,device_screen_resolution,device_browser,geo_country,geo_city
8519,9093064959820033783.1637871353.1637871353,2117144167.1637871351,2021-11-25,23:15:53,1,fDLlAcSmythWSCVMvqvL,(none),LTuZkdKfxRGVceoWkVyg,JNHcPlZPxEMWDnRiyoBf,puhZPIYqKXeFPaUviSjo,desktop,(not set),(not set),qBRdfuuhOnnqwSqNiPOv,800x600,Mozilla Compatible Agent,Iran,(not set)
10828,9102899502911587345.1629488145.1629488145,2119433950.1629488145,2021-08-20,22:00:00,1,nSReTmyFtbSjlPrTKoaX,banner,BVKxkCOHKUOvkpbrLMgZ,JNHcPlZPxEMWDnRiyoBf,puhZPIYqKXeFPaUviSjo,desktop,,,AuMdmADEIoPXiWpTsBEj,768x1024,Mozilla Compatible Agent,Russia,Saint Petersburg
24764,9164669134152695542.1630043894.1630043894,2133815813.1630043894,2021-08-27,08:00:00,1,nSReTmyFtbSjlPrTKoaX,banner,BVKxkCOHKUOvkpbrLMgZ,JNHcPlZPxEMWDnRiyoBf,puhZPIYqKXeFPaUviSjo,desktop,,,AuMdmADEIoPXiWpTsBEj,768x1024,Mozilla Compatible Agent,Russia,Moscow
29900,9187225035383329340.1637438015.1637438015,2139067518.1637438012,2021-11-20,22:53:35,1,fDLlAcSmythWSCVMvqvL,(none),LTuZkdKfxRGVceoWkVyg,JNHcPlZPxEMWDnRiyoBf,puhZPIYqKXeFPaUviSjo,desktop,(not set),(not set),qBRdfuuhOnnqwSqNiPOv,800x600,Mozilla Compatible Agent,Russia,Nizhny Tagil
31090,9192716748074205719.1639924247.1639924247,2140346157.1639924247,2021-12-19,17:30:47,1,fDLlAcSmythWSCVMvqvL,(none),LTuZkdKfxRGVceoWkVyg,JNHcPlZPxEMWDnRiyoBf,puhZPIYqKXeFPaUviSjo,desktop,(not set),(not set),qBRdfuuhOnnqwSqNiPOv,800x600,Mozilla Compatible Agent,Russia,(not set)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1858913,905062508407902236.1630359614.1630359614,210726286.1630359580,2021-08-31,00:00:00,2,kjsLglQLzykiRbcDiGcD,cpc,,,puhZPIYqKXeFPaUviSjo,desktop,,,AuMdmADEIoPXiWpTsBEj,1920x1080,Mozilla Compatible Agent,Russia,Saint Petersburg
1858914,905062508407902236.1630359673.1630359673,210726286.1630359580,2021-08-31,00:00:00,3,kjsLglQLzykiRbcDiGcD,cpc,bJJuEXRheRIxXEaYIXqM,,puhZPIYqKXeFPaUviSjo,desktop,,,AuMdmADEIoPXiWpTsBEj,1920x1080,Mozilla Compatible Agent,Russia,Saint Petersburg
1858915,905062508407902236.1630359674.1630359674,210726286.1630359580,2021-08-31,00:00:00,4,kjsLglQLzykiRbcDiGcD,cpc,,,puhZPIYqKXeFPaUviSjo,desktop,,,AuMdmADEIoPXiWpTsBEj,1920x1080,Mozilla Compatible Agent,Russia,Saint Petersburg
1859651,9053936214320221766.1637486150.1637486167,2108033796.1637486150,2021-11-21,12:16:07,1,bByPQxmDaMXgpHeypKSM,referral,QdLfySaGXolfTBSNVfHn,SOkCdPxfUcZUzzOdgGES,puhZPIYqKXeFPaUviSjo,desktop,Linux,(not set),qBRdfuuhOnnqwSqNiPOv,1600x900,Mozilla Compatible Agent,Russia,Moscow


Mozilla Compatible Agent определяется как браузер в данных траффика в случаях, когда реальные люди заходят через VPN, когда заходят боты или происходит ошибка в whois, к тому же, этих данных немного, поэтому их можно удалить

In [84]:
df_out_pkl = df_out_pkl.loc[~(df_out_pkl['device_browser'] == 'Mozilla Compatible Agent')]

FBAN - это встроенный в приложение Facebook браузер на iOS, поэтому его мы тоже можем определить

In [85]:
df_out_pkl.loc[((df_out_pkl['device_os'].isna()) & (df_out_pkl['device_browser'] == '[FBAN')), 'device_os']  = 'iOS'

Посмотрим, какие ещё строки у нас остались с незаполненным device_os

In [86]:
df_out_pkl.loc[(df_out_pkl['device_os'].isna()), 'device_browser'].unique()

array(['YaBrowser', 'Chrome', 'Firefox', 'Opera', '(not set)', 'Coc Coc',
       'Puffin', 'MRCHROME', 'UC Browser', 'SeaMonkey', 'Mozilla',
       'Maxthon', 'Konqueror'], dtype=object)

In [87]:
df_out_pkl.loc[(df_out_pkl['device_os'].isna()), 'device_browser'].describe()

count     210518
unique        13
top       Chrome
freq      147684
Name: device_browser, dtype: object

In [88]:
df_out_pkl.loc[(df_out_pkl['device_os'].isna()), 'device_category'].unique()

array(['desktop', 'mobile', 'tablet'], dtype=object)

In [89]:
df_out_pkl.loc[(df_out_pkl['device_os'].isna()), 'device_category'].describe()

count      210518
unique          3
top       desktop
freq       210239
Name: device_category, dtype: object

Рассмотрим категорию desktop

In [90]:
df_out_pkl.loc[(df_out_pkl['device_category'] == 'desktop'), 'device_os'].describe()

count      156262
unique          8
top       Windows
freq        98121
Name: device_os, dtype: object

In [91]:
df_out_pkl.loc[((df_out_pkl['device_category'] == 'desktop') & (df_out_pkl['device_os'].isna())), 'device_brand'].unique()

array(['', 'Xiaomi', 'Huawei', 'Samsung', 'Nokia', 'Asus', 'Beelink',
       'OPPO', 'Apple', '(not set)', 'OnePlus', 'Philips', 'Realme'],
      dtype=object)

Очевидно, что декстопный компьютер бренда Apple управляется ОС Macintosh

In [92]:
df_out_pkl.loc[((df_out_pkl['device_category'] == 'desktop') & (df_out_pkl['device_os'].isna()) & (df_out_pkl['device_brand'] == 'Apple')), 'device_os'] = 'Macintosh'

Взглянем на оставшиеся после этих действий бренды, категории и браузеры

In [93]:
df_out_pkl.loc[((df_out_pkl['device_category'] == 'desktop') & (df_out_pkl['device_os'].isna())), 'device_brand'].unique()

array(['', 'Xiaomi', 'Huawei', 'Samsung', 'Nokia', 'Asus', 'Beelink',
       'OPPO', '(not set)', 'OnePlus', 'Philips', 'Realme'], dtype=object)

In [94]:
df_out_pkl.loc[((df_out_pkl['device_category'] == 'desktop') & (df_out_pkl['device_os'].isna())), 'device_brand'].describe()

count     210236
unique        12
top             
freq      210195
Name: device_brand, dtype: object

In [95]:
df_out_pkl.loc[((df_out_pkl['device_brand'] == '') & (df_out_pkl['device_os'].isna())), 'device_browser'].unique()

array(['YaBrowser', 'Chrome', 'Firefox', 'Opera', '(not set)', 'Coc Coc',
       'Puffin', 'MRCHROME', 'UC Browser', 'SeaMonkey', 'Mozilla',
       'Maxthon', 'Konqueror'], dtype=object)

In [96]:
df_out_pkl.loc[((df_out_pkl['device_brand'] == '') & (df_out_pkl['device_os'].isna())), 'device_browser'].describe()

count     210407
unique        13
top       Chrome
freq      147585
Name: device_browser, dtype: object

In [97]:
df_out_pkl.loc[((df_out_pkl['device_brand'] == '(not set)') & (df_out_pkl['device_os'].isna())), 'device_browser'].unique()

array(['Chrome'], dtype=object)

In [98]:
df_out_pkl.loc[((df_out_pkl['device_brand'] == '(not set)') & (df_out_pkl['device_os'].isna())), 'device_category'].unique()

array(['desktop'], dtype=object)

Очевидно, что не определённые бренды в категории desktop - это обычные ПК, поэтому заполним их device_os самой часто встречаемой ОС

In [99]:
df_out_pkl.loc[((df_out_pkl['device_category'] == 'desktop') & ((df_out_pkl['device_brand'] == '') | (df_out_pkl['device_brand'] == '(not set)')) & (df_out_pkl['device_os'].isna()) & (df_out_pkl['device_browser'] != 'Safari')), 'device_os'] = top_device_os_d

In [100]:
df_out_pkl.loc[(df_out_pkl['device_os'].isna()), 'device_brand'].unique()

array(['China Phone', '', 'Flylion', 'Itoos', 'Walton', 'Xiaomi',
       'Xiaolajiao', 'Condor', 'Razer', 'Fujitsu', 'PPTV', 'Huawei',
       'Samsung', 'RCA', 'Cube', 'AT&T', 'Tonbux', 'Nokia', 'T-Mobile',
       'Smartfren', 'KingSing', 'Dragon Touch', 'Fero', 'Land Rover',
       'Tanix', 'Asus', 'Mlais', 'Beelink', 'Orbic', 'OPPO', 'Honeywell',
       'RED', 'Sonim', 'Jiayu', 'Leegoog', 'Star', 'Ellipsis', 'LTC',
       'Motive', 'OnePlus', 'M-HORSE', 'Winnovo', 'Nomi', 'Philips',
       'Wings Mobile', 'How', 'Maze', 'Realme', 'Tagital', 'Maxvi'],
      dtype=object)

In [101]:
df_out_pkl.loc[(df_out_pkl['device_os'].isna()), 'device_brand'].describe()

count     318
unique     50
top          
freq      212
Name: device_brand, dtype: object

In [102]:
df_out_pkl.loc[(df_out_pkl['device_os'].isna()), 'device_category'].unique()

array(['mobile', 'tablet', 'desktop'], dtype=object)

In [103]:
df_out_pkl.loc[(df_out_pkl['device_os'].isna()), 'device_category'].describe()

count        318
unique         3
top       mobile
freq         248
Name: device_category, dtype: object

In [104]:
df_out_pkl.loc[(df_out_pkl['device_os'].isna()), 'device_browser'].unique()

array(['Chrome', 'Opera', 'YaBrowser', 'Firefox', 'UC Browser'],
      dtype=object)

Оставшиеся мобильные устройства, чьи бренды известны, работают под управлением ОС семейства Android, поэтому заполняем их device_os следующим образом

In [105]:
df_out_pkl.loc[(((df_out_pkl['device_category'] == 'mobile') | (df_out_pkl['device_category'] == 'tablet')) & (df_out_pkl['device_brand'] != '') & (df_out_pkl['device_os'].isna())), 'device_os'] = top_device_os_m

In [106]:
df_out_pkl.loc[(df_out_pkl['device_os'].isna()), 'device_brand'].unique()

array(['', 'Xiaomi', 'Huawei', 'Samsung', 'Nokia', 'Asus', 'Beelink',
       'OPPO', 'OnePlus', 'Philips', 'Realme'], dtype=object)

In [107]:
df_out_pkl.loc[(df_out_pkl['device_os'].isna()), 'device_category'].unique()

array(['mobile', 'desktop', 'tablet'], dtype=object)

In [108]:
df_out_pkl.loc[(df_out_pkl['device_os'].isna()), 'device_category'].describe()

count        251
unique         3
top       mobile
freq         197
Name: device_category, dtype: object

In [109]:
df_out_pkl.loc[(df_out_pkl['device_os'].isna()) & (df_out_pkl['device_category'] == 'desktop')]

Unnamed: 0,session_id,client_id,visit_date,visit_time,visit_number,utm_source,utm_medium,utm_campaign,utm_adcontent,utm_keyword,device_category,device_os,device_brand,device_model,device_screen_resolution,device_browser,geo_country,geo_city
130636,1338874692298414890.1630870317.1630870317,311731056.163087,2021-09-05,22:00:00,1,kjsLglQLzykiRbcDiGcD,cpc,XHNUiSKKGTEpSAlaVMFQ,,puhZPIYqKXeFPaUviSjo,desktop,,Xiaomi,qBRdfuuhOnnqwSqNiPOv,360x760,Chrome,Russia,Novorossiysk
218862,173540233991179810.1632128548.1632128548,40405484.16321285,2021-09-20,12:00:00,1,kjsLglQLzykiRbcDiGcD,cpc,XHNUiSKKGTEpSAlaVMFQ,,puhZPIYqKXeFPaUviSjo,desktop,,Huawei,AuMdmADEIoPXiWpTsBEj,360x780,Chrome,Russia,Nizhny Novgorod
248442,1868400145061735811.1622240641.1622240641,435020808.16222405,2021-05-29,01:00:00,1,kjsLglQLzykiRbcDiGcD,cpc,,XKsYZiUFcdkUXQpoLKyS,puhZPIYqKXeFPaUviSjo,desktop,,Huawei,AuMdmADEIoPXiWpTsBEj,360x770,Chrome,Russia,Stavropol
264199,1939665880277002633.1624975755.1624975755,451613655.1624976,2021-06-29,17:00:00,1,kjsLglQLzykiRbcDiGcD,cpc,zPJpddwzkFqLMSYgtDqy,JNHcPlZPxEMWDnRiyoBf,puhZPIYqKXeFPaUviSjo,desktop,,Samsung,qBRdfuuhOnnqwSqNiPOv,360x760,YaBrowser,Russia,Moscow
264201,1939665880277002633.1624975821.1624975821,451613655.1624976,2021-06-29,17:00:00,3,kjsLglQLzykiRbcDiGcD,cpc,zPJpddwzkFqLMSYgtDqy,JNHcPlZPxEMWDnRiyoBf,puhZPIYqKXeFPaUviSjo,desktop,,Samsung,qBRdfuuhOnnqwSqNiPOv,360x760,YaBrowser,Russia,Moscow
318022,2179795865415458772.1626611671.1626611671,507523274.1626612,2021-07-18,15:00:00,1,kjsLglQLzykiRbcDiGcD,cpc,XHNUiSKKGTEpSAlaVMFQ,,puhZPIYqKXeFPaUviSjo,desktop,,Xiaomi,qBRdfuuhOnnqwSqNiPOv,360x720,Opera,Russia,Saratov
332623,224549194041055047.1634391882.1634391882,52281933.16343919,2021-10-16,16:00:00,1,kjsLglQLzykiRbcDiGcD,cpc,RoDitORHdzGfGhNCyEMy,,puhZPIYqKXeFPaUviSjo,desktop,,Nokia,qBRdfuuhOnnqwSqNiPOv,412x915,Chrome,Russia,Moscow
404371,2564801024506496802.1631881004.1631881004,597164273.1631881,2021-09-17,15:00:00,1,kjsLglQLzykiRbcDiGcD,cpc,,JNHcPlZPxEMWDnRiyoBf,puhZPIYqKXeFPaUviSjo,desktop,,Huawei,AuMdmADEIoPXiWpTsBEj,360x592,Chrome,Russia,Saint Petersburg
449802,2763246423595009915.1631569785.1631569785,643368443.163157,2021-09-14,00:00:00,1,kjsLglQLzykiRbcDiGcD,cpc,agnCWMgbwJZgTVVsuCLg,eOWmIGTKVDPewucDtZXG,puhZPIYqKXeFPaUviSjo,desktop,,Huawei,AuMdmADEIoPXiWpTsBEj,534x854,Chrome,Russia,Moscow
524383,309992255509729444.1623691510.1623691510,72175696.16236913,2021-06-14,20:00:00,1,kjsLglQLzykiRbcDiGcD,cpc,JwYIveaHVpeeRZloQCfF,,puhZPIYqKXeFPaUviSjo,desktop,,Asus,AuMdmADEIoPXiWpTsBEj,360x720,Chrome,Russia,Moscow


Оставшиеся устройства - планшеты, смартфоны и ноутбуки. Не имея иных сведений, положим им самые часто встречаемые значения для этих категорий

In [110]:
df_out_pkl.loc[(((df_out_pkl['device_category'] == 'mobile') | (df_out_pkl['device_category'] == 'tablet')) & (df_out_pkl['device_os'].isna())), 'device_os'] = top_device_os_m

In [111]:
df_out_pkl.loc[((df_out_pkl['device_category'] == 'desktop') & (df_out_pkl['device_os'].isna())), 'device_os'] = top_device_os_d

In [112]:
print_missing_values(df_out_pkl)

Процент пропущенных значений:
utm_adcontent               18.046018
utm_campaign                11.807790
session_id                   0.000000
device_category              0.000000
geo_country                  0.000000
device_browser               0.000000
device_screen_resolution     0.000000
device_model                 0.000000
device_brand                 0.000000
device_os                    0.000000
utm_keyword                  0.000000
client_id                    0.000000
utm_medium                   0.000000
utm_source                   0.000000
visit_number                 0.000000
visit_time                   0.000000
visit_date                   0.000000
geo_city                     0.000000
dtype: float64


##### Обработка значений utm_adcontent и utm_campaign

In [113]:
print('Количество пропущенных значений:', df_out_pkl['utm_adcontent'].isna().sum())

Количество пропущенных значений: 335598


In [114]:
df_out_pkl['utm_adcontent'].describe()

count                  1524081
unique                     286
top       JNHcPlZPxEMWDnRiyoBf
freq                   1006266
Name: utm_adcontent, dtype: object

Заполним utm_adcontent и utm_campaign на основе значений utm_medium, поскольку он заполнен без NaN

In [115]:
tmp_df = df_out_pkl.copy()
tmp_df = change_nans(tmp_df, 'utm_medium', 'utm_adcontent')

In [116]:
tmp_df.loc[(tmp_df['utm_adcontent'].isna())]

Unnamed: 0,session_id,client_id,visit_date,visit_time,visit_number,utm_source,utm_medium,utm_campaign,utm_adcontent,utm_keyword,device_category,device_os,device_brand,device_model,device_screen_resolution,device_browser,geo_country,geo_city
294668,2076063222578022290.1637741232.1637741232,483371136.16276544,2021-11-24,11:07:12,188,ZpYIoDJMcFzVoPFsHGJL,CPM,IKQsApKuPmZqqmhieEgf,,vuriSCpYEnMEbLACpaMZ,desktop,Macintosh,Apple,qBRdfuuhOnnqwSqNiPOv,1920x1080,Chrome,Russia,Moscow
864318,461530448458434508.1637254746.1637254746,107458431.1633962,2021-11-18,19:59:06,29,ZpYIoDJMcFzVoPFsHGJL,CPM,IKQsApKuPmZqqmhieEgf,,vuriSCpYEnMEbLACpaMZ,desktop,Macintosh,Apple,qBRdfuuhOnnqwSqNiPOv,1440x900,Chrome,Russia,Moscow
864320,461530448458434508.1637317682.1637317682,107458431.1633962,2021-11-19,13:28:02,31,ZpYIoDJMcFzVoPFsHGJL,CPM,IKQsApKuPmZqqmhieEgf,,vuriSCpYEnMEbLACpaMZ,desktop,Macintosh,Apple,qBRdfuuhOnnqwSqNiPOv,1440x900,Chrome,Russia,Moscow
864322,461530448458434508.1637651607.1637651607,107458431.1633962,2021-11-23,10:13:27,33,ZpYIoDJMcFzVoPFsHGJL,CPM,IKQsApKuPmZqqmhieEgf,,vuriSCpYEnMEbLACpaMZ,desktop,Macintosh,Apple,qBRdfuuhOnnqwSqNiPOv,1440x900,Chrome,Russia,Moscow
864325,461530448458434508.1637838071.1637838071,107458431.1633962,2021-11-25,14:01:11,39,ZpYIoDJMcFzVoPFsHGJL,CPM,IKQsApKuPmZqqmhieEgf,,vuriSCpYEnMEbLACpaMZ,desktop,Macintosh,Apple,qBRdfuuhOnnqwSqNiPOv,1440x900,Chrome,Russia,Moscow
864326,461530448458434508.1637849095.1637849095,107458431.1633962,2021-11-25,17:04:55,40,ZpYIoDJMcFzVoPFsHGJL,CPM,IKQsApKuPmZqqmhieEgf,,vuriSCpYEnMEbLACpaMZ,desktop,Macintosh,Apple,qBRdfuuhOnnqwSqNiPOv,1440x900,Chrome,Russia,Moscow
864329,461530448458434508.1638180311.1638180311,107458431.1633962,2021-11-29,13:05:11,45,ZpYIoDJMcFzVoPFsHGJL,CPM,IKQsApKuPmZqqmhieEgf,,vuriSCpYEnMEbLACpaMZ,desktop,Macintosh,Apple,qBRdfuuhOnnqwSqNiPOv,1440x900,Chrome,Russia,Moscow
1092638,5632358102873646550.1640590946.1640590946,1311385562.1637065,2021-12-27,10:42:26,47,ZpYIoDJMcFzVoPFsHGJL,CPM,IKQsApKuPmZqqmhieEgf,,vuriSCpYEnMEbLACpaMZ,desktop,Macintosh,Apple,qBRdfuuhOnnqwSqNiPOv,1440x900,Chrome,Russia,Moscow
1092641,5632358102873646550.1640778685.1640778685,1311385562.1637065,2021-12-29,14:51:25,56,ZpYIoDJMcFzVoPFsHGJL,CPM,IKQsApKuPmZqqmhieEgf,,vuriSCpYEnMEbLACpaMZ,desktop,Macintosh,Apple,qBRdfuuhOnnqwSqNiPOv,1440x900,Chrome,Russia,Moscow
1326793,6675031872911888372.1635154650.1635154650,1554151967.1632817,2021-10-25,12:37:30,17,ZpYIoDJMcFzVoPFsHGJL,CPM,IKQsApKuPmZqqmhieEgf,,vuriSCpYEnMEbLACpaMZ,desktop,Macintosh,Apple,qBRdfuuhOnnqwSqNiPOv,1792x1120,Chrome,Russia,Moscow


In [117]:
tmp_df = change_nans(tmp_df, 'utm_medium', 'utm_campaign')

In [118]:
top_uac = df_out_pkl.loc[df_out_pkl['utm_campaign'] == 'IKQsApKuPmZqqmhieEgf']['utm_adcontent'].describe()['top']
top_uac

'NNFDaOyxNbRfjYvClLnM'

In [119]:
tmp_df['utm_adcontent'] = tmp_df['utm_adcontent'].fillna(top_uac)

In [120]:
print_missing_values(tmp_df)

Процент пропущенных значений:
utm_campaign                0.106739
session_id                  0.000000
device_category             0.000000
geo_country                 0.000000
device_browser              0.000000
device_screen_resolution    0.000000
device_model                0.000000
device_brand                0.000000
device_os                   0.000000
utm_keyword                 0.000000
client_id                   0.000000
utm_adcontent               0.000000
utm_medium                  0.000000
utm_source                  0.000000
visit_number                0.000000
visit_time                  0.000000
visit_date                  0.000000
geo_city                    0.000000
dtype: float64


In [121]:
tmp_df.loc[(tmp_df['utm_campaign'].isna())]

Unnamed: 0,session_id,client_id,visit_date,visit_time,visit_number,utm_source,utm_medium,utm_campaign,utm_adcontent,utm_keyword,device_category,device_os,device_brand,device_model,device_screen_resolution,device_browser,geo_country,geo_city
66,905565998839028208.1636951969.1636951969,210843514.1635310064,2021-11-15,07:52:49,2,ISrKoXQCxqqYvAZICvjs,smm,,JNHcPlZPxEMWDnRiyoBf,puhZPIYqKXeFPaUviSjo,mobile,Android,OnePlus,AshpvdJdReafUzEMmWGr,384x832,Android Webview,Russia,Saint Petersburg
1375,9061695622495520184.1636746681.1636746681,2109840424.1636746680,2021-11-12,22:51:21,1,ISrKoXQCxqqYvAZICvjs,smm,,JNHcPlZPxEMWDnRiyoBf,puhZPIYqKXeFPaUviSjo,mobile,Android,Xiaomi,qBRdfuuhOnnqwSqNiPOv,393x818,Android Webview,Russia,Rostov-on-Don
1540,9062530405223407672.1636081721.1636081721,2110034787.1636081720,2021-11-05,06:08:41,1,ISrKoXQCxqqYvAZICvjs,smm,,JNHcPlZPxEMWDnRiyoBf,puhZPIYqKXeFPaUviSjo,mobile,Android,Samsung,qBRdfuuhOnnqwSqNiPOv,360x640,Android Webview,Russia,Moscow
3383,9070450312033055834.1636807769.1636807769,2111878784.1636807770,2021-11-13,15:49:29,1,ISrKoXQCxqqYvAZICvjs,smm,,JNHcPlZPxEMWDnRiyoBf,puhZPIYqKXeFPaUviSjo,mobile,iOS,Apple,qBRdfuuhOnnqwSqNiPOv,390x844,Safari (in-app),Russia,Moscow
3866,9072568284960109851.1636152602.1636152602,2112371913.1636152603,2021-11-06,01:50:02,1,ISrKoXQCxqqYvAZICvjs,smm,,JNHcPlZPxEMWDnRiyoBf,puhZPIYqKXeFPaUviSjo,mobile,iOS,Apple,qBRdfuuhOnnqwSqNiPOv,414x896,Safari (in-app),Russia,Dolgoprudny
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1856295,9039146172313218893.1636806480.1636806480,2104590221.1636806477,2021-11-13,15:28:00,1,ISrKoXQCxqqYvAZICvjs,smm,,JNHcPlZPxEMWDnRiyoBf,puhZPIYqKXeFPaUviSjo,mobile,Android,Xiaomi,qBRdfuuhOnnqwSqNiPOv,393x851,Android Webview,Russia,Rostov-on-Don
1856495,9040123311732516722.1636525939.1636525939,2104817729.1636525938,2021-11-10,09:32:19,1,iNFgfQPqHPBuvGCYtrQE,smm,,JNHcPlZPxEMWDnRiyoBf,puhZPIYqKXeFPaUviSjo,mobile,iOS,Apple,qBRdfuuhOnnqwSqNiPOv,320x568,Safari,Russia,Nizhny Novgorod
1858858,9050343250838039348.1636707124.1636707124,2107197244.1636707124,2021-11-12,11:52:04,1,ISrKoXQCxqqYvAZICvjs,smm,,JNHcPlZPxEMWDnRiyoBf,puhZPIYqKXeFPaUviSjo,tablet,iOS,Apple,qBRdfuuhOnnqwSqNiPOv,375x667,Safari (in-app),Russia,Moscow
1859720,9054155068672225828.1635955238.1635955238,2108084752.1635955236,2021-11-03,19:00:38,1,QxAxdyPLuQMEcrdZWdWb,smm,,JNHcPlZPxEMWDnRiyoBf,puhZPIYqKXeFPaUviSjo,mobile,iOS,Apple,qBRdfuuhOnnqwSqNiPOv,2000x2000,Safari,Sweden,Lulea


In [122]:
tmp_df = change_nans(tmp_df, 'utm_source', 'utm_campaign')

In [123]:
df_out_pkl = tmp_df.copy()

In [124]:
print_missing_values(df_out_pkl)

Процент пропущенных значений:
session_id                  0.0
client_id                   0.0
geo_country                 0.0
device_browser              0.0
device_screen_resolution    0.0
device_model                0.0
device_brand                0.0
device_os                   0.0
device_category             0.0
utm_keyword                 0.0
utm_adcontent               0.0
utm_campaign                0.0
utm_medium                  0.0
utm_source                  0.0
visit_number                0.0
visit_time                  0.0
visit_date                  0.0
geo_city                    0.0
dtype: float64


#### Исследование и обработка данных