In [1]:
import pandas as pd
import numpy as np
import os
import glob
import matplotlib.pyplot as plt
from matplotlib import rc
import seaborn as sns
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from scipy import stats
from wordcloud import WordCloud
import warnings
warnings.filterwarnings('ignore') # Silence the noisy library warnings in cell outputs.
%matplotlib inline

*La explicación sobre el origen de los datos y sus características se encuentra en el documento Word 'Tesis'.*

Lectura del dataset a analizar, generado en el notebook "Datos".   
Se cargan todos los labels (género: 2 y 3; edad: 4 a 9), ya que la edad se usará como feature.

In [11]:
# Columns read from the raw export; anything else in the file is skipped.
colnames = ['device_idx', 'label', 'country', 'feature_type', 'feature_detail']

# Load, keep only the Argentina ('AR') rows and drop rows without a label.
# Every label is kept on purpose (not just 2 and 3): the age labels are used
# later as a feature.
df_demo = (
    pd.read_csv('df_dev_label_coun_feature.to_csv', usecols=colnames)
    .loc[lambda frame: frame['country'].eq('AR')]
    .dropna(subset=['label'])
)

print('Dataset Size:', df_demo.shape)
print('Labels:', df_demo['label'].unique())
df_demo.tail()

Dataset Size: (4911638, 5)
Labels: [5. 2. 6. 9. 3. 7. 4. 8.]


Unnamed: 0,device_idx,label,country,feature_type,feature_detail
4912523,101862.0,2.0,AR,dom,minutouno.com
4912524,101862.0,2.0,AR,url,minutouno.com/notas/5157926-la-emocion-los-tri...
4912525,101862.0,2.0,AR,dom,creditosparatodos.org
4912526,101862.0,2.0,AR,url,creditosparatodos.org/gracias.php
4912527,101862.0,2.0,AR,ov,Android 9


In [12]:
# Rows whose label is one of the two gender codes (2 or 3).
tmp_gender = df_demo[df_demo['label'].isin([2, 3])]
tmp_gender.shape

(2456253, 5)

In [14]:
# Rows whose label is one of the codes 4 through 9 (used below as the age).
tmp_age = df_demo[df_demo['label'].isin([4, 5, 6, 7, 8, 9])]
tmp_age.shape

(2455385, 5)

In [22]:
# To get the age next to the gender, inner-join both subsets on the device and
# feature columns. The gender label keeps the name 'label'; the label coming
# from tmp_age is renamed to 'age'.
df_demo = (
    tmp_gender
    .merge(
        tmp_age,
        how='inner',
        on=['device_idx', 'country', 'feature_type', 'feature_detail'],
    )
    .rename(columns={'label_x': 'label', 'label_y': 'age'})
)
# Renaming columns does not change the shape, so this is the merged shape.
print(df_demo.shape)
df_demo.head()

(2455374, 6)


Unnamed: 0,device_idx,label,country,feature_type,feature_detail,age
0,392.0,2.0,AR,db,XiaoMi,6.0
1,392.0,2.0,AR,url,bumeran.com.ar/empleos-busqueda-legales.html,6.0
2,392.0,2.0,AR,bf,Chrome,6.0
3,392.0,2.0,AR,ov,Android 9,6.0
4,392.0,2.0,AR,url,bumeran.com.ar/empleos/analista-de-legales-pas...,6.0


### DATA CLEANSING

Label género:   
    hombre--> 2   
    mujer--> 3

In [23]:
# Distinct values left in the 'label' column after the merge.
pd.unique(df_demo['label'])

array([2., 3.])

Verifico que cada dispositivo tenga un solo label de género. Si algún dispositivo tuviera más de uno, habría que eliminarlo; como el resultado es siempre 1, no hace falta descartar ninguno.

In [24]:
# Count the distinct labels of each device and show which counts occur;
# a result of [1] means no device carries more than one label.
# 'label' is selected before nunique() so that distinct values are not also
# counted for every other column (including the long string column).
df_demo.groupby('device_idx')['label'].nunique().unique()

array([1], dtype=int64)

Descarto los tipos de features que no aportan información.

In [25]:
# List the feature types currently present in the dataset.
print('Actualmente tengo estos tipos de features: ')
pd.unique(df_demo['feature_type'])

Actualmente tengo estos tipos de features: 


array(['db', 'url', 'bf', 'ov', 'im', 'of', 'dom', 'df', 'oa', 'ip',
       'utm', 'it'], dtype=object)

Para la data que proviene de UA, los tipos de features son:

* of = OS (sistema operativo)
* bf = browser (navegador)
* df = modelo del celular 
* ov = OS version del sistema operativo
* ip = is pc: el feature value vale siempre 1
* db = device brand, marca del dispositivo
* im = is mobile: el feature value vale siempre 1
* it = is tablet: el feature value vale siempre 1 

Los tipos de features it, ip, im son indicatrices

In [26]:
# Show the distinct values of each indicator-style feature type
# (is tablet, is pc, is mobile), one printed array per type.
for flag_type in ('it', 'ip', 'im'):
    print(df_demo.loc[df_demo['feature_type'] == flag_type, 'feature_detail'].unique())

['1']
['1']
['1']


El feature 'utm' no nos dice nada.

In [27]:
# Distinct values taken by the 'utm' feature type.
df_demo.loc[df_demo['feature_type'] == 'utm', 'feature_detail'].unique()

array(['mail'], dtype=object)

In [28]:
# Distinct values taken by the 'oa' feature type.
df_demo.loc[df_demo['feature_type'] == 'oa', 'feature_detail'].unique()

array(['old', 'new', 'ancient'], dtype=object)

In [34]:
# Number of rows per feature type, most frequent first.
df_demo['feature_type'].value_counts()

url    1576377
dom     381200
of       88742
bf       88592
oa       85684
ov       85306
ip       50091
im       38216
db       29607
df       21150
utm       9756
it         653
Name: feature_type, dtype: int64

In [36]:
# Drop the 'utm' rows, then list the feature types that remain.
df_demo = df_demo.loc[df_demo['feature_type'].ne('utm')]
print(df_demo['feature_type'].unique())

['db' 'url' 'bf' 'ov' 'im' 'of' 'dom' 'df' 'oa' 'ip' 'it']


El feature type 'df' (modelo de dispositivo) tiene un valor que no tiene sentido--> 'wv'

In [37]:
# Remove the meaningless 'wv' value, then list the remaining device models.
# NOTE(review): the filter matches feature_detail == 'wv' for EVERY feature
# type, not only 'df' -- confirm that no other type legitimately uses 'wv'.
df_demo = df_demo.loc[df_demo['feature_detail'].ne('wv')]
df_demo.loc[df_demo['feature_type'] == 'df', 'feature_detail'].unique()

array(['XiaoMi Redmi Note 8', 'moto e5 play', 'Samsung SM-J710MN',
       'Samsung SM-G532M', 'iPhone', 'Samsung SM-A105M',
       'Samsung SM-J701M', 'Samsung SM-A505G', 'Samsung SM-G9600',
       'Samsung SM-A520F', 'Samsung SM-A307G', 'Samsung SM-G9650',
       'Samsung SM-A515F', 'Samsung SM-A705MN', 'Samsung SM-G610M',
       'Moto G (5S', 'Samsung SM-J400M', 'Samsung SM-A205G',
       'Samsung SM-G570M', 'Samsung SM-G975F', 'LG-M250',
       'Samsung SM-J600G', 'Samsung SM-A217M', 'Samsung SM-A715F',
       'Samsung SM-A305G', 'Samsung SM-A015M', 'XiaoMi Redmi Note 7',
       'Moto G (5', 'Samsung SM-A107M', 'Samsung SM-J700M',
       'Samsung SM-J415G', 'moto e5', 'Samsung SM-J260M', 'Moto C',
       'Samsung SM-A315G', 'Samsung SM-G955F', 'Samsung SM-J610G', 'Mac',
       'Samsung SM-A207M', 'Samsung SM-G950F', 'Samsung SM-A115M',
       'Moto E (4', 'Samsung SM-G935F'], dtype=object)

Reviso valores de cada tipo de feature.

In [38]:
# Print the distinct feature_detail values of each feature type under a
# readable heading. The (heading, feature_type) pairs are listed in display
# order. The original cell printed the 'url' rows under the "Dominios" heading
# and the 'dom' rows under the "Urls" heading; the pairing is corrected here.
headings_by_type = (
    ('Modelos de dispositivos', 'df'),
    ('Marcas de dispositivos', 'db'),
    ('Navegadores de dispositivos', 'bf'),
    ('Sistemas Operativos de dispositivos', 'of'),
    ('Versiones de Sistema Operativo de dispositivos', 'ov'),
    ('Antiguedad de dispositivo', 'oa'),
    ('Dominios por los que navegan dispositivos', 'dom'),
    ('Urls por los que navegan dispositivos', 'url'),
)
for heading, type_code in headings_by_type:
    values = df_demo.loc[df_demo['feature_type'] == type_code, 'feature_detail'].unique()
    print(heading + ': \n', values)

Modelos de dispositivos: 
 ['XiaoMi Redmi Note 8' 'moto e5 play' 'Samsung SM-J710MN'
 'Samsung SM-G532M' 'iPhone' 'Samsung SM-A105M' 'Samsung SM-J701M'
 'Samsung SM-A505G' 'Samsung SM-G9600' 'Samsung SM-A520F'
 'Samsung SM-A307G' 'Samsung SM-G9650' 'Samsung SM-A515F'
 'Samsung SM-A705MN' 'Samsung SM-G610M' 'Moto G (5S' 'Samsung SM-J400M'
 'Samsung SM-A205G' 'Samsung SM-G570M' 'Samsung SM-G975F' 'LG-M250'
 'Samsung SM-J600G' 'Samsung SM-A217M' 'Samsung SM-A715F'
 'Samsung SM-A305G' 'Samsung SM-A015M' 'XiaoMi Redmi Note 7' 'Moto G (5'
 'Samsung SM-A107M' 'Samsung SM-J700M' 'Samsung SM-J415G' 'moto e5'
 'Samsung SM-J260M' 'Moto C' 'Samsung SM-A315G' 'Samsung SM-G955F'
 'Samsung SM-J610G' 'Mac' 'Samsung SM-A207M' 'Samsung SM-G950F'
 'Samsung SM-A115M' 'Moto E (4' 'Samsung SM-G935F']
Marcas de dispositivos: 
 ['XiaoMi' 'Huawei' 'Samsung' 'Apple' 'LG' 'Motorola']
Navegadores de dispositivos: 
 ['Chrome' 'Edge' 'SocialApp' 'Google' 'Safari' 'Brand_browser' 'Firefox'
 'IE']
Sistemas Operativos

Características del dataset.

In [43]:
# Summary of the cleaned dataset: row count, shape, and number of distinct
# devices, feature types and feature values. nunique(dropna=False) counts a
# missing value as one distinct entry, exactly like len(series.unique()).
print(len(df_demo), 'filas para Argentina')
print('Shape: \n', df_demo.shape)
print('Cantidad de dispositivos únicos: \n', df_demo['device_idx'].nunique(dropna=False))
print(
    'Cantidad de tipo de features únicos: \n',
    df_demo['feature_type'].nunique(dropna=False),
    'Siendo:',
    df_demo['feature_type'].unique(),
)
print('Cantidad de features únicos: \n', df_demo['feature_detail'].nunique(dropna=False))
df_demo.head(3)

2445214 filas para Argentina
Shape: 
 (2445214, 6)
Cantidad de dispositivos únicos: 
 99693
Cantidad de tipo de features únicos: 
 11 Siendo: ['db' 'url' 'bf' 'ov' 'im' 'of' 'dom' 'df' 'oa' 'ip' 'it']
Cantidad de features únicos: 
 31976


Unnamed: 0,device_idx,label,country,feature_type,feature_detail,age
0,392.0,2.0,AR,db,XiaoMi,6.0
1,392.0,2.0,AR,url,bumeran.com.ar/empleos-busqueda-legales.html,6.0
2,392.0,2.0,AR,bf,Chrome,6.0


Guardo Dataset Género.

In [40]:
# Persist the cleaned dataset. The row index is written too (pandas default),
# so the file's first column is the unnamed index.
df_demo.to_csv('df_demografico_ar.csv', index=True)