In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Location of the parquet dataset read below.
# NOTE: despite the `_DIR` suffix, this is the path of a single file, not a directory.
DATA_DIR = "data/gender.parquet"

In [3]:
# Load the dataset from DATA_DIR with the pyarrow engine, then preview the first rows.
df = pd.read_parquet(DATA_DIR, engine="pyarrow")
df.head()

Unnamed: 0,queries,apps,games,gender,birth_year
0,"[216, 359, 12329, 3, 45, 4002, 2066, 32, 3931,...","[4, 4, 25, 7, 30, 58, 16, 19, 17, 21, 10, 10, ...","[9151, 208]",M,1366.0
1,[],"[129, 71, 9, 8, 11, 25, 18, 58, 6, 16, 125, 12...","[460, 4939, 14, 232, 6387, 1758, 5834, 3, 2]",F,1359.0
2,"[23463, 18831]","[9, 174, 65, 8, 63, 97, 62, 103, 61, 116, 59, ...","[448, 723, 267, 9064, 10634, 166, 782, 224, 27...",M,1373.0
3,"[1634, 3609, 654]","[99, 73, 9, 8, 59, 37, 131, 3, 89, 6, 24, 16, ...","[78, 2607, 478, 435, 9, 192]",M,0.0
4,"[11064, 227, 623, 1301, 43999, 35411, 2492, 11...","[9, 8, 17, 54, 3, 25, 22, 6, 21, 7, 14, 5, 817...","[1702, 1, 53]",M,1364.0


In [4]:
# Preview the last rows of the frame.
df.tail()

Unnamed: 0,queries,apps,games,gender,birth_year
40266,"[13388, 10571, 122, 1961, 42946, 823, 3349, 10...","[9, 8, 17, 3, 25, 22, 6, 24, 12, 7, 14, 5, 4, ...","[3, 55, 1115, 135, 410, 38, 1426, 107, 374]",M,1394.0
40267,"[2655, 11, 1732, 2847, 15222, 884, 39, 1433, 2...","[8, 11, 10, 10, 39, 17, 771, 48, 3, 25, 95, 22...",[717],M,1354.0
40268,"[9, 33200, 5028, 357, 4, 233, 262, 2180, 376, ...","[54, 9, 8, 10, 10, 39, 17, 48, 3, 25, 6, 21, 1...","[312, 22]",M,1364.0
40269,"[9, 276, 27, 1074]","[9, 8, 17, 48, 54, 3, 25, 22, 6, 21, 7, 45, 14...","[73, 2, 53, 75]",M,1365.0
40270,"[6421, 11377, 6980, 852, 31, 185, 2348, 534, 4...","[43, 8, 10, 10, 39, 17, 48, 54, 3, 25, 22, 6, ...","[355, 278, 185]",M,1390.0


In [17]:
# (number of rows, number of columns)
df.shape

(40271, 5)

In [18]:
# Column dtypes and non-null counts, used to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40271 entries, 0 to 40270
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   queries     40271 non-null  object 
 1   apps        40271 non-null  object 
 2   games       40271 non-null  object 
 3   gender      40271 non-null  object 
 4   birth_year  40271 non-null  float64
dtypes: float64(1), object(4)
memory usage: 1.5+ MB


In [5]:
# Store birth years as integers (the values shown above are whole numbers stored as float64).
# A signed, fixed-width dtype is used deliberately: `np.uint` has a platform-dependent
# width, and unsigned arithmetic (e.g. `reference_year - birth_year`) silently wraps
# around instead of producing a negative number.
df["birth_year"] = df["birth_year"].astype(np.int64)

There are no `NaN` values here, but as we can see, `birth_year` contains `0`, which is not a valid year. So we should replace these zeros with new values.

In addition to `birth_year`, we should make the `gender` column numeric, even though it is our target column.

In [7]:
# Average birth year computed only over rows whose year is non-zero
# (0 appears in the data where a real year would be expected), rounded to a whole year.
valid_birth_year = df.loc[df["birth_year"].ne(0), "birth_year"]
mean_birth_year = round(valid_birth_year.mean())

In [8]:
# Impute the placeholder birth year 0 with the rounded mean of the non-zero years.
df["birth_year"] = df["birth_year"].replace(0, mean_birth_year)

# Encode the target column as integers: M -> 0, F -> 1.
# An explicit mapping + cast is used instead of `replace`, because `replace` on an
# object column relies on silent downcasting (deprecated in pandas 2.2); without the
# downcast the column would stay `object` and drop out of `df.describe()`.
# Values that are not keys (e.g. already-encoded 0/1 when the cell is re-run) pass
# through unchanged, and any unexpected label makes the cast fail loudly.
gender_codes = {'M': 0, 'F': 1}
df["gender"] = df["gender"].map(lambda label: gender_codes.get(label, label)).astype("int64")
df.head()

Unnamed: 0,queries,apps,games,gender,birth_year
0,"[216, 359, 12329, 3, 45, 4002, 2066, 32, 3931,...","[4, 4, 25, 7, 30, 58, 16, 19, 17, 21, 10, 10, ...","[9151, 208]",0,1366
1,[],"[129, 71, 9, 8, 11, 25, 18, 58, 6, 16, 125, 12...","[460, 4939, 14, 232, 6387, 1758, 5834, 3, 2]",1,1359
2,"[23463, 18831]","[9, 174, 65, 8, 63, 97, 62, 103, 61, 116, 59, ...","[448, 723, 267, 9064, 10634, 166, 782, 224, 27...",0,1373
3,"[1634, 3609, 654]","[99, 73, 9, 8, 59, 37, 131, 3, 89, 6, 24, 16, ...","[78, 2607, 478, 435, 9, 192]",0,1371
4,"[11064, 227, 623, 1301, 43999, 35411, 2492, 11...","[9, 8, 17, 54, 3, 25, 22, 6, 21, 7, 14, 5, 817...","[1702, 1, 53]",0,1364


In [9]:
# Summary statistics of the numeric columns (gender and birth_year after the conversion above).
df.describe()

Unnamed: 0,gender,birth_year
count,40271.0,40271.0
mean,0.257009,1370.967495
std,0.43699,11.70662
min,0.0,1300.0
25%,0.0,1364.0
50%,0.0,1371.0
75%,1.0,1380.0
max,1.0,1398.0


## Dealing with list values

As is clear here, three of our main columns — `queries`, `apps`, and `games` — contain lists, so we need to deal with them.

Actually, if you look closely, you will find that lists are everywhere! Here are some practical problems, where you will probably encounter list values.

* Audio-video tags
* Open-ended questions in survey data
* List of all authors, artists, producers, etc. involved in a creative product

### What is wrong with list values?
List values mess up everything you know about data analysis. Even the simplest operations cannot be performed without endless looping.

An example of this is here:

![image.png](attachment:image.png)

![image-2.png](attachment:image-2.png)

The reason this does not work is that Pandas does not have direct access to every individual element of the lists. Thus, Pandas is unable to apply functions like value_counts() properly. 

In [14]:
def to_1D(series):
    """Flatten a series of list-like cells into one flat ``pd.Series``.

    Parameters
    ----------
    series : iterable of iterables
        Typically a DataFrame column in which every cell holds a list.

    Returns
    -------
    pd.Series
        All elements of all cells, concatenated in row order, with a fresh
        default integer index.
    """
    flattened = []
    for cell in series:
        # Each cell contributes its elements in their original order.
        flattened.extend(cell)
    return pd.Series(flattened)

In [15]:
# Frequency of each individual query id across all rows.
to_1D(df['queries']).value_counts()

2        8229
3        8174
4        7870
1        7816
5        6605
         ... 
43591       1
49796       1
36214       1
37550       1
41955       1
Length: 47287, dtype: int64

In [16]:
# Frequency of each individual app id across all rows.
to_1D(df['apps']).value_counts()

4        41154
1        40271
2        40194
3        38390
10       37980
         ...  
22089        1
19999        1
16220        1
12370        1
30151        1
Length: 36598, dtype: int64

In [17]:
# Frequency of each individual game id across all rows.
to_1D(df['games']).value_counts()

2        5534
1        3662
9        2324
4        2131
16       2045
         ... 
3942        1
30718       1
8524        1
8361        1
14970       1
Length: 15414, dtype: int64

In [None]:
# Bar chart of the most frequent query ids.
# The original cell read a non-existent `favorite_fruits` column (KeyError) and
# carried an unrelated title; both axes now come from the same `queries` counts.
TOP_N = 20  # there are tens of thousands of distinct query ids; plot only the head

# Flatten once and reuse; value_counts() is sorted by descending frequency.
query_counts = to_1D(df["queries"]).value_counts().head(TOP_N)

fig, ax = plt.subplots(figsize=(14, 4))
# Ids are labels, not magnitudes: cast to str so the bars are evenly spaced
# instead of being positioned at their numeric id on the x axis.
ax.bar(query_counts.index.astype(str), query_counts.values)
ax.set_xlabel("Query id", size=12)
ax.set_ylabel("Frequency", size=12)
ax.set_title(f"Top {TOP_N} most frequent queries", size=14);