[Reference](https://medium.com/nerd-for-tech/5-python-pandas-skills-for-eda-the-most-googled-questions-ive-ever-made-61b6a684d879)

# Filling/dropping missing values(or duplicated values)

## 1. Filling missing values depend on the type of data AT ONCE.

In [5]:
import pandas as pd
import numpy as np

In [6]:
dict_ = {'name': ['a','b','c','d','e'],
         'class': ['foo','foo','bar','bar','foo'],
         'isAvailable': pd.Series([True, True, False, np.nan, False], dtype='boolean'),
         'mid_term':['A','B',np.nan,'C','C'],
         'quiz':[1, np.nan, np.nan, 0.5, np.nan],
         'final_term':[np.nan, 'C', 'C', np.nan, 'A']}

df = pd.DataFrame(dict_)
df

Unnamed: 0,name,class,isAvailable,mid_term,quiz,final_term
0,a,foo,True,A,1.0,
1,b,foo,True,B,,C
2,c,bar,False,,,C
3,d,bar,,C,0.5,
4,e,foo,False,C,,A


In [8]:
df.apply(lambda x: x.fillna(0) if x.dtype.kind in 'biufc' else x.fillna(""))

Unnamed: 0,name,class,isAvailable,mid_term,quiz,final_term
0,a,foo,True,A,1.0,
1,b,foo,True,B,0.0,C
2,c,bar,False,,0.0,C
3,d,bar,False,C,0.5,
4,e,foo,False,C,0.0,A


## 2. Dropping duplicated values and keeping a part

In [10]:
date = pd.date_range('2021-04-01', periods=5, freq='D')
df['date'] = date

In [11]:
df.drop_duplicates('class',keep='first')

Unnamed: 0,name,class,isAvailable,mid_term,quiz,final_term,date
0,a,foo,True,A,1.0,,2021-04-01
2,c,bar,False,,,C,2021-04-03


In [13]:
df.drop_duplicates('class', keep='last')

Unnamed: 0,name,class,isAvailable,mid_term,quiz,final_term,date
3,d,bar,,C,0.5,,2021-04-04
4,e,foo,False,C,,A,2021-04-05


# Replacing value with lambda
## 3. Replacing values based on conditions.

In [14]:
df_cross = pd.crosstab(df['class'], df['mid_term'])
df_cross

mid_term,A,B,C
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,0,0,1
foo,1,1,1


In [16]:
df_cross.apply(lambda x: [True if y==1 else False for y in x])

mid_term,A,B,C
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,False,False,True
foo,True,True,True


In [17]:
assign = lambda x: [True if y==1 else False for y in x]
list(map(assign, np.array(df_cross)))

[[False, False, True], [True, True, True]]

# Reshaping DataFrame with melt, pivot_table, and crosstab
## 4. Converting some columns into rows.

In [18]:
cols = ['name', 'class', 'mid_term', 'final_term']
df['mid_term'].replace({'A':100, 'B':80, 'C': 50}, inplace=True)
df['final_term'].replace({'A':100, 'B':80, 'C': 50}, inplace=True)
df_wide = df[cols]
df_wide

Unnamed: 0,name,class,mid_term,final_term
0,a,foo,100.0,
1,b,foo,80.0,50.0
2,c,bar,,50.0
3,d,bar,50.0,
4,e,foo,50.0,100.0


In [19]:
df_tidy = df_wide.melt(id_vars=['name','class'], var_name='term_kind', value_name = 'values')
df_tidy

Unnamed: 0,name,class,term_kind,values
0,a,foo,mid_term,100.0
1,b,foo,mid_term,80.0
2,c,bar,mid_term,
3,d,bar,mid_term,50.0
4,e,foo,mid_term,50.0
5,a,foo,final_term,
6,b,foo,final_term,50.0
7,c,bar,final_term,50.0
8,d,bar,final_term,
9,e,foo,final_term,100.0


In [20]:
df_tidy.pivot_table(index=['name','class'],columns='term_kind',values='values')

Unnamed: 0_level_0,term_kind,final_term,mid_term
name,class,Unnamed: 2_level_1,Unnamed: 3_level_1
a,foo,,100.0
b,foo,50.0,80.0
c,bar,50.0,
d,bar,,50.0
e,foo,100.0,50.0


## 5. Frequency table with crosstab()

In [21]:
region = ['Seoul', 'Suwon', 'Incheon', 'Suwon', 'Seoul']
df_wide['region'] = region
df_wide

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,name,class,mid_term,final_term,region
0,a,foo,100.0,,Seoul
1,b,foo,80.0,50.0,Suwon
2,c,bar,,50.0,Incheon
3,d,bar,50.0,,Suwon
4,e,foo,50.0,100.0,Seoul


In [22]:
pd.crosstab(df_wide['class'], df_wide['region'])

region,Incheon,Seoul,Suwon
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,1,0,1
foo,0,2,1


In [23]:
pd.crosstab(df_wide['class'], df_wide['region'], margins=True)

region,Incheon,Seoul,Suwon,All
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bar,1,0,1,2
foo,0,2,1,3
All,1,2,2,5
