### PANDAS, PYTHON

# How to use Pandas Count and Value_Counts

[Website](https://kanoki.org/2020/03/09/how-to-use-pandas-count-and-value_counts/)

In [1]:
import numpy as np
import pandas as pd

In [34]:
idx = pd.MultiIndex.from_tuples([('Chris',48), ('Brian',np.nan), ('David',65),('Chris',34),('John',28)], names=['Name', 'Age'])
col = ['Salary']

df = pd.DataFrame([120000, 140000, 90000, 101000, 59000], idx, col)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Salary
Name,Age,Unnamed: 2_level_1
Chris,48.0,120000
Brian,,140000
David,65.0,90000
Chris,34.0,101000
John,28.0,59000


In [35]:
df.groupby(level='Name').count()

Unnamed: 0_level_0,Salary
Name,Unnamed: 1_level_1
Brian,1
Chris,2
David,1
John,1


In [36]:
df=df.reset_index()

In [37]:
df.groupby(by='Name').count()

Unnamed: 0_level_0,Age,Salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Brian,0,1
Chris,2,2
David,1,1
John,1,1


In [38]:
df.groupby(by='Name').agg('count')

Unnamed: 0_level_0,Age,Salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Brian,0,1
Chris,2,2
David,1,1
John,1,1


In [39]:
df['freq']=df.groupby(by='Name')['Name'].transform('count')
df

Unnamed: 0,Name,Age,Salary,freq
0,Chris,48.0,120000,2
1,Brian,,140000,1
2,David,65.0,90000,1
3,Chris,34.0,101000,2
4,John,28.0,59000,1


In [41]:
df['Name'].map(df['Name'].value_counts())

0    2
1    1
2    1
3    2
4    1
Name: Name, dtype: int64

In [17]:
df['Name'].value_counts()

Chris    2
Brian    1
David    1
John     1
Name: Name, dtype: int64

In [18]:
#sort by frequency
df['Name'].value_counts(sort=True)

Chris    2
Brian    1
David    1
John     1
Name: Name, dtype: int64

In [19]:
# sort by ascending
df['Name'].value_counts(sort=True, ascending=True)

Brian    1
David    1
John     1
Chris    2
Name: Name, dtype: int64

In [20]:
# Relative counts - find percentage
df['Name'].value_counts(normalize=True)

Chris    0.4
Brian    0.2
David    0.2
John     0.2
Name: Name, dtype: float64

In [21]:
df['Salary'].value_counts(bins=2)

(99500.0, 140000.0]     3
(58918.999, 99500.0]    2
Name: Salary, dtype: int64

In [23]:
df = pd.DataFrame(np.random.randint(0, 2, (5, 3)), columns=["A", "B","C"])
df

Unnamed: 0,A,B,C
0,0,0,0
1,1,0,0
2,0,0,1
3,1,1,1
4,0,0,0


In [24]:
df.apply(pd.Series.value_counts, axis=1)

Unnamed: 0,0,1
0,3.0,
1,2.0,1.0
2,2.0,1.0
3,,3.0
4,3.0,


In [25]:
df.apply(pd.Series.value_counts, axis=0)

Unnamed: 0,A,B,C
0,3,4,3
1,2,1,2


In [26]:
df1 = df.melt(var_name='columns', value_name='values')
pd.crosstab(index=df1['values'], columns=df1['columns'])

columns,A,B,C
values,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,3,4,3
1,2,1,2


In [29]:
# By column
df[df == 1].sum(axis=0)

A    2.0
B    1.0
C    2.0
dtype: float64

In [30]:
# By row
df[df == 1].sum(axis=1)

0    0.0
1    1.0
2    1.0
3    3.0
4    0.0
dtype: float64

In [111]:
# dictionary of lists
dict = {'year':["2020", "2021", "2020", "2019","2018"],
        'action': [1, 1, 0.0, 0, 1],
        'comedy':[1, 0, 1, 0, 1]}
df = pd.DataFrame(dict)
df = df.astype({"action": np.int32, "comedy": np.int32}, errors='ignore')
df = df.set_index('year')
df

Unnamed: 0_level_0,action,comedy
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2020,1,1
2021,1,0
2020,0,1
2019,0,0
2018,1,1


In [116]:
byrow = df[df == 1].sum(axis=1)
byrow

year
2020    2.0
2021    1.0
2020    1.0
2019    0.0
2018    2.0
dtype: float64

In [117]:
bycol = df[df == 1].sum(axis=0)
bycol

action    3.0
comedy    3.0
dtype: float64

In [95]:
grp = df.groupby(by='year')
grp.groups

{'2018': [4], '2019': [3], '2020': [0, 2], '2021': [1]}

In [101]:
sum = grp.sum()
sum

Unnamed: 0_level_0,action,comedy
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2018,1,1
2019,0,0
2020,1,2
2021,1,0


In [65]:
df.groupby(by='year').agg('sum')

Unnamed: 0_level_0,action,comedy
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2018,1,1
2019,0,0
2020,1,2
2021,1,0


In [66]:
df.groupby(by='year')['comedy'].agg('sum')

year
2018    1
2019    0
2020    2
2021    0
Name: comedy, dtype: int32

In [67]:
df.groupby(by='year')['comedy'].value_counts()

year  comedy
2018  1         1
2019  0         1
2020  1         2
2021  0         1
Name: comedy, dtype: int64

In [68]:
df.groupby(by='year')['comedy'].count()

year
2018    1
2019    1
2020    2
2021    1
Name: comedy, dtype: int64

In [69]:
df.groupby(by='year').count()

Unnamed: 0_level_0,action,comedy
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2018,1,1
2019,1,1
2020,2,2
2021,1,1


In [70]:
df['year'].value_counts(sort=True)

2020    2
2021    1
2019    1
2018    1
Name: year, dtype: int64

In [102]:
row_sum = df[df == 1].sum(axis=1)
row_sum

0    2.0
1    1.0
2    1.0
3    0.0
4    2.0
dtype: float64

In [105]:
df['row_sum'] = df.loc[:,:].sum(axis = 1)
df

  df['row_sum'] = df.loc[:,:].sum(axis = 1)


Unnamed: 0,year,action,comedy,row_sum,Sum
0,2020,1,1,6,2
1,2021,1,0,3,1
2,2020,0,1,3,1
3,2019,0,0,0,0
4,2018,1,1,6,2


In [103]:
df['row_sum'] = df.loc[:,['action' , 'comedy']].sum(axis = 1)
df

Unnamed: 0,year,action,comedy,row_sum
0,2020,1,1,2
1,2021,1,0,1
2,2020,0,1,1
3,2019,0,0,0
4,2018,1,1,2


In [104]:
df = df.loc[:].eval('Sum = action + comedy')
df

Unnamed: 0,year,action,comedy,row_sum,Sum
0,2020,1,1,2,2
1,2021,1,0,1,1
2,2020,0,1,1,1
3,2019,0,0,0,0
4,2018,1,1,2,2


In [108]:
grp = df.groupby('year')
grp.filter(lambda x: len(x) >= 2)

Unnamed: 0,year,action,comedy,row_sum,Sum
0,2020,1,1,6,2
2,2020,0,1,3,1


In [109]:
# using transform function
grp = df.groupby('year')
sc = lambda x: (x - x.mean()) / x.std()*10
grp.transform(sc)

Unnamed: 0,action,comedy,row_sum,Sum
0,7.071068,,7.071068,7.071068
1,,,,
2,-7.071068,,-7.071068,-7.071068
3,,,,
4,,,,
