# Set_index

In [1]:
import pandas as pd

In [122]:
data = {'name': ['a','b','c'], 'age':[21,43,67]}
df = pd.DataFrame(data, columns = ['name', 'age'])
df

Unnamed: 0,name,age
0,a,21
1,b,43
2,c,67


In [120]:
df.set_index('name')

Unnamed: 0_level_0,age
name,Unnamed: 1_level_1
a,21
b,43
c,67


In [124]:
index = pd.Index(['x1', 'x2', 'x3'])
df.set_index(index,inplace = True)

# add row 

In [125]:
df.index

Index(['x1', 'x2', 'x3'], dtype='object')

In [126]:
df.loc[len(df.index)] = ['e',41]

In [127]:
df

Unnamed: 0,name,age
x1,a,21
x2,b,43
x3,c,67
3,e,41


# Quantile

In [26]:
import numpy as np
import random

In [33]:
A = [random.randint(a=0, b = 100) for i in np.arange(10)]
B = [random.randint(a=0, b = 100) for i in np.arange(10)]
df = pd.DataFrame({"A" : A, "B":B})
df

Unnamed: 0,A,B
0,36,63
1,59,47
2,70,12
3,67,79
4,98,97
5,84,37
6,97,89
7,30,17
8,42,37
9,0,71


In [34]:
df.A.quantile(0.1)

27.0

# Splite a dataframe by Boolian Criterion

In [35]:
df[df['A'] > 10]

Unnamed: 0,A,B
0,36,63
1,59,47
2,70,12
3,67,79
4,98,97
5,84,37
6,97,89
7,30,17
8,42,37


# convert numpy to dataframe

In [36]:
arr = np.array([[115, 222, 343],[323, 242, 356]])

In [40]:
df = pd.DataFrame(arr, columns = ['a','b','c'], index= ['x1', 'x2'])
df

Unnamed: 0,a,b,c
x1,115,222,343
x2,323,242,356


# drop a row in dataframe

In [41]:
df.drop('x1')

Unnamed: 0,a,b,c
x2,323,242,356


In [42]:
df1 = pd.DataFrame(arr, columns = ['a','b','c'])
df1

Unnamed: 0,a,b,c
0,115,222,343
1,323,242,356


In [48]:
index = pd.Index(['x1','x2'])

df1.set_index(index, inplace = True)
df1

Unnamed: 0,a,b,c
x1,115,222,343
x2,323,242,356


In [49]:
df1.drop('x2')

Unnamed: 0,a,b,c
x1,115,222,343


# drop a cloumn

In [50]:
df1.drop('a',axis = 1)

Unnamed: 0,b,c
x1,222,343
x2,242,356


In [52]:
pd.value_counts(df1['a'])

115    1
323    1
Name: a, dtype: int64

# reset index

In [58]:
df1.set_index('a', inplace = True)


KeyError: "None of ['a'] are in the columns"

In [59]:
df1

Unnamed: 0_level_0,b,c
a,Unnamed: 1_level_1,Unnamed: 2_level_1
115,222,343
323,242,356


In [66]:
df1.reset_index(drop = False, inplace = True)

In [62]:
df1.reset_index(drop = True)

Unnamed: 0,b,c
0,222,343
1,242,356


In [67]:
df1

Unnamed: 0,a,b,c
0,115,222,343
1,323,242,356


# group by function 

In [72]:
df1.set_index('a')

Unnamed: 0_level_0,b,c
a,Unnamed: 1_level_1,Unnamed: 2_level_1
115,222,343
323,242,356


In [77]:

df1.loc[len(df1.index)] = [12,13,14]   #add a row

In [85]:
df1["name"] = ['ali','ahmad','reza']  #add a column
df1
df1.loc[len(df1.index)] = [11,11,11,'reza']

ValueError: Length of values (3) does not match length of index (4)

In [88]:
df1


Unnamed: 0,a,b,c,name
0,115,222,343,ali
1,323,242,356,ahmad
2,12,13,14,reza
3,12,13,14,reza


In [90]:
g = df1.groupby('name')
g.first()

Unnamed: 0_level_0,a,b,c
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ahmad,323,242,356
ali,115,222,343
reza,12,13,14


In [91]:
g.get_group('reza')

Unnamed: 0,a,b,c,name
2,12,13,14,reza
3,12,13,14,reza


# aggregate

In [92]:
df1.aggregate(['sum'])

Unnamed: 0,a,b,c,name
sum,462,490,727,aliahmadrezareza


In [93]:
df1.aggregate(['min'])

Unnamed: 0,a,b,c,name
min,12,13,14,ahmad


In [94]:
df1.aggregate(['max'])

Unnamed: 0,a,b,c,name
max,323,242,356,reza


# get items of series1 not present in series2

In [96]:
s1 = pd.Series([1,2,3,4,5,6,7])
s2 = pd.Series([3,5,8,9,0,10,11])

In [97]:
s1[~s1.isin(s2)] #items of s1 that not present in s2

0    1
1    2
3    4
5    6
6    7
dtype: int64

In [98]:
s1[s1.isin(s2)] #items of s1 that present in s2

2    3
4    5
dtype: int64

# apply, map, applymap

In [161]:
df1

Unnamed: 0_level_0,a,b,c
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ali,115,222,343
ahmad,323,242,356
reza,12,13,14
reza,12,13,14
mamad,14,11,17


In [159]:
df1.set_index('name', inplace = True)
#df1.reset_index('name', inplace = True)

In [157]:
df1.loc[len(df1.index)] = ["mamad",14,11,17]

In [160]:
df1.drop(4, axis = 0, inplace = True) 

In [171]:
df1['a'].sort_values()

name
reza      12
reza      12
mamad     14
ali      115
ahmad    323
Name: a, dtype: object

# MERGE VS. CONCAT

In [176]:
np.arange(1,4)

array([1, 2, 3])

In [180]:
df1 = pd.DataFrame({'keys':['a', 'b', 'c'], 'data1': np.arange(1,4)})
df2 = pd.DataFrame({'keys':['f', 'b', 'c'], 'data2': np.arange(4,7)})
df2

Unnamed: 0,keys,data2
0,f,4
1,b,5
2,c,6


In [181]:
pd.merge(df1,df2)

Unnamed: 0,keys,data1,data2
0,b,2,5
1,c,3,6


In [182]:
pd.concat([df1, df2])

Unnamed: 0,keys,data1,data2
0,a,1.0,
1,b,2.0,
2,c,3.0,
0,f,,4.0
1,b,,5.0
2,c,,6.0


In [184]:
df1["keys"]

0    a
1    b
2    c
Name: keys, dtype: object