In [3]:
import numpy as np
import pandas as pd

# Series

In [4]:
# index labels reused by the Series examples
labels = list('abc')

In [6]:
# plain Python list of values to wrap in a Series
data = [10, 20, 30]

In [7]:
# the same values as a NumPy array, so a Series can also be built from an ndarray
arr = np.array(data)

In [8]:
# label -> value mapping holding the same labels and values as `labels` and `data`
d = dict(a=10, b=20, c=30)

In [9]:
# with only data given, the Series gets the default integer index 0, 1, 2
pd.Series(data=data)

0    10
1    20
2    30
dtype: int64

In [10]:
# keyword form: index=labels replaces the default integer index with 'a', 'b', 'c'
pd.Series(data=data, index=labels)

a    10
b    20
c    30
dtype: int64

In [11]:
# same call with positional arguments: the first is the data, the second the index
pd.Series(data, labels)

a    10
b    20
c    30
dtype: int64

In [12]:
# from a dict: the keys become the index and the values become the data
pd.Series(d)

a    10
b    20
c    30
dtype: int64

In [13]:
# from a NumPy array: same labels and values as the list-based Series
# NOTE(review): the dtype shown is int32 rather than int64 — presumably the platform's default
# integer width for np.array; it may differ on another machine
pd.Series(arr, labels)

a    10
b    20
c    30
dtype: int32

In [14]:
# first of two labelled Series; it has 'fr' where ser2 has 'italy'
ser1 = pd.Series(data=[1, 2, 3, 4], index=['usa', 'germany', 'fr', 'japan'])

In [18]:
# second labelled Series; it has 'italy' where ser1 has 'fr'
ser2 = pd.Series(data=[1, 2, 5, 4], index=['usa', 'germany', 'italy', 'japan'])

In [16]:
# display ser1 (the Series built with the 'fr' label)
ser1

usa        1
germany    2
fr         3
japan      4
dtype: int64

In [19]:
# display ser2 (the Series built with the 'italy' label)
ser2

usa        1
germany    2
italy      5
japan      4
dtype: int64

In [23]:
# look up a single value by its index label
ser1['usa']

1

# DataFrames

In [32]:
# set the seed which we use to generate random numbers
np.random.seed(101)
# import randn directly so it can be called without the full np.random path
from numpy.random import randn

In [34]:
# 5x4 frame of randn draws: rows labelled A-E, columns labelled w-z
# NOTE(review): the execution counts jump from 32 (seeding) to 34 (this cell), so random
# draws may have been consumed in between — confirm the values shown reproduce on a
# fresh Restart & Run All
df = pd.DataFrame(
    data=randn(5, 4),
    index=['A', 'B', 'C', 'D', 'E'],
    columns=['w', 'x', 'y', 'z'],
)

In [35]:
# display the full frame
df

Unnamed: 0,w,x,y,z
A,0.190794,1.978757,2.605967,0.683509
B,0.302665,1.693723,-1.706086,-1.159119
C,-0.134841,0.390528,0.166905,0.184502
D,0.807706,0.07296,0.638787,0.329646
E,-0.497104,-0.75407,-0.943406,0.484752


In [36]:
# bracket notation with one column name returns that column as a Series
df['w']

A    0.190794
B    0.302665
C   -0.134841
D    0.807706
E   -0.497104
Name: w, dtype: float64

In [38]:
# attribute access returns the same column as df['w']; it only works when the column
# name is a valid Python identifier that does not clash with a DataFrame attribute
df.w

A    0.190794
B    0.302665
C   -0.134841
D    0.807706
E   -0.497104
Name: w, dtype: float64

In [40]:
# a list of column names returns a DataFrame restricted to those columns
df[['w','z']]

Unnamed: 0,w,z
A,0.190794,0.683509
B,0.302665,-1.159119
C,-0.134841,0.184502
D,0.807706,0.329646
E,-0.497104,0.484752


In [41]:
# assigning to a new column name adds that column; here it is the element-wise sum of w and z
df['new'] = df['w'] + df['z']

In [42]:
# display the frame, which now includes the 'new' column
df

Unnamed: 0,w,x,y,z,new
A,0.190794,1.978757,2.605967,0.683509,0.874303
B,0.302665,1.693723,-1.706086,-1.159119,-0.856454
C,-0.134841,0.390528,0.166905,0.184502,0.049661
D,0.807706,0.07296,0.638787,0.329646,1.137352
E,-0.497104,-0.75407,-0.943406,0.484752,-0.012352


In [45]:
# delete a column or a row with the function drop; axis specifies the type (axis=1: column, axis=0: row)
# the result is a new frame — df itself is unchanged (the next cell's output still shows column z)
df.drop(labels='z', axis=1)

Unnamed: 0,w,x,y,new
A,0.190794,1.978757,2.605967,0.874303
B,0.302665,1.693723,-1.706086,-0.856454
C,-0.134841,0.390528,0.166905,0.049661
D,0.807706,0.07296,0.638787,1.137352
E,-0.497104,-0.75407,-0.943406,-0.012352


In [46]:
# axis=0 drops a row by its index label and returns a new frame without row D
df.drop(labels='D', axis=0)

Unnamed: 0,w,x,y,z,new
A,0.190794,1.978757,2.605967,0.683509,0.874303
B,0.302665,1.693723,-1.706086,-1.159119,-0.856454
C,-0.134841,0.390528,0.166905,0.184502,0.049661
E,-0.497104,-0.75407,-0.943406,0.484752,-0.012352


In [47]:
# for the deletion to take place in the dataframe itself, we need to set the parameter "inplace" to True
# NOTE(review): this mutates df, so the cell is not idempotent — re-running it raises
# KeyError because 'D' has already been removed
df.drop(labels='D', axis=0, inplace=True)

In [48]:
# display df: row D is now gone from the frame itself
df

Unnamed: 0,w,x,y,z,new
A,0.190794,1.978757,2.605967,0.683509,0.874303
B,0.302665,1.693723,-1.706086,-1.159119,-0.856454
C,-0.134841,0.390528,0.166905,0.184502,0.049661
E,-0.497104,-0.75407,-0.943406,0.484752,-0.012352


In [49]:
# to select a row we use the "loc" indexer (by label) or the "iloc" indexer (by integer position)
df.loc['A']

w      0.190794
x      1.978757
y      2.605967
z      0.683509
new    0.874303
Name: A, dtype: float64

In [50]:
# a list of labels returns those rows as a DataFrame
df.loc[['A','B']]

Unnamed: 0,w,x,y,z,new
A,0.190794,1.978757,2.605967,0.683509,0.874303
B,0.302665,1.693723,-1.706086,-1.159119,-0.856454


In [51]:
# iloc selects by integer position: 0 is the first row, A
df.iloc[0]

w      0.190794
x      1.978757
y      2.605967
z      0.683509
new    0.874303
Name: A, dtype: float64

In [55]:
# a list of positions selects several rows at once (here the first two)
df.iloc[[0,1]]

Unnamed: 0,w,x,y,z,new
A,0.190794,1.978757,2.605967,0.683509,0.874303
B,0.302665,1.693723,-1.706086,-1.159119,-0.856454

In [58]:
# rows at positions 0 and 1, column at position 1 (x)
df.iloc[[0,1],1]

A    1.978757
B    1.693723
Name: x, dtype: float64

In [59]:
# loc[row_label, column_label] returns a single cell value
df.loc['A','new']

0.8743032079106301

In [62]:
# show rows A and B restricted to columns x and z
df.loc[['A','B'],['x','z']]

Unnamed: 0,x,z
A,1.978757,0.683509
B,1.693723,-1.159119


## Conditional selection

In [65]:
# comparing the frame with a scalar returns a boolean frame of the same shape
df<1

Unnamed: 0,w,x,y,z,new
A,True,False,False,True,True
B,True,False,True,True,True
C,True,True,True,True,True
E,True,True,True,True,True


In [69]:
# indexing with a boolean frame keeps the values where the mask is True; the others show as missing
df[df>0]

Unnamed: 0,w,x,y,z,new
A,0.190794,1.978757,2.605967,0.683509,0.874303
B,0.302665,1.693723,,,
C,,0.390528,0.166905,0.184502,0.049661
E,,,,0.484752,


In [70]:
# comparing one column returns a boolean Series indexed by row label
df['w']>0

A     True
B     True
C    False
E    False
Name: w, dtype: bool

In [72]:
# select only the rows that have w>0
df[df['w']>0]

Unnamed: 0,w,x,y,z,new
A,0.190794,1.978757,2.605967,0.683509,0.874303
B,0.302665,1.693723,-1.706086,-1.159119,-0.856454


In [78]:
# display only the rows with z<0 (the boolean Series is used as a row mask)
df[df['z']<0]

Unnamed: 0,w,x,y,z,new
B,0.302665,1.693723,-1.706086,-1.159119,-0.856454


In [80]:
# display the value of y for the rows that have z<0
# the row mask and the column label go into one .loc lookup instead of two chained [] lookups
df.loc[df['z']<0, 'y']

B   -1.706086
Name: y, dtype: float64

## Multiple-condition selection

In [84]:
# combine conditions element-wise with & (and); each comparison needs its own parentheses
# because & binds more tightly than > and <
df[(df['x']>0) & (df['z']<0)]

Unnamed: 0,w,x,y,z,new
B,0.302665,1.693723,-1.706086,-1.159119,-0.856454


## Missing values

In [85]:
# column -> values mapping with gaps: 'a' has one missing value, 'b' has two, 'c' has none
d = {
    'a': [1, 2, np.nan],
    'b': [1, np.nan, np.nan],
    'c': [1, 2, 3],
}

In [92]:
# frame built from the dict above, with rows labelled x, y, z
df2 = pd.DataFrame(data=d, index=['x', 'y', 'z'])

In [93]:
# by default dropna drops every row that contains at least one missing value
df2.dropna()

Unnamed: 0,a,b,c
x,1.0,1.0,1


In [94]:
# axis=1 drops the columns that contain missing values instead
df2.dropna(axis=1)

Unnamed: 0,c
x,1
y,2
z,3


In [97]:
# thresh=2 keeps only the rows that have at least two non-missing values
df2.dropna(thresh=2)

Unnamed: 0,a,b,c
x,1.0,1.0,1
y,2.0,,2


In [98]:
# the same threshold applied to columns: keep columns with at least two non-missing values
df2.dropna(thresh=2, axis=1)

Unnamed: 0,a,c
x,1.0,1
y,2.0,2
z,,3


In [99]:
# replace every missing value with -1; this returns a new frame and leaves df2 unchanged
df2.fillna(value=-1)

Unnamed: 0,a,b,c
x,1.0,1.0,1
y,2.0,-1.0,2
z,-1.0,-1.0,3


In [100]:
# fill the gaps in row 'y' with the mean of that row's non-missing values
row_y = df2.loc['y']
row_y.fillna(value=row_y.mean())

a    2.0
b    2.0
c    2.0
Name: y, dtype: float64

## Reading and writing files

In [101]:
# load the CSV file named 'example' (path relative to the working directory) and display it
# NOTE(review): this rebinds df, replacing the frame used in the sections above
df = pd.read_csv('example')
df

Unnamed: 0,a,b,c,d
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15


In [102]:
# write the frame to 'example' without the index column
# NOTE(review): this overwrites the same file that was just read — confirm that is intended
df.to_csv('example',index=False)

In [105]:
# export the frame to sheet 'Sheet1' of the workbook 'Excel_Sample.xlsx'
# NOTE(review): unlike the CSV export above, index=False is not passed, so the index
# is written as an extra column — confirm that is intended
df.to_excel('Excel_Sample.xlsx',sheet_name='Sheet1')