# Python for Data Science
- Pandas is an open-source library built on top of NumPy
- It allows for fast analysis and data cleaning and preparation
- It excels in performance and productivity
- It also has built-in visualization features
- It can work with data from a wide variety of sources

+ Series
+ DataFrames
+ Missing Data
+ GroupBy
+ Merging, Joining, and Concatenating
+ Operations
+ Data Input and Output

In [10]:
import numpy as np
import pandas as pd


In [11]:
# Sample inputs reused by the Series examples: a list of labels, a list of
# values, the same values as a NumPy array, and a label -> value dict.
labels = list('abc')
my_data = [10, 20, 30]
arr = np.asarray(my_data)
d = dict(zip(labels, my_data))

In [12]:
# With no index supplied, the Series gets the default integer labels 0..n-1.
pd.Series(data=my_data)

0    10
1    20
2    30
dtype: int64

In [13]:
# Passing index= attaches the custom labels to the values, position by position.
pd.Series(data=my_data,index=labels)

a    10
b    20
c    30
dtype: int64

In [14]:
# data and index are the first two positional parameters, so the keywords
# can be dropped without changing the result.
pd.Series(my_data,labels)

a    10
b    20
c    30
dtype: int64

In [15]:
# A NumPy array works as the data source too. The Series reports int32 here
# while the list-based Series report int64.
# NOTE(review): the int32 presumably comes from the array's own dtype and
# looks platform-dependent — confirm before relying on it.
pd.Series(arr,labels)

a    10
b    20
c    30
dtype: int32

In [16]:
# A dict supplies both pieces at once: keys become the index, values the data.
pd.Series(d)

a    10
b    20
c    30
dtype: int64

In [18]:
# A Series is not limited to numbers: here it stores built-in functions,
# which gives it dtype object.
pd.Series(data=[sum,print,len])

0      <built-in function sum>
1    <built-in function print>
2      <built-in function len>
dtype: object

In [23]:
# Four values labelled by country; used by the lookup and addition cells.
ser1 = pd.Series(
    data=[1, 2, 3, 4],
    index=['USA', 'Germany', 'USSR', 'Japan'],
)
ser1

USA        1
Germany    2
USSR       3
Japan      4
dtype: int64

In [28]:
# Same shape as ser1 but with 'Italy' in place of 'USSR', so the two indexes
# only partly overlap.
ser2 = pd.Series(
    data=[1, 3, 5, 4],
    index=['USA', 'Germany', 'Italy', 'Japan'],
)
ser2

USA        1
Germany    3
Italy      5
Japan      4
dtype: int64

In [25]:
# Look up a single value by its index label.
ser1['USA']

1

In [26]:
# A Series of strings with the default integer index, so 0 is a valid label
# and returns the first element.
ser3 = pd.Series(data=labels)
ser3[0]

'a'

In [29]:
# Addition aligns on index labels, not on position. Labels present in only
# one of the two Series (Italy, USSR) come out as NaN, and the result is
# float64 because NaN is a float value.
ser1 + ser2

Germany    5.0
Italy      NaN
Japan      8.0
USA        2.0
USSR       NaN
dtype: float64

# Pandas - DataFrames

In [31]:
from numpy.random import randn
# Seed NumPy's global random generator so that subsequent randn() draws
# follow a fixed sequence.
np.random.seed(101)

In [35]:
# 5x4 frame of randn() draws: rows labelled A-E, columns labelled W-Z.
df = pd.DataFrame(
    data=randn(5, 4),
    index=['A', 'B', 'C', 'D', 'E'],
    columns=['W', 'X', 'Y', 'Z'],
)
df

Unnamed: 0,W,X,Y,Z
A,-0.993263,0.1968,-1.136645,0.000366
B,1.025984,-0.156598,-0.031579,0.649826
C,2.154846,-0.610259,-0.755325,-0.346419
D,0.147027,-0.479448,0.558769,1.02481
E,-0.925874,1.862864,-1.133817,0.610478


In [36]:
# Bracket-selecting a single column name returns that column as a Series.
df['W']

A   -0.993263
B    1.025984
C    2.154846
D    0.147027
E   -0.925874
Name: W, dtype: float64

In [41]:
# Confirm the types: the frame is a DataFrame, one selected column is a Series.
print(type(df),type(df['W']))

<class 'pandas.core.frame.DataFrame'> <class 'pandas.core.series.Series'>


In [44]:
# Passing a list of column names returns a DataFrame with just those columns.
df[['W','Z']]

Unnamed: 0,W,Z
A,-0.993263,0.000366
B,1.025984,0.649826
C,2.154846,-0.346419
D,0.147027,1.02481
E,-0.925874,0.610478


In [45]:
# Assigning to a new column name creates the column; here it holds the
# element-wise sum of columns W and Y.
df['new'] = df['W']+df['Y']
df

Unnamed: 0,W,X,Y,Z,new
A,-0.993263,0.1968,-1.136645,0.000366,-2.129908
B,1.025984,-0.156598,-0.031579,0.649826,0.994405
C,2.154846,-0.610259,-0.755325,-0.346419,1.399521
D,0.147027,-0.479448,0.558769,1.02481,0.705796
E,-0.925874,1.862864,-1.133817,0.610478,-2.059691


In [50]:
# axis=1 targets a column (axis=0 targets rows). inplace=True modifies df
# itself instead of returning a new frame, so 'new' is gone from df afterwards.
df.drop('new',axis=1,inplace=True)

In [51]:
# Verify that the 'new' column was removed from df itself.
df

Unnamed: 0,W,X,Y,Z
A,-0.993263,0.1968,-1.136645,0.000366
B,1.025984,-0.156598,-0.031579,0.649826
C,2.154846,-0.610259,-0.755325,-0.346419
D,0.147027,-0.479448,0.558769,1.02481
E,-0.925874,1.862864,-1.133817,0.610478


In [53]:
# axis=0 drops a row by label. Without inplace=True this only returns a new
# frame; df itself still has row 'E' (df.shape below still reports 5 rows).
df.drop('E',axis=0)

Unnamed: 0,W,X,Y,Z
A,-0.993263,0.1968,-1.136645,0.000366
B,1.025984,-0.156598,-0.031579,0.649826
C,2.154846,-0.610259,-0.755325,-0.346419
D,0.147027,-0.479448,0.558769,1.02481


In [55]:
# (rows, columns): still 5 rows, because the row drop was not done in place.
df.shape

(5, 4)

In [56]:
# Rows: .loc selects a row by its index label and returns it as a Series.
df.loc['A']

W   -0.993263
X    0.196800
Y   -1.136645
Z    0.000366
Name: A, dtype: float64

In [58]:
# .iloc selects by integer position instead of label; position 2 is row 'C'.
df.iloc[2]

W    2.154846
X   -0.610259
Y   -0.755325
Z   -0.346419
Name: C, dtype: float64

In [59]:
# .loc[row_label, column_label] returns a single cell as a scalar.
df.loc['B','Y']

-0.031579143908112575

In [60]:
# Lists of labels on both axes select a sub-frame: rows A and B, columns W and Y.
df.loc[['A','B'],['W','Y']]

Unnamed: 0,W,Y
A,-0.993263,-1.136645
B,1.025984,-0.031579


In [62]:
# Comparing the whole frame with a scalar gives a same-shaped frame of booleans.
booldf=df > 0
booldf

Unnamed: 0,W,X,Y,Z
A,False,True,False,True
B,True,False,False,True
C,True,False,False,False
D,True,False,True,True
E,False,True,False,True


In [63]:
# Indexing with a boolean frame keeps the values where the mask is True and
# puts NaN everywhere else. Equivalent to writing df[df>0] directly.
df[booldf]

Unnamed: 0,W,X,Y,Z
A,,0.1968,,0.000366
B,1.025984,,,0.649826
C,2.154846,,,
D,0.147027,,0.558769,1.02481
E,,1.862864,,0.610478


In [64]:
# Comparing a single column gives a boolean Series with one entry per row.
df['W'] > 0

A    False
B     True
C     True
D     True
E    False
Name: W, dtype: bool

In [65]:
# Indexing with a boolean Series filters rows: only the rows where the
# Series is True are kept, with all their columns.
df[df['W'] > 0]

Unnamed: 0,W,X,Y,Z
B,1.025984,-0.156598,-0.031579,0.649826
C,2.154846,-0.610259,-0.755325,-0.346419
D,0.147027,-0.479448,0.558769,1.02481


In [66]:
# Keep the filtered frame under a name, then pick one column from it.
resultdf = df[df['W']>0]
resultdf['X']

B   -0.156598
C   -0.610259
D   -0.479448
Name: X, dtype: float64

In [68]:
# Filter the rows where W is positive, then keep only columns X and Y, all in
# one expression. The same selection written step by step:
#     boolser = df['W'] > 0
#     result = df[boolser]
#     mycols = ['X','Y']
#     result[mycols]
#
# The step-by-step version is kept as comments rather than a triple-quoted
# string: a bare string is an expression, and as the last expression in the
# cell it would be echoed in place of the DataFrame.
df[df['W']>0][['X','Y']]

Unnamed: 0,X,Y
B,-0.156598,-0.031579
C,-0.610259,-0.755325
D,-0.479448,0.558769


In [74]:
# Combine element-wise conditions with & (and) or | (or). Each condition needs
# its own parentheses because & and | bind more tightly than the comparisons.
df[(df['W']>0) & (df['Y']> 0)]

Unnamed: 0,W,X,Y,Z
D,0.147027,-0.479448,0.558769,1.02481


In [75]:
# reset_index moves the current index into a column named 'index' and
# restores the default integer index. It returns a new frame; df is unchanged.
df.reset_index()

Unnamed: 0,index,W,X,Y,Z
0,A,-0.993263,0.1968,-1.136645,0.000366
1,B,1.025984,-0.156598,-0.031579,0.649826
2,C,2.154846,-0.610259,-0.755325,-0.346419
3,D,0.147027,-0.479448,0.558769,1.02481
4,E,-0.925874,1.862864,-1.133817,0.610478


In [77]:
# Five state codes, spelled out as an explicit list.
newind = ['CA', 'NY', 'WY', 'OR', 'CO']
newind

['CA', 'NY', 'WY', 'OR', 'CO']

In [78]:
# Add the list as a new column; it supplies one entry per row of df.
df['States'] = newind
df

Unnamed: 0,W,X,Y,Z,States
A,-0.993263,0.1968,-1.136645,0.000366,CA
B,1.025984,-0.156598,-0.031579,0.649826,NY
C,2.154846,-0.610259,-0.755325,-0.346419,WY
D,0.147027,-0.479448,0.558769,1.02481,OR
E,-0.925874,1.862864,-1.133817,0.610478,CO


In [79]:
# set_index makes the 'States' column the row index. It returns a new frame;
# df itself is not modified, as the next display of df shows.
df.set_index('States')

Unnamed: 0_level_0,W,X,Y,Z
States,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CA,-0.993263,0.1968,-1.136645,0.000366
NY,1.025984,-0.156598,-0.031579,0.649826
WY,2.154846,-0.610259,-0.755325,-0.346419
OR,0.147027,-0.479448,0.558769,1.02481
CO,-0.925874,1.862864,-1.133817,0.610478


In [80]:
# df still has its A-E index and a 'States' column, because set_index was
# not applied in place.
df

Unnamed: 0,W,X,Y,Z,States
A,-0.993263,0.1968,-1.136645,0.000366,CA
B,1.025984,-0.156598,-0.031579,0.649826,NY
C,2.154846,-0.610259,-0.755325,-0.346419,WY
D,0.147027,-0.479448,0.558769,1.02481,OR
E,-0.925874,1.862864,-1.133817,0.610478,CO


# DataFrames Part 2

In [83]:
# Build a two-level index: the outer level is the group name, the inner level
# the number within the group. Pair the two lists up into (outer, inner)
# tuples and hand them to MultiIndex.from_tuples in one step.
outside = ['G1', 'G1', 'G1', 'G2', 'G2', 'G2']
inside = [1, 2, 3, 1, 2, 3]
hier_index = pd.MultiIndex.from_tuples(list(zip(outside, inside)))
hier_index

MultiIndex([('G1', 1),
            ('G1', 2),
            ('G1', 3),
            ('G2', 1),
            ('G2', 2),
            ('G2', 3)],
           )

In [85]:
# 6x2 frame of randn() draws whose rows carry the two-level hier_index.
df = pd.DataFrame(
    data=randn(6, 2),
    index=hier_index,
    columns=['A', 'B'],
)
df

Unnamed: 0,Unnamed: 1,A,B
G1,1,-1.38292,1.482495
G1,2,0.961458,-2.141212
G1,3,0.992573,1.192241
G2,1,-1.04678,1.292765
G2,2,-1.467514,-0.494095
G2,3,-0.162535,0.485809


In [87]:
# With a MultiIndex, the first .loc picks the outer level ('G1') and returns
# the sub-frame indexed by the inner level; the second .loc picks row 1 from it.
df.loc['G1'].loc[1]

A   -1.382920
B    1.482495
Name: 1, dtype: float64

In [90]:
# Name the two index levels so they can be referred to by name
# (for example level='Num' in df.xs).
df.index.names = ['Groups','Num']
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
Groups,Num,Unnamed: 2_level_1,Unnamed: 3_level_1
G1,1,-1.38292,1.482495
G1,2,0.961458,-2.141212
G1,3,0.992573,1.192241
G2,1,-1.04678,1.292765
G2,2,-1.467514,-0.494095
G2,3,-0.162535,0.485809


In [92]:
# Drill down level by level: group 'G2' -> inner row 2 -> column 'B'.
df.loc['G2'].loc[2]['B']

-0.49409535833912277

In [94]:
# Cross-section: take the rows whose 'Num' level equals 1 from every group.
# The 'Num' level is dropped from the result, leaving 'Groups' as the index.
df.xs(1,level='Num')

Unnamed: 0_level_0,A,B
Groups,Unnamed: 1_level_1,Unnamed: 2_level_1
G1,-1.38292,1.482495
G2,-1.04678,1.292765
