# Series

In [25]:
import numpy as np


In [2]:
import pandas as pd


In [3]:
# Shared sample inputs reused by the Series constructor examples.
labels = ['a', 'b', 'c']          # index labels
my_data = [10, 20, 30]            # values as a plain Python list
arr = np.asarray(my_data)         # the same values as a NumPy array
d = dict(zip(labels, my_data))    # label -> value mapping: {'a': 10, 'b': 20, 'c': 30}

In [4]:
pd.Series(data = my_data)

0    10
1    20
2    30
dtype: int64

In [5]:
pd.Series(data = my_data, index = labels)

a    10
b    20
c    30
dtype: int64

In [6]:
pd.Series(d)

a    10
b    20
c    30
dtype: int64

In [7]:
# pandas provides various data structures and operations for manipulating
# numerical data and time series. The library is built on top of NumPy.
# Advantages:
# - Fast and efficient for manipulating and analyzing data.
# - Data from different file objects can be loaded.
# - Easy handling of missing data (represented as NaN) in floating point as well
#   as non-floating point data.
# - Size mutability: columns can be inserted into and deleted from DataFrames and
#   higher dimensional objects.
# - Data set merging and joining.
# - Flexible reshaping and pivoting of data sets.
# - Provides time-series functionality.
# - Powerful group-by functionality for performing split-apply-combine operations
#   on data sets.

In [8]:
arr

array([10, 20, 30])

In [9]:
pd.Series(arr)

0    10
1    20
2    30
dtype: int32

In [10]:
labels

['a', 'b', 'c']

In [11]:
pd.Series(data = labels)

0    a
1    b
2    c
dtype: object

In [12]:
pd.Series(data = [sum,print,len])

0      <built-in function sum>
1    <built-in function print>
2      <built-in function len>
dtype: object

In [14]:
# Values 1-4 labelled by country name; the second list becomes the index.
ser1 = pd.Series(
    data=[1, 2, 3, 4],
    index=['СССР', 'Deutschland', 'USA', '日本'],
)
ser1

СССР           1
Deutschland    2
USA            3
日本             4
dtype: int64

In [18]:
# Second Series: shares three labels with ser1 but has 'Italia' where ser1 has 'СССР'.
ser2 = pd.Series(
    data=[1, 2, 5, 4],
    index=['Italia', 'Deutschland', 'USA', '日本'],
)
ser2

Italia         1
Deutschland    2
USA            5
日本             4
dtype: int64

In [22]:
ser1['СССР']

1

In [23]:
ser1 + ser2    # Values are added label by label; labels found in only one Series give NaN, so the integers are converted to floats

Deutschland    4.0
Italia         NaN
USA            8.0
СССР           NaN
日本             8.0
dtype: float64

In [24]:
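# Labels that exist in only one of the two Series have nothing to be added to,
# so the result for them is NaN. NaN is a floating point value, so pandas
# converts the integers to float in order to retain all the information possible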

# Data Frames
A DataFrame looks like an Excel spreadsheet: a table with labelled rows and columns.

In [None]:
# Data Frame is just a bunch of series that share the same index

In [26]:
from numpy.random import randn  

In [27]:
np.random.seed(101)

In [28]:
# 5x4 frame of random normal draws: rows labelled A-E, columns labelled W-Z.
df = pd.DataFrame(
    data=randn(5, 4),
    index=['A', 'B', 'C', 'D', 'E'],
    columns=['W', 'X', 'Y', 'Z'],
)

In [29]:
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [30]:
df['W']   # Grabbing a single column returns a Series

A    2.706850
B    0.651118
C   -2.018168
D    0.188695
E    0.190794
Name: W, dtype: float64

In [31]:
type(df)

pandas.core.frame.DataFrame

In [32]:
type(df['W'])

pandas.core.series.Series

In [34]:
df[['W','Z']]

Unnamed: 0,W,Z
A,2.70685,0.503826
B,0.651118,0.605965
C,-2.018168,-0.589001
D,0.188695,0.955057
E,0.190794,0.683509


In [35]:
df['new'] = df['W'] + df['Y']

In [37]:
df

Unnamed: 0,W,X,Y,Z,new
A,2.70685,0.628133,0.907969,0.503826,3.614819
B,0.651118,-0.319318,-0.848077,0.605965,-0.196959
C,-2.018168,0.740122,0.528813,-0.589001,-1.489355
D,0.188695,-0.758872,-0.933237,0.955057,-0.744542
E,0.190794,1.978757,2.605967,0.683509,2.796762


In [39]:
# drop() searches the row labels by default (axis=0). 'new' is a column, not a
# row label, so pandas reports that 'new' is not found on the axis.
# The KeyError is the point of this cell, so it is caught and printed rather
# than left to abort a top-to-bottom run of the notebook.
try:
    df.drop('new')
except KeyError as err:
    print(f"KeyError: {err}")

KeyError: "['new'] not found in axis"

In [40]:
df.drop('new', axis = 1)

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [41]:
df

Unnamed: 0,W,X,Y,Z,new
A,2.70685,0.628133,0.907969,0.503826,3.614819
B,0.651118,-0.319318,-0.848077,0.605965,-0.196959
C,-2.018168,0.740122,0.528813,-0.589001,-1.489355
D,0.188695,-0.758872,-0.933237,0.955057,-0.744542
E,0.190794,1.978757,2.605967,0.683509,2.796762


In [42]:
# drop() returns a new DataFrame and leaves the original untouched, so assign
# the result back to df to make the removal stick. Reassignment is used here
# instead of inplace=True because it keeps the call chainable.
df = df.drop('new', axis=1)
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [43]:
df.drop('E')    # Rows are dropped by default, so axis = 0 does not need to be passed

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057


In [44]:
# axis = 0 refers to the ROWS (row-wise operation)
# axis = 1 refers to the COLUMNS (column-wise operation)
# By default, axis = 0
# The reason goes back to NumPy and the order of the values in df.shape:
# as shown below, the shape is (5, 4). The 5 counts the rows A,B,C,D,E and sits
# at position 0; the 4 counts the columns W,X,Y,Z and sits at position 1

In [45]:
df.shape

(5, 4)

In [46]:
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [47]:
# HOW TO SELECT A ROW
df.loc['A']

W    2.706850
X    0.628133
Y    0.907969
Z    0.503826
Name: A, dtype: float64