# What is a Series?

In [1]:
import pandas as pd
import numpy as np

In [4]:
# Build a Series from a plain list, attaching an explicit string label to each value.
series = pd.Series(
    data=[1, 2, 3],
    index=['one', 'two', 'three'],
)
series

one      1
two      2
three    3
dtype: int64

In [5]:
# Build a Series from a NumPy array; no index is passed here, and the output
# below shows the rows labelled 0, 1, 2.
series = pd.Series(data=np.array([1, 2, 3]))
series

0    1
1    2
2    3
dtype: int64

In [7]:
# Two plain Python lists, used as the dict values of the Series built in the next cell.
val1, val2 = [1, 2, 3], [4, 5, 6]

In [8]:
# A dict maps onto a Series key by key: each key becomes an index label and each
# list is stored whole as a single element, hence the object dtype in the output.
series = pd.Series(
    {
        'column1': val1,
        'column2': val2,
    }
)
series

column1    [1, 2, 3]
column2    [4, 5, 6]
dtype: object

## What is a DataFrame?

In [18]:
import pandas as pd

# Each dict key becomes a column name and each Series supplies that column's values.
d = {
    'ColumnA': pd.Series([1, 2, 3]),
    'ColumnB': pd.Series([4, 5, 6]),
}

df = pd.DataFrame(data=d)

In [11]:
# Display the DataFrame built from the two Series above.
df

Unnamed: 0,ColumnA,ColumnB
0,1,4
1,2,5
2,3,6


## Select Column

In [12]:
# Indexing with a single column label returns that column as a Series
# (the output below is named after the column).
data = df['ColumnA']
data

0    1
1    2
2    3
Name: ColumnA, dtype: int64

## Select Row

```python
data = df.loc['label']
data = df.loc[['label1', 'label2']]
```

In [14]:
# .iloc selects by integer position, so 0 picks the first row whatever its index label is.
data = df.iloc[0]

## Reset Index
```python
df.reset_index()
```

## Drop
```python
df.drop([label1, label2])
df.drop(df.index[1])
# Drop rows by condition: pass the index labels of the matching rows, not the filtered frame
df.drop(df[df.ColumnA != 'Kang'].index)
# Drop Column
df.drop('column name', axis=1)
```

## Add column
```python
dataFrame['newColumn'] = pd.Series(...)
```

## Rename columns
```python
df = df.rename(columns={'old_name': 'new_name'}, index={'old_label': 'new_label'})
```

## Reindex
```python
df.reindex(index=[array], columns=[columns])
```

## Cutting Into Intervals
```python
# pd.cut works on 1-D data (a column), and `bins` is the number of bins or the bin edges
pd.cut(df['ColumnA'], bins=3)
```

## Iterating Over DataFrame Columns
```python
for column in dataFrame:
  print(column)

for column, items in dataFrame.items():  # iteritems() was removed in pandas 2.0

for index_of_row, row in dataFrame.iterrows():

for row in dataFrame.itertuples():
```

## Sort
```python
# Sort rows by their index labels
df = df.sort_index()
# Sort rows by the values in a column
df = df.sort_values(by='ColumnA')
```

# What is a Panel?

In [20]:
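# NOTE: pd.Panel was deprecated in pandas 0.20 and removed in 0.25, so the code
# below is kept commented out; use a MultiIndex DataFrame (or xarray) for 3-D data.
# import pandas as pd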
# import numpy as np

# data = {'FirstDataFrame' : pd.DataFrame(d), 
#    'SecondDataFrame' : pd.DataFrame(d)}
# p = pd.Panel(data)
# p

# Pandas Functions
```python
# Read csv file
df=pd.read_csv("mycsv.csv",index_col=['ColumnA'])
# Read Excel file
pd.read_excel('myExcel.xlsx', index_col=['ColumnA'])
# Read one sheet in Excel file
pd.read_excel('myExcel.xlsx', sheet_name='Sheet1')
# Use Head
r = df.head(10)
# Use Tail
r = df.tail(10)
# Transpose
df.T
# Shape
df.shape
# Size
df.size
# ndim
df.ndim
# Describe
df.describe()
# Absolute values
df.abs()
# Mean values
df.mean() # df.median(), df.mode()
# Statistics
df.count()
df.std()
df.cumsum()
df.prod()
```

## Apply function in Row/Column

In [23]:
# Sample messages whose bodies contain the placeholder words 'sender' and 'length'.
df = pd.DataFrame(
    {
        'EmailBody': [
            'sender said Hello!',
            'sender needs length space',
            'sender needs length',
        ],
        'EmailSender': [
            'Kang@medium.com',
            'Branden@medium.com',
            'Sangyun@medium.com',
        ],
    }
)


def clean_body(data):
    """Fill the 'sender' and 'length' placeholders in one row's email body.

    Args:
        data: A mapping-like row exposing 'EmailBody' and 'EmailSender' by key
            (a DataFrame row when used with ``df.apply(..., axis=1)``).

    Returns:
        The body text with every 'sender' replaced by the sender value and
        then every 'length' replaced by the character count of that value.
    """
    text = data['EmailBody']
    who = data['EmailSender']
    # Order matters: 'sender' is substituted first, then 'length' is applied
    # to the already-substituted text.
    for placeholder, replacement in (('sender', who), ('length', str(len(who)))):
        text = text.replace(placeholder, replacement)
    return text

# axis=1 hands clean_body one row at a time; the returned strings overwrite EmailBody.
df['EmailBody'] = df.apply(clean_body, axis=1)

In [24]:
# Display the frame to confirm the placeholders in EmailBody were substituted.
df

Unnamed: 0,EmailBody,EmailSender
0,Kang@medium.com said Hello!,Kang@medium.com
1,Branden@medium.com needs 18 space,Branden@medium.com
2,Sangyun@medium.com needs 18,Sangyun@medium.com


## Feature Engineering in Pandas
```python
# check missing values
df.isnull()  # True where a value is missing; df.notnull() gives the inverse mask
# drop missing values
df.dropna()
# fill missing values
df.fillna(value)
# fill missing values - backward or forward
df.ffill() # df.bfill() for backward; fillna(method=...) is deprecated
# compare elements in percentage
df.pct_change()
# calculate standard deviation
df.std()
# calculate covariance
df.cov()
# calculate correlation
df.corr()
# calculate rolling moving average with window
df.rolling(window=N).mean()  # .median() gives the rolling median instead
# calculate expanding and exponentially weighted average
df.expanding().mean()
df.ewm(com=0.5).mean()
# aggregate columns
df.agg({'Column1': ['sum', 'min'], 'Column2': ['count']})
# group rows
groupdf = df.groupby('ColumnName')
# filter
df.filter(items=['ColumnA', 'ColumnB'])  # or like='Col' / regex='^Col'; one of these is required
# Query
df = df[(df['ColumnA'].isin([1, 2])) & (df['ColumnB'] == 'A')]
# Merge
merged = pd.merge(left,right,left_on='left_column',right_on='right_column', how='left')
# Sort
df.sort_values(by=['col1', 'col2'], ascending=False)
# Union
pd.concat([one, two])
# Compute dates
pd.date_range(start, end)
# Plot dataFrame
df.plot.bar()
df.diff().hist(bins=10)
df.plot.scatter(x='ColumnA', y='ColumnB')
```