<div align="center">
    <h4 style="font-size: 50px;">Pandas</h4>
</div>

* Load csv, xlsx files
* Drop, concatenate, groupby
* Data structures - Series and DataFrame: you will learn how to create a Series and a DataFrame

In [1]:
# import packages
import numpy as np
import pandas as pd

#### create a Series with the default integer index

In [3]:
# values only: pandas supplies the default integer index 0, 1, 2, 3
data = pd.Series(data=[0.25, 0.5, 0.75, 1.0])

In [4]:
data

0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64

#### create a Series with explicit index

In [5]:
# same four values, now labelled 'a'..'d' instead of the default 0..3
data = pd.Series(
    [0.25, 0.5, 0.75, 1.0],
    index=list("abcd"),
)

In [6]:
data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

#### access data from the series

In [7]:
# label-based lookup: returns the value stored under index label 'b'
data['b']

0.5

#### Construct a DataFrame from a dictionary

In [8]:
# column name -> list of values; each list holds one entry per person
dictionary = {
    "Name": ["Alex", "Bob", "Cathy", "Don", "Emma", "Frank"],
    "Age": [15, 40, 30, 60, 19, 25],
    "income": [300, 800, 100, 850, 100, 250],
}

In [9]:
# every dictionary key becomes a column; rows get the default index 0..5
df = pd.DataFrame(data=dictionary)

In [10]:
df

Unnamed: 0,Name,Age,income
0,Alex,15,300
1,Bob,40,800
2,Cathy,30,100
3,Don,60,850
4,Emma,19,100
5,Frank,25,250


In [11]:
# check the first few rows of the DataFrame (default = the first 5 rows)
df.head()

Unnamed: 0,Name,Age,income
0,Alex,15,300
1,Bob,40,800
2,Cathy,30,100
3,Don,60,850
4,Emma,19,100


In [12]:
# check the first 2 rows of the DataFrame
df.head(2)

Unnamed: 0,Name,Age,income
0,Alex,15,300
1,Bob,40,800


In [13]:
# check the last few rows of the DataFrame (default = the last 5 rows)
df.tail()

Unnamed: 0,Name,Age,income
1,Bob,40,800
2,Cathy,30,100
3,Don,60,850
4,Emma,19,100
5,Frank,25,250


In [14]:
# check the columns of the DataFrame 
df.columns

Index(['Name', 'Age', 'income'], dtype='object')

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    6 non-null      object
 1   Age     6 non-null      int64 
 2   income  6 non-null      int64 
dtypes: int64(2), object(1)
memory usage: 272.0+ bytes


In [16]:
df.dtypes

Name      object
Age        int64
income     int64
dtype: object

In [17]:
df.describe()

Unnamed: 0,Age,income
count,6.0,6.0
mean,31.5,400.0
std,16.477257,339.116499
min,15.0,100.0
25%,20.5,137.5
50%,27.5,275.0
75%,37.5,675.0
max,60.0,850.0


In [18]:
# print column Name
df["Name"]

0     Alex
1      Bob
2    Cathy
3      Don
4     Emma
5    Frank
Name: Name, dtype: object

In [19]:
# alternative way to print column Name: attribute-style access.
# NOTE: this only works when the column name is a valid Python identifier
# that does not clash with an existing DataFrame attribute or method;
# df["Name"] works for any column name.
df.Name

0     Alex
1      Bob
2    Cathy
3      Don
4     Emma
5    Frank
Name: Name, dtype: object

In [20]:
# add a new column by assigning a list with one value per row (6 rows -> 6 values);
# this modifies df in place
df["new_feature"]=[-1,-2,-3,-4,-5,-6]

In [21]:
df

Unnamed: 0,Name,Age,income,new_feature
0,Alex,15,300,-1
1,Bob,40,800,-2
2,Cathy,30,100,-3
3,Don,60,850,-4
4,Emma,19,100,-5
5,Frank,25,250,-6


In [22]:
# print column Name
df.loc[:,"Name"]

0     Alex
1      Bob
2    Cathy
3      Don
4     Emma
5    Frank
Name: Name, dtype: object

In [23]:
# print columns from Name to income, rows labelled 1 to 3 (the 2nd to 4th rows);
# unlike ordinary Python slices, .loc label slices include BOTH endpoints
df.loc[1:3,"Name":"income"]

Unnamed: 0,Name,Age,income
1,Bob,40,800
2,Cathy,30,100
3,Don,60,850


In [24]:
# reverse all the rows
df.loc[::-1,:]

Unnamed: 0,Name,Age,income,new_feature
5,Frank,25,250,-6
4,Emma,19,100,-5
3,Don,60,850,-4
2,Cathy,30,100,-3
1,Bob,40,800,-2
0,Alex,15,300,-1


In [25]:
# print all columns up to and including column Age
# (.loc label slices include the end label, so Age itself is returned)
df.loc[:,:"Age"]

Unnamed: 0,Name,Age
0,Alex,15
1,Bob,40
2,Cathy,30
3,Don,60
4,Emma,19
5,Frank,25


In [26]:
# print the first two columns (positions 0 and 1);
# .iloc is position-based and, like Python slices, excludes the end position 2
df.iloc[:,:2]

Unnamed: 0,Name,Age
0,Alex,15
1,Bob,40
2,Cathy,30
3,Don,60
4,Emma,19
5,Frank,25


#### filtering

In [27]:
#check all the values in the column income, whether the value is larger than 500 or not.
df.income>500

0    False
1     True
2    False
3     True
4    False
5    False
Name: income, dtype: bool

In [28]:
# select all the rows whose income is larger than 500
# (the boolean Series above is used as a row mask)
df[df.income>500]

Unnamed: 0,Name,Age,income,new_feature
1,Bob,40,800,-2
3,Don,60,850,-4


In [29]:
# check all the values in the column Age, whether the value is less than 35 or not.
df.Age<35

0     True
1    False
2     True
3    False
4     True
5     True
Name: Age, dtype: bool

In [30]:
# select all the rows whose age is less than 35
df[df.Age<35]

Unnamed: 0,Name,Age,income,new_feature
0,Alex,15,300,-1
2,Cathy,30,100,-3
4,Emma,19,100,-5
5,Frank,25,250,-6


In [34]:
# select all the rows whose income is larger than 500 and age is less than 41
# (the threshold is 41, not 35: nobody earning more than 500 is younger than 35,
# so an Age<35 cut-off would return an empty DataFrame).
# Each condition needs its own parentheses because & binds tighter than > and <.
df[(df.income>500) & (df.Age<41)]

Unnamed: 0,Name,Age,income,new_feature
1,Bob,40,800,-2


#### Simple Calculation

In [35]:
df.income.mean()

400.0

In [36]:
np.mean(df.income)

400.0

In [37]:
df.income.sum()

2400

In [38]:
# sample standard deviation: pandas divides by n-1 (ddof=1) by default,
# whereas np.std divides by n (ddof=0) unless told otherwise
df.income.std()

339.1164991562634

In [39]:
df.income.median()

275.0

#### Drop

In [40]:
df # we added a new feature, and now we want to drop it

Unnamed: 0,Name,Age,income,new_feature
0,Alex,15,300,-1
1,Bob,40,800,-2
2,Cathy,30,100,-3
3,Don,60,850,-4
4,Emma,19,100,-5
5,Frank,25,250,-6


In [41]:
# remove the "new_feature" column; drop() returns a new DataFrame,
# so the result is bound back to df
df = df.drop(columns=["new_feature"])

In [42]:
df

Unnamed: 0,Name,Age,income
0,Alex,15,300
1,Bob,40,800
2,Cathy,30,100
3,Don,60,850
4,Emma,19,100
5,Frank,25,250


#### Concatenating two tables vertically

In [43]:
# make a new dataframe from original dataframe df: its first 5 rows (head() default)
data1=df.head()

In [45]:
data1

Unnamed: 0,Name,Age,income
0,Alex,15,300
1,Bob,40,800
2,Cathy,30,100
3,Don,60,850
4,Emma,19,100


In [50]:
# make another new dataframe from original dataframe df: its last 5 rows (tail() default)
data2=df.tail()

In [51]:
data2

Unnamed: 0,Name,Age,income
1,Bob,40,800
2,Cathy,30,100
3,Don,60,850
4,Emma,19,100
5,Frank,25,250


In [52]:
# vertical concatenating: axis=0 stacks the rows of data2 below those of data1.
# The original row labels are kept, so labels 1-4 appear twice in the result;
# pass ignore_index=True to pd.concat to get a fresh 0..n-1 index instead.
data_concat=pd.concat([data1,data2], axis=0)

In [53]:
data_concat

Unnamed: 0,Name,Age,income
0,Alex,15,300
1,Bob,40,800
2,Cathy,30,100
3,Don,60,850
4,Emma,19,100
1,Bob,40,800
2,Cathy,30,100
3,Don,60,850
4,Emma,19,100
5,Frank,25,250


#### Concatenating two tables horizontally

In [55]:
# make a new series from original dataframe df
Income = df.income

In [56]:
Income

0    300
1    800
2    100
3    850
4    100
5    250
Name: income, dtype: int64

In [57]:
#make another new series from original dataframe df
age=df.Age

In [58]:
age

0    15
1    40
2    30
3    60
4    19
5    25
Name: Age, dtype: int64

In [59]:
# horizontal concatenating: place the two Series side by side as columns,
# aligned on their shared row index
data_h_concat = pd.concat(
    [Income, age],
    axis="columns",
)

In [60]:
data_h_concat

Unnamed: 0,income,Age
0,300,15
1,800,40
2,100,30
3,850,60
4,100,19
5,250,25


#### groupby

In [62]:
# toy data for groupby: three speed readings for object A, two for object B.
# NOTE: this rebinds df, so the people table built earlier is no longer
# reachable under this name.
df = pd.DataFrame(
    {
        "Object": ["A", "A", "A", "B", "B"],
        "Speed": [380.0, 370.0, 321.0, 24.0, 26.0],
    }
)

In [63]:
df

Unnamed: 0,Object,Speed
0,A,380.0
1,A,370.0
2,A,321.0
3,B,24.0
4,B,26.0


In [64]:
# mean Speed per Object: one row per group, with the group key as the index
df.groupby(['Object']).mean()

Unnamed: 0_level_0,Speed
Object,Unnamed: 1_level_1
A,357.0
B,25.0


In [65]:
# number of rows in each group, returned as a Series indexed by Object
df.groupby(['Object']).size()

Object
A    3
B    2
dtype: int64

In [66]:
# same counts as a regular DataFrame: reset_index turns the group key back
# into a column and names the sizes column 'count'
df.groupby(['Object']).size().reset_index(name='count')

Unnamed: 0,Object,count
0,A,3
1,B,2


#### Construct a DataFrame from numpy ndarray

In [67]:
# 2x3 integer array -> two rows and three columns named a, b, c
df = pd.DataFrame(
    data=np.array([[1, 2, 3], [4, 5, 6]]),
    columns=list("abc"),
)

In [68]:
df

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6


#### load files

In [72]:
# load data from excel
# openpyxl is the engine pandas uses to read .xlsx files; it is not called
# directly below, but importing it here fails fast with an ImportError
# if the package is not installed
import openpyxl

In [73]:
# read the first sheet of the workbook (read_excel default);
# the path is relative to the notebook's working directory
df_xlsx = pd.read_excel(r'./data/data.xlsx')

In [74]:
df_xlsx

Unnamed: 0,x1,x2,y
0,-1,2,0
1,3,3,0
2,1,4,0
3,2,7,0
4,5,6,0
5,7,2,1
6,9,1,1
7,8,5,1
8,9,7,1
9,10,3,1


In [69]:
# load data from csv file
df_csv = pd.read_csv(r'./data/data.csv')

In [70]:
df_csv

Unnamed: 0,x1,x2,y
0,-1,2,0
1,3,3,0
2,1,4,0
3,2,7,0
4,5,6,0
5,7,2,1
6,9,1,1
7,8,5,1
8,9,7,1
9,10,3,1


#### Summary

1. Learn how to create Series and DataFrame
2. Learn how to drop, vertically and horizontally concatenate, groupby
3. Learn how to load csv, xlsx files
4. Reference: https://pandas.pydata.org