In [1]:
######################################################################################################
#  reference : 
#     https://towardsdatascience.com/a-quick-introduction-to-the-pandas-python-library-f1b678f34673
#     https://pandas.pydata.org/pandas-docs/stable/10min.html
######################################################################################################

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


# List indexed by integers
larsis_list = ["Shy Banana", "Shy Mango", "Shy Kiwi"]
# Wrapping the list in a Series gives every element an integer index label (0, 1, 2)
larsis_series = pd.Series(larsis_list)

# indexing (looks up the element stored under index label 0)
larsis_series[0]

# Create dataframe
# Row index: six consecutive timestamps starting at 2013-01-01.
# Values: 6x4 standard-normal draws; no seed is set, so they differ on every run.
dates = pd.date_range('20130101', periods = 6)
df = pd.DataFrame(np.random.randn(6,4), index = dates, columns = list('ABCD'))

# Change column names
# rename() returns a NEW frame and leaves df untouched, so the result must be
# bound to a name or the rename is silently lost. df deliberately keeps its
# columns A-D because the statements below address them by those labels.
df_renamed = df.rename(columns = {'A' : 'geiles', 'B' : 'Geld'})

# viewing data
df.head()
df.tail()
df.index
df.columns

# Summary of data
df.describe()

# transposing
df.T

# Setting values (these statements modify df in place)
df.at[dates[0],'A'] = 0 # by label
df.iat[0,1] = 1 # by position
df.loc[:,'D'] = np.array([5] * len(df)) # multiple values with numpy array
df[df < 0] = -df # set values by condition (change sign of negative values)

# sorting values by column (returns a sorted copy; df itself keeps its order)
df.sort_values(by = 'B')

# selecting columns
df['A']        # single label -> Series
df[['A','B']]  # list of labels -> DataFrame

# selecting rows
df[0:2]                    # slice by integer position
df['20130101':'20130102']  # slice by index label

# advanced selecting
df.loc['20130102', ['A','B']]  # loc selects by label
df.iloc[0,:]                   # iloc selects by integer position
df.iloc[0,0]
df.iloc[0:2,1:3]

# (Boolean) Filtering
df[df.A > 0.2]

# Advanced filtering with isin() method (returns rows in which a specified column has specified values)
# Work on a copy so that adding column E does not alter df
df2 = df.copy()
df2['E'] = ['one', 'one','two','three','four','three']
df2[df2['E'].isin(['two','four'])]


# Read data from csv
# The paths are relative, so the three files must sit in the working directory.
obst = pd.read_csv('fruits.csv')
new_obst = pd.read_csv('new_fruits.csv')
even_more_obst = pd.read_csv('even_more_fruits.csv')


# Check dimensions etc.
obst.info()

# Basic summary statistics
# The arithmetic reductions are run on the numeric columns only: recent pandas
# versions raise instead of silently skipping text columns.
# NOTE(review): assumes fruits.csv may contain non-numeric columns -- if it is
# purely numeric this selection is a no-op.
obst_numeric = obst.select_dtypes(include='number')
obst_numeric.mean()
obst_numeric.corr()
obst.count() # counts non-null values per column
obst.min()
obst.max()
obst_numeric.median()
obst_numeric.std()

# Check for Nan / get rid of NaN / replace NaN
# dropna() and fillna() return new frames; obst itself is left unchanged.
obst.isnull() # returns true for Null values
obst.dropna()
obst.fillna('POPO')

# join dataframes
more_obst = pd.concat([obst, new_obst], axis=1) # puts dataframes next to each other (rows must be equal)
# DataFrame.append was removed in pandas 2.0; pd.concat along the default axis
# (rows) is the equivalent call.
more_obst = pd.concat([more_obst, even_more_obst]) # puts dataframes below each other (columns must be equal)


ModuleNotFoundError: No module named 'matplotlib'

In [8]:
######################################################################################################
#  reference :
#     https://towardsdatascience.com/a-quick-introduction-to-the-pandas-python-library-f1b678f34673
#     https://pandas.pydata.org/pandas-docs/stable/10min.html
######################################################################################################

import pandas as pd
import numpy as np
#import matplotlib.pyplot as plt


# A plain Python list: its elements are addressed by integer position
larsis_list = [
    "Shy Banana",
    "Shy Mango",
    "Shy Kiwi",
]
larsis_list

['Shy Banana', 'Shy Mango', 'Shy Kiwi']

In [7]:
# Turn the list into a pandas Series; pandas supplies the integer index labels
larsis_series = pd.Series(data=larsis_list)
larsis_series

0    Shy Banana
1     Shy Mango
2      Shy Kiwi
dtype: object

In [9]:
# indexing: fetch the element stored under index label 0
larsis_series.loc[0]

'Shy Banana'

In [11]:
# Create dataframe -- step 1: the row index.
# Six consecutive timestamps, one per day, starting at 2013-01-01.
dates = pd.date_range(start='20130101', periods=6)
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [12]:
# Step 2: a 6x4 frame of standard-normal draws, rows labelled by `dates`,
# columns labelled A-D. No seed is set, so the numbers change on every run.
column_labels = list('ABCD')
random_values = np.random.randn(6, 4)
df = pd.DataFrame(data=random_values, index=dates, columns=column_labels)
df

Unnamed: 0,A,B,C,D
2013-01-01,-0.365437,-1.617825,0.123522,-1.281333
2013-01-02,0.836601,0.335053,-0.621821,-0.035748
2013-01-03,1.139167,-0.668782,-0.700436,-1.325677
2013-01-04,-1.60757,0.034954,-0.701009,-0.796876
2013-01-05,2.455484,0.568284,-0.058485,0.609774
2013-01-06,0.005873,-0.694772,-1.306644,0.147801


In [16]:
# Change column names
# rename() hands back a new frame, so the result is bound to df2;
# df itself keeps its original labels A and B.
new_names = {'A' : 'geiles', 'B' : 'Geld'}
df2 = df.rename(columns=new_names)
df2

Unnamed: 0,geiles,Geld,C,D
2013-01-01,-0.365437,-1.617825,0.123522,-1.281333
2013-01-02,0.836601,0.335053,-0.621821,-0.035748
2013-01-03,1.139167,-0.668782,-0.700436,-1.325677
2013-01-04,-1.60757,0.034954,-0.701009,-0.796876
2013-01-05,2.455484,0.568284,-0.058485,0.609774
2013-01-06,0.005873,-0.694772,-1.306644,0.147801


In [19]:
# Setting values
# .at addresses exactly one cell by (row label, column label); df is modified in place
first_row_label = dates[0]
df.at[first_row_label, 'A'] = 3
df

Unnamed: 0,A,B,C,D
2013-01-01,3.0,-1.617825,0.123522,-1.281333
2013-01-02,0.836601,0.335053,-0.621821,-0.035748
2013-01-03,1.139167,-0.668782,-0.700436,-1.325677
2013-01-04,-1.60757,0.034954,-0.701009,-0.796876
2013-01-05,2.455484,0.568284,-0.058485,0.609774
2013-01-06,0.005873,-0.694772,-1.306644,0.147801


In [21]:
# .iat addresses exactly one cell by integer position:
# row 0, column 1 (the second column, B); df is modified in place
row_pos, col_pos = 0, 1
df.iat[row_pos, col_pos] = 1
df

Unnamed: 0,A,B,C,D
2013-01-01,3.0,1.0,0.123522,-1.281333
2013-01-02,0.836601,0.335053,-0.621821,-0.035748
2013-01-03,1.139167,-0.668782,-0.700436,-1.325677
2013-01-04,-1.60757,0.034954,-0.701009,-0.796876
2013-01-05,2.455484,0.568284,-0.058485,0.609774
2013-01-06,0.005873,-0.694772,-1.306644,0.147801


In [23]:
# Replace an entire column
# Build one value per row with numpy, then assign the array to every row of column D
replacement = np.array([2] * len(df))
df.loc[:, 'D'] = replacement
df

Unnamed: 0,A,B,C,D
2013-01-01,3.0,1.0,0.123522,2
2013-01-02,0.836601,0.335053,-0.621821,2
2013-01-03,1.139167,-0.668782,-0.700436,2
2013-01-04,-1.60757,0.034954,-0.701009,2
2013-01-05,2.455484,0.568284,-0.058485,2
2013-01-06,0.005873,-0.694772,-1.306644,2


In [24]:
# Set values by condition: every cell below zero is replaced by its negation,
# which leaves the frame with non-negative values only
is_negative = df < 0
df[is_negative] = -df
df

Unnamed: 0,A,B,C,D
2013-01-01,3.0,1.0,0.123522,2
2013-01-02,0.836601,0.335053,0.621821,2
2013-01-03,1.139167,0.668782,0.700436,2
2013-01-04,1.60757,0.034954,0.701009,2
2013-01-05,2.455484,0.568284,0.058485,2
2013-01-06,0.005873,0.694772,1.306644,2


In [27]:
# Set values by condition: add 2 to every cell that is currently below 5.
# NOTE: not idempotent -- each re-run shifts whatever is still below 5 again.
# NOTE(review): the recorded output is 4 higher than the previous cell's values,
# so it appears to reflect two runs of this cell -- confirm before relying on it.
needs_shift = df < 5
df[needs_shift] = df + 2
df

Unnamed: 0,A,B,C,D
2013-01-01,5.0,5.0,4.123522,6
2013-01-02,4.836601,4.335053,4.621821,6
2013-01-03,5.139167,4.668782,4.700436,6
2013-01-04,5.60757,4.034954,4.701009,6
2013-01-05,6.455484,4.568284,4.058485,6
2013-01-06,4.005873,4.694772,5.306644,6


In [33]:
# sorting values by column
# sort_values() returns a sorted copy; df keeps its order. Only the last
# expression of a cell is rendered, so this result is not shown.
df.sort_values('B')

# selecting columns
# Indexing with a list of labels yields a DataFrame, even for a single column
one_column = ['A']
two_columns = ['A', 'B']
df[one_column]
df[two_columns]

Unnamed: 0,A,B
2013-01-01,5.0,5.0
2013-01-02,4.836601,4.335053
2013-01-03,5.139167,4.668782
2013-01-04,5.60757,4.034954
2013-01-05,6.455484,4.568284
2013-01-06,4.005873,4.694772


In [35]:

# selecting rows
# A slice inside [] picks rows -- first by integer position (end excluded) ...
df[:2]
# ... then by index label; with date strings both endpoints are included
start_label, end_label = '20130101', '20130102'
df[start_label:end_label]


Unnamed: 0,A,B,C,D
2013-01-01,5.0,5.0,4.123522,6
2013-01-02,4.836601,4.335053,4.621821,6


In [39]:
# (Boolean) Filtering: keep only the rows whose value in column A exceeds 5
a_above_five = df['A'] > 5
df[a_above_five]

Unnamed: 0,A,B,C,D
2013-01-03,5.139167,4.668782,4.700436,6
2013-01-04,5.60757,4.034954,4.701009,6
2013-01-05,6.455484,4.568284,4.058485,6


In [42]:

# Advanced filtering with isin(): keep the rows whose value in a given column
# is one of a set of wanted values
row_tags = ['one', 'one','two','three','four','three']
wanted_tags = ['two','four']

# Work on a copy so that adding column E leaves df unchanged
df2 = df.copy()
df2['E'] = row_tags

tag_matches = df2['E'].isin(wanted_tags)
df2[tag_matches]


Unnamed: 0,A,B,C,D,E
2013-01-03,5.139167,4.668782,4.700436,6,two
2013-01-05,6.455484,4.568284,4.058485,6,four
