In [None]:
# -*- coding: utf-8 -*-
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Numpy and Pandas (more advanced)

* Numpy quickstart: https://numpy.org/doc/stable/user/quickstart.html
* NumPy Reference: https://numpy.org/doc/stable/reference/
* Pandas Getting started tutorials: https://pandas.pydata.org/docs/getting_started/intro_tutorials/index.html

In [None]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn

In [None]:
array = [1, 2, 3]
array * 3

In [None]:
np_array = np.array([1, 2, 3])
np_array * 3

In [None]:
x = np.arange(20).reshape(4, 5) # you can try multiple different dimensions
x

In [None]:
x.shape

In [None]:
x.ndim

In [None]:
# sum along a single dimension: axis=1 adds up the values within each row
# other aggregation functions (mean, min, max, ...) accept axis the same way
# axis is optional - without it the sum runs over all elements
x.sum(axis=1)

## Multiple types of numbers

In [None]:
x.dtype

In [None]:
a = np.array([.1, .2])
print(a)
a.dtype

In [None]:
c = np.array([[1, 2], [3, 4] ], dtype=complex)
print(c)
c.dtype

## Different ways to make an array

In [None]:
np.array([1, 2, 3])

In [None]:
np.zeros((3, 4))

In [None]:
np.ones((2, 5))

In [None]:
np.repeat(3, 10).reshape([2, 5])

In [None]:
np.linspace(0, 2, 9)

In [None]:
x = np.linspace( 0, 2*np.pi, 100 )
f = np.sin(x)
f

In [None]:
# plot sin(x) against x itself rather than against the sample index 0..99
plt.plot(x, f)
plt.xlabel('x [rad]')
plt.ylabel('sin(x)');

## Matrix operations

In [None]:
A = np.array( [[1, 1], [0, 1]] )
B = np.array( [[2, 0], [3, 4]] )

In [None]:
A

In [None]:
B

In [None]:
np.transpose(B)

In [None]:
A*B

In [None]:
# actual matrix multiplication
A.dot(B) # np.dot(A, B)

## Selecting elements

In [None]:
a = np.arange(10)**3
a

In [None]:
a[2]

In [None]:
a[2:5]

In [None]:
a[:6:2]

In [None]:
# a slice can also be used on the left-hand side to overwrite the selected elements
a[:6:2] = -1000
a

In [None]:
a[ 9: 3:-1]

## Selecting elements from a multidimensional array

In [None]:
b = np.arange(20).reshape(4, 5)
b

In [None]:
# dimensions are separated by a comma
b[2, 3]

In [None]:
b[2,]

In [None]:
b[1:3, 2:4]

In [None]:
b[:, 2:4]

## What is Pandas?
* `data.frame` structures 
* Enables basic operations with data, sampling, group by, merge, ...
* Editing the form of data (data cleaning, reshaping, wrangling)
* Very easy basics of exploratory analysis and working with missing values

**What do we need Pandas for?**
* import data from standard formats
* clean up
* look at the data (statistics, sampling, basic graphs)
* pass the data on to analysis / model training

**Essential Tasks**
* Handling missing data (.dropna(), pd.isnull())
* Merge, join (concat, join)
* Group
* Changing the shape of the data (pivoting) (stack, pivot)
* Working with time series (resampling, timezones, ..)
* Plotting

**Pandas builds its `Series` and `DataFrame` types on top of Numpy arrays**

In [None]:
s = pd.Series([8, 6, 2, 3, 4])
s

In [None]:
# compared to a plain numpy array, a Series carries an explicit index
s.index

In [None]:
s.values

In [None]:
s[0]

In [None]:
# unlike numpy, however, the index can be something other than a number
s2 = pd.Series(np.arange(4), index=['a', 'b', 'c', 'd'])
s2

In [None]:
s2['c']

In [None]:
# positional access must be explicit: on a label-indexed Series, s2[2] relies on a
# deprecated positional fallback of [] and is meant to look up the *label* 2 instead
s2.iloc[2]

In [None]:
s2.c

In [None]:
# a dictionary can also be used to create a Series; its keys become the index
population = pd.Series({'Germany': 81.3, 'Belgium': 11.3, 'France': 64.3, 'United Kingdom': 64.9, 'Netherlands': 16.9})
population

In [None]:
population['France']

In [None]:
# since it is built on Numpy, we can do all the interesting operations
population * 1000

In [None]:
# the index implicitly has a given order, so you can make a range
population['Belgium':'Netherlands']

In [None]:
population.mean()

In [None]:
population[['France', 'Netherlands']]

In [None]:
population[population > 20]

A `DataFrame` is a two-dimensional table: each of its columns is a `Series`, and all columns share the same index

In [None]:
data = {'country': ['Belgium', 'France', 'Germany', 'Netherlands', 'United Kingdom'],
        'population': [11.3, 64.3, 81.3, 16.9, 64.9],
        'area': [30510, 671308, 357050, 41526, 244820],
        'capital': ['Brussels', 'Paris', 'Berlin', 'Amsterdam', 'London']}
countries = pd.DataFrame(data)
countries

In [None]:
countries.index

In [None]:
countries.columns

In [None]:
countries.values

In [None]:
countries.dtypes

In [None]:
countries.info()

In [None]:
countries.describe()

In [None]:
countries.describe(include='all')

In [None]:
countries = countries.set_index('country')
countries

and we can now very easily access individual columns

In [None]:
countries.area # countries['area']

In [None]:
countries['population']*1000000 / countries['area'] # population density

In [None]:
# we can easily create a new column
countries['density'] = countries['population']*1000000 / countries['area']
countries

In [None]:
# another way to process all rows: apply calls the given function on every value of the column
# NOTE(review): 2.58999 is presumably the number of square kilometres in a square mile (area looks like km^2) - confirm
countries['area_miles'] = countries.area.apply(lambda x: x / 2.58999)
countries['area_miles']

In [None]:
# with axis=1 the function receives a whole row, so several columns can be used at once
countries.apply(lambda x: x.area / 2.58999, axis=1)
# with axis=0 the function would receive each column instead, e.g. to compute per-column statistics

In [None]:
# and select rows based on it, for example
countries[countries['density'] > 300]

In [None]:
# we can then arrange, for example
countries.sort_values(by='density', ascending=False)

In [None]:
# NOTE(review): this cell reportedly emits a warning caused by a library bug - confirm whether it still does
# plotting straight from a DataFrame is a very powerful feature
# countries.density.plot()
# countries.density.plot(kind='bar')
countries.plot()

In [None]:
countries.plot(kind='scatter', x='population', y='area')

Since a `DataFrame` also lets us select columns by name, selecting elements is a bit more complicated than in Numpy. We have to distinguish between
* selection by name and
* selection by position.

In [None]:
countries['area']

In [None]:
countries[['area', 'density']]

In [None]:
# but a slice inside [] selects rows (by index label), not columns
countries['France':'Netherlands']

For more advanced selection from the table, we use:
* `loc` (selection by label) and
* `iloc` (selection by integer position)

In [None]:
# access to a specific cell using row and column
countries.loc['Germany', 'area']

In [None]:
# ranges on both dimensions can also be used here
countries.loc['France':'Germany', :]

In [None]:
# a boolean condition can also select the rows, combined with a list of columns
countries.loc[countries['density']>300, ['capital', 'population']]

In [None]:
# iloc selects in order. This is similar to accessing elements as in Numpy
countries.iloc[0:2,1:3]

In [None]:
# of course, values can also be assigned through loc
countries.loc['Belgium':'Germany', 'population'] = 10
countries

## Data reshaping using Pandas

In [None]:
df = pd.DataFrame({'A':['one', 'one', 'two', 'two'], 'B':['a', 'b', 'a', 'b'], 'C':range(4)})
# df = pd.DataFrame({'A':['one', 'one', 'two', 'two'], 'B':['a', 'b', 'a', 'b' ], 'C':range(4), 'D':range(4)})
df

`unstack` takes the values of an index level and turns them into column names

it often comes in handy if we have data that is in a slightly different form than we need

In [None]:
df = df.set_index(['A', 'B']) # first we choose the columns that will be used as the index.
# unstack then turns the values of the last index level into the names of the new columns
df

In [None]:
# now we pick the column that holds the values and let unstack rearrange it
result = df['C'].unstack()
result

### When might I need such a transformation?

Imagine that you have the logs of some application where you have the id of the user and the name of the action he performed. You want to know how many times each user performed different actions and plot it in an image.

In [None]:
row_count = 20
# a seeded generator makes the sample (and every table built from it) reproducible
rng = np.random.default_rng(42)
user_ids = rng.choice([1, 2, 3, 4], row_count)
# 'retrieve' is listed twice, which makes it twice as likely as the other actions
actions = rng.choice(['create', 'update', 'delete', 'retrieve', 'retrieve'], row_count)

df = pd.DataFrame({'user_id': user_ids, 'action': actions})
df.head()

In [None]:
pom = df.groupby(['user_id', 'action']).size().reset_index() # we will discuss groupby operation later
pom 

In [None]:
pom = pom.set_index(['user_id', 'action'])
pom

In [None]:
pom[0].unstack()
# pom[0].unstack(fill_value=0)

### The opposite operation is stack

In [None]:
# The opposite transformation is a stack. It takes column names and turns them into values
df = result.stack().reset_index(name='C')
df

This operation is used, for example, if you have multiple observations in one row and want to split them. For example, you have temperature on a row for different hours of the day, and you want to have only one temperature value on each row, and you want to have 24 rows for each day instead.

In [None]:
from datetime import datetime, timedelta
row_count = 5
# the last `row_count` days before today, oldest first
index = [datetime.now().date() - timedelta(days=i) for i in range(row_count, 0, -1)]
# seeded generator so the random temperatures are the same on every run
rng = np.random.default_rng(42)
# one entry per hour (0-23); each holds row_count values in the range [20, 25)
data = dict(zip(range(24), rng.random((24, row_count)) * 5 + 20))
df = pd.DataFrame(index=index, data=data)
df.head()

In [None]:
df.stack().reset_index(name='temperature')

## Similar to unstack, pivot also works
It is essentially a special case of the previous two functions, but it is easier to understand and covers most everyday needs. It mainly serves as a replacement for unstack.

[nice example with explanation](http://nikgrozev.com/2015/07/01/reshaping-in-pandas-pivot-pivot-table-stack-and-unstack-explained-with-pictures/)

In [None]:
# pivot is very similar to unstack, but you name the index, columns and values columns explicitly (and values may cover several columns)
df = pd.DataFrame({'A':['one', 'one', 'two', 'two'], 'B':['a', 'b', 'a', 'b'], 'C':range(4)})
df

In [None]:
df.pivot(index='A', columns='B', values='C')

In [None]:
# pivot_table is similar to pivot, but it can handle duplicate index/column combinations by aggregating them with a function you choose
df = pd.DataFrame({'A':['one', 'one', 'two', 'two', 'one', 'two'], 'B':['a', 'b', 'a', 'b', 'a', 'b'], 'C':range(6)})
df

In [None]:
# pass the aggregation by name: aggfunc=np.sum triggers a pandas FutureWarning; aggfunc defaults to 'mean'
df.pivot_table(index='A', columns='B', values='C', aggfunc='sum')

## Another common operation is groupby
which you probably know from SQL

In [None]:
df = pd.DataFrame({'key':['A','B','C','A','B','C','A','B','C'],
                   'data': [0, 5, 10, 5, 10, 15, 10, 15, 20]})
df

In [None]:
df.groupby('key').aggregate('sum') # df.groupby('key').sum()

# A few other useful things when working with Pandas DataFrame

In [None]:
df = pd.read_csv('https://raw.githubusercontent.com/rasbt/python_reference/master/Data/some_soccer_data.csv')
df.head()

In [None]:
# renaming selected columns
df = df.rename(columns={'P': 'points', 
                        'GP': 'games',
                        'SOT': 'shots_on_target',
                        'G': 'goals',
                        'PPG': 'points_per_game',
                        'A': 'assists',})
df.head()

## Transformation of the values in the column

In [None]:
# remove any '$' / 'm' characters from both ends of each salary string.
# The vectorised .str accessor leaves missing values as NaN, whereas
# apply(lambda x: x.strip(...)) would raise on a non-string (NaN) entry.
df['SALARY'] = df['SALARY'].str.strip('$m')
df.head()

## Adding an empty column

In [None]:
# create both placeholder columns, each filled with empty strings
for placeholder in ('team', 'position'):
    df[placeholder] = pd.Series('', index=df.index)
df.head()

## Transforming one column and filling several at once

In [None]:
def process_player_col(text):
    """Split one raw PLAYER cell into name, team and position.

    The text is split on a newline into the player name and the rest; the
    rest is split on ' — ' into position and team, both stripped of
    surrounding whitespace.

    Returns a pd.Series holding [name, team, position] (in that order).
    Raises ValueError if either split does not yield exactly two parts.
    """
    player_name, details = text.split('\n')
    role, club = (part.strip() for part in details.split(' — '))
    return pd.Series([player_name, club, role])

df[['PLAYER', 'team', 'position']] = df.PLAYER.apply(process_player_col)
df.head()

In [None]:
# NOTE(review): 'bla' is not referenced by any later cell shown here - looks like a leftover, confirm before removing
df['bla'] = pd.Series('', index=df.index)

## Finding how many rows and columns have empty values

Number of rows with at least one blank value

In [None]:
df.shape[0] - df.dropna().shape[0]

The number of empty values in each column

In [None]:
df.isnull().sum()

The number of empty values in each row

In [None]:
df.isnull().sum(axis=1)

The total number of empty values in the data

In [None]:
df.isnull().sum().sum()

## Selection of rows where there are empty values

Based on a single attribute

In [None]:
df[df['assists'].isnull()]

Rows with at least one empty value

In [None]:
df[df.isnull().any(axis=1)]

## Selection of full rows

Based on a single attribute

In [None]:
df[df['assists'].notnull()]
# df[~df['assists'].isnull()]

Based on all attributes

In [None]:
df[df.notnull().all(axis=1)]

Easier

In [None]:
df.dropna()

## Combining conditions
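Note the parentheses around each condition: `|` and `&` bind more tightly than comparison operators such as `==`, so they are required.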

In [None]:
df[ (df['team'] == 'Arsenal') | (df['team'] == 'Chelsea') ]

In [None]:
df[ (df['team'] == 'Arsenal') & (df['position'] == 'Forward') ]

## SQL in Pandas

In [None]:
from pandasql import sqldf

In [None]:
from pandasql import load_meat, load_births

meat = load_meat()
births = load_births()

In [None]:
type(meat)

In [None]:
meat.head()

In [None]:
births.head()

In [None]:
data = {'meat': meat}

In [None]:
sqldf('select * from meat limit 10', data)

In [None]:
data2 = {'meat2': meat}

In [None]:
sqldf('select * from meat2 limit 10', data2)

In [None]:
sqldf('select * from meat limit 10', locals())

In [None]:
sqldf('select * from births limit 10', locals())

In [None]:
q = """
    SELECT
        m.date
        , b.births
        , m.beef
    FROM
        meat m
    INNER JOIN
        births b
            on m.date = b.date
    ORDER BY
        m.date
    LIMIT 100;
    """

# locals() hands the notebook's variables to sqldf, which resolves the table names used in the query
joined = sqldf(q, locals())
# end the cell with the expression so the frame is rendered as a rich table instead of printed text
joined.head()

Pandasql runs on SQLite3, so you can do all classic operations in SQL here as well. Conditions, nested queries, joins, unions, functions, ...

# Literature

### Numpy
* 100 tasks also with sample solutions - https://github.com/rougier/numpy-100
* Other exercises with solutions - https://www.w3resource.com/python-exercises/numpy/index.php
* Cheat Sheet - https://www.datacamp.com/community/blog/python-numpy-cheat-sheet

### Pandas
* 100 puzzles with sample solutions - https://github.com/ajcr/100-pandas-puzzles
* Other exercises with solutions - https://www.w3resource.com/python-exercises/pandas/index.php
* Tutorial directly from the library documentation - https://pandas.pydata.org/pandas-docs/stable/tutorials.html 
* Exercises on various real datasets - https://github.com/guipsamora/pandas_exercises
* Pandas Cheat Sheet - https://www.datacamp.com/community/blog/python-pandas-cheat-sheet
* Data Wrangling in Python Cheat Sheet - https://www.datacamp.com/community/blog/pandas-cheat-sheet-python


### More very nice tutorials for working with data using Pandas
https://github.com/ResearchComputing/Meetup-Fall-2013

Selected specific parts that are of special interest to us:

* https://github.com/ResearchComputing/Meetup-Fall-2013/blob/master/python/lecture_10_pandas_introduction.ipynb
* https://github.com/ResearchComputing/Meetup-Fall-2013/blob/master/python/lecture_11_pandas_adding_data.ipynb
* https://github.com/ResearchComputing/Meetup-Fall-2013/blob/master/python/lecture_12_pandas_groupby.ipynb
* https://github.com/ResearchComputing/Meetup-Fall-2013/blob/master/python/lecture_13_pandas_movies.ipynb
* https://github.com/ResearchComputing/Meetup-Fall-2013/blob/master/python/lecture_14_pandas_reshape.ipynb
* https://github.com/ResearchComputing/Meetup-Fall-2013/blob/master/python/lecture_15_pandas_transforming.ipynb
* https://github.com/ResearchComputing/Meetup-Fall-2013/blob/master/python/lecture_21_pandas_processing.ipynb
* https://github.com/ResearchComputing/Meetup-Fall-2013/blob/master/python/lecture_22_pandas_cleaning.ipynb
* https://github.com/ResearchComputing/Meetup-Fall-2013/blob/master/python/lecture_23_titanic_example.ipynb