# 02 Data manipulation with Pandas

## Libraries

In [None]:
import pandas as pd
import numpy as np

## Creating DataFrames

In [None]:
# Build the frame column by column: every keyword names a column and
# carries that column's values; `index` supplies the row labels.
df = pd.DataFrame(
    dict(
        a=[4, 5, 6],
        b=[7, 8, 9],
        c=[10, 11, 12],
    ),
    index=[1, 2, 3],
)

display(df)

Unnamed: 0,a,b,c
1,4,7,10
2,5,8,11
3,6,9,12


In [None]:
# Build the frame row by row: each inner list is one row, `index` labels
# the rows and `columns` names the positions within a row.
df = pd.DataFrame(
    data=[
        [4, 7, 10],
        [5, 8, 11],
        [6, 9, 12],
    ],
    columns=list("abc"),
    index=[1, 2, 3],
)
display(df)

Unnamed: 0,a,b,c
1,4,7,10
2,5,8,11
3,6,9,12


In [None]:
# Create a DataFrame whose rows are labelled by a two-level MultiIndex
# (levels named 'n' and 'v').
df = pd.DataFrame(
    dict(
        a=[4, 5, 6],
        b=[7, 8, 9],
        c=[10, 11, 12],
    ),
    index=pd.MultiIndex.from_tuples(
        [("d", 1), ("d", 2), ("e", 2)],
        names=["n", "v"],
    ),
)

print("DataFrame with a MultiIndex")
display(df)

# reset_index() flattens the frame: the index levels 'n' and 'v' become
# ordinary columns and the rows get a default integer index.
df_reset = df.reset_index()
print("\ndf_reset")
display(df_reset)

DataFrame with a MultiIndex


Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c
n,v,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
d,1,4,7,10
d,2,5,8,11
e,2,6,9,12



df_reset


Unnamed: 0,n,v,a,b,c
0,d,1,4,7,10
1,d,2,5,8,11
2,e,2,6,9,12


In [None]:
some_list = ['foo', 'bar', 'baz']

# Map every item to its position in the list. A dict comprehension builds
# the lookup table in one expression instead of filling an empty dict in a loop.
mapping = {value: position for position, value in enumerate(some_list)}

# Bare expression so the notebook echoes the resulting dict
mapping


{'bar': 1, 'baz': 2, 'foo': 0}

## Reshaping

Reshaping by pivoting DataFrame objects

### Pivot tables
While ``pivot()`` provides general purpose pivoting with various data types (strings, numerics, etc.), pandas also provides ``pivot_table()`` for pivoting with aggregation of numeric data. The function ``pivot_table()`` can be used to create spreadsheet-style pivot tables.

It takes a number of arguments:

* **data:** a DataFrame object.
* **index:** a column, Grouper, array which has the same length as data, or list of them. Keys to group by on the pivot table index. If an array is passed, it is used in the same manner as column values.
* **columns:** a column, Grouper, array which has the same length as data, or list of them. Keys to group by on the pivot table column. If an array is passed, it is used in the same manner as column values.
* **values:** a column or a list of columns to aggregate.
* **aggfunc:** function to use for aggregation, defaulting to numpy.mean.

```python
df.pivot_table(
  index= '',
  columns= [],
  values= [],
  aggfunc = [np.mean])
```

### Cross tabulations
Use crosstab() to compute a cross-tabulation of two (or more) factors. By default crosstab computes a frequency table of the factors unless an array of values and an aggregation function are passed.

It takes a number of arguments:

index: array-like, values to group by in the rows.

columns: array-like, values to group by in the columns.

values: array-like, optional, array of values to aggregate according to the factors.

aggfunc: function, optional, If no values array is passed, computes a frequency table.

rownames: sequence, default None, must match number of row arrays passed.

colnames: sequence, default None, if passed, must match number of column arrays passed.

margins: boolean, default False, Add row/column margins (subtotals)

normalize: boolean, {‘all’, ‘index’, ‘columns’}, or {0,1}, default False. Normalize by dividing all values by the sum of values.

Any Series passed will have their name attributes used unless row or column names for the cross-tabulation are specified.

## Datatypes: categorical

### Ordered categories

In [None]:
# Create a list of weather ratings in logical order
category_order = ['good', 'bad', 'worse']

# Change the data type of 'rating' to an ordered category.
# Current pandas no longer accepts `ordered=` / `categories=` as keywords of
# astype(); the categories and their order are described by a
# CategoricalDtype that is passed as the target dtype instead.
# NOTE(review): `weather` is not created in the visible cells, and its
# 'rating' column is derived from 'bad_conditions' in the following cell;
# this cell presumably has to run after both exist. Confirm the cell order.
weather['rating'] = weather.rating.astype(
    pd.CategoricalDtype(categories=category_order, ordered=True)
)

# Examine the head of 'rating'
print(weather.rating.head())

NameError: ignored

In [None]:
# Show how often each 'bad_conditions' value occurs, ordered by the value
print(weather['bad_conditions'].value_counts().sort_index())

# Lookup table from the integer score to a rating label:
# 0 -> 'good', 1-4 -> 'bad', 5-9 -> 'worse'
mapping = {
    0: 'good',
    **dict.fromkeys(range(1, 5), 'bad'),
    **dict.fromkeys(range(5, 10), 'worse'),
}

# Translate every 'bad_conditions' integer into its label and store the
# result in the 'rating' column
weather['rating'] = weather['bad_conditions'].map(mapping)

# Show how often each rating label occurs
print(weather['rating'].value_counts())