# Data Munging Basics
## Segment 4 - Concatenating and transforming data

In [3]:
import numpy as np
import pandas as pd

from pandas import Series, DataFrame

In [4]:
# Build a 6x6 frame holding the integers 0 through 35, filled row by row
df_object = DataFrame(np.reshape(np.arange(36), (6, 6)))
df_object

Unnamed: 0,0,1,2,3,4,5
0,0,1,2,3,4,5
1,6,7,8,9,10,11
2,12,13,14,15,16,17
3,18,19,20,21,22,23
4,24,25,26,27,28,29
5,30,31,32,33,34,35


In [5]:
# Build a smaller 5x3 frame holding the integers 0 through 14, filled row by row
df_object2 = DataFrame(np.reshape(np.arange(15), (5, 3)))
df_object2

Unnamed: 0,0,1,2
0,0,1,2
1,3,4,5
2,6,7,8
3,9,10,11
4,12,13,14


### Concatenating data

In [6]:
# pd.concat with axis="columns" (the same as axis=1) places the frames side by side,
# lining the rows up by their index labels.
# Every column from both frames is kept; rows whose label is missing from one frame get NaN there
pd.concat([df_object, df_object2], axis="columns")

Unnamed: 0,0,1,2,3,4,5,0.1,1.1,2.1
0,0,1,2,3,4,5,0.0,1.0,2.0
1,6,7,8,9,10,11,3.0,4.0,5.0
2,12,13,14,15,16,17,6.0,7.0,8.0
3,18,19,20,21,22,23,9.0,10.0,11.0
4,24,25,26,27,28,29,12.0,13.0,14.0
5,30,31,32,33,34,35,,,


In [7]:
# With axis=0 (the default, so the argument may also be omitted) the frames are stacked
# one beneath the other and lined up by column label instead.
# Each frame keeps its own row labels, so labels repeat; columns absent from a frame become NaN
pd.concat([df_object, df_object2], axis=0)

Unnamed: 0,0,1,2,3,4,5
0,0,1,2,3.0,4.0,5.0
1,6,7,8,9.0,10.0,11.0
2,12,13,14,15.0,16.0,17.0
3,18,19,20,21.0,22.0,23.0
4,24,25,26,27.0,28.0,29.0
5,30,31,32,33.0,34.0,35.0
0,0,1,2,,,
1,3,4,5,,,
2,6,7,8,,,
3,9,10,11,,,
4,12,13,14,,,


### Transforming data
#### Dropping data

In [8]:
# drop() returns a copy of the frame without the listed row labels
df_object.drop(index=[2, 3])

Unnamed: 0,0,1,2,3,4,5
0,0,1,2,3,4,5
1,6,7,8,9,10,11
4,24,25,26,27,28,29
5,30,31,32,33,34,35


In [9]:
# Passing the labels through columns= (equivalent to axis=1) drops columns instead of rows
df_object.drop(columns=[2, 3])

Unnamed: 0,0,1,4,5
0,0,1,4,5
1,6,7,10,11
2,12,13,16,17
3,18,19,22,23
4,24,25,28,29
5,30,31,34,35


### Adding data

In [10]:
# A Series can be given its name at construction time through the name argument
series_object = Series(np.arange(6), name='added_variable')
series_object

0    0
1    1
2    2
3    3
4    4
5    5
Name: added_variable, dtype: int64

In [11]:
# join() attaches a named Series to the dataframe as a new column, matching rows on the index.
# The new column is labelled with the name of the Series.
# join is an instance method, so it is called on the dataframe itself
df_object.join(series_object)

Unnamed: 0,0,1,2,3,4,5,added_variable
0,0,1,2,3,4,5,0
1,6,7,8,9,10,11,1
2,12,13,14,15,16,17,2
3,18,19,20,21,22,23,3
4,24,25,26,27,28,29,4
5,30,31,32,33,34,35,5


In [12]:
# New rows are added by concatenating frames along the row axis with pd.concat.
# (DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0.)
# Here we stack the dataframe onto itself.
# Note that the new dataframe retains the row labels of both inputs, so each label appears twice
pd.concat([df_object, df_object])

Unnamed: 0,0,1,2,3,4,5
0,0,1,2,3,4,5
1,6,7,8,9,10,11
2,12,13,14,15,16,17
3,18,19,20,21,22,23
4,24,25,26,27,28,29
5,30,31,32,33,34,35
0,0,1,2,3,4,5
1,6,7,8,9,10,11
2,12,13,14,15,16,17
3,18,19,20,21,22,23
4,24,25,26,27,28,29
5,30,31,32,33,34,35


In [13]:
# Setting the ignore_index argument to True discards the original row labels
# and renumbers the rows of the resulting dataframe from 0.
# pd.concat is used because DataFrame.append was removed in pandas 2.0
pd.concat([df_object, df_object], ignore_index=True)

Unnamed: 0,0,1,2,3,4,5
0,0,1,2,3,4,5
1,6,7,8,9,10,11
2,12,13,14,15,16,17
3,18,19,20,21,22,23
4,24,25,26,27,28,29
5,30,31,32,33,34,35
6,0,1,2,3,4,5
7,6,7,8,9,10,11
8,12,13,14,15,16,17
9,18,19,20,21,22,23
10,24,25,26,27,28,29
11,30,31,32,33,34,35


### Sorting data

In [14]:
# sort_values() orders the rows by the values held in one or more columns.
# by takes a single column label or a list of labels; here column 5, ascending (the default)
df_object.sort_values(by=5)

Unnamed: 0,0,1,2,3,4,5
0,0,1,2,3,4,5
1,6,7,8,9,10,11
2,12,13,14,15,16,17
3,18,19,20,21,22,23
4,24,25,26,27,28,29
5,30,31,32,33,34,35


In [16]:
# Passing ascending=False sorts in descending order, so the largest values in column 5 come first
df_object.sort_values(by=5, ascending=False)

Unnamed: 0,0,1,2,3,4,5
5,30,31,32,33,34,35
4,24,25,26,27,28,29
3,18,19,20,21,22,23
2,12,13,14,15,16,17
1,6,7,8,9,10,11
0,0,1,2,3,4,5
