# Pandas - DataFrame and Series
### Pandas is a powerful data-manipulation library for Python, widely used for data analysis and data cleaning.

In [3]:
pip install pandas

Note: you may need to restart the kernel to use updated packages.


In [9]:
import pandas as pd
import numpy as np

## Series
### A pandas Series is a one-dimensional labeled object that can hold any data type. It is similar to a column in a table.

In [10]:
# Build a Series from a plain Python list.
# No index is supplied, so the printed labels are the default 0..n-1 positions.
data = [1, 2, 3, 4, 5]
series = pd.Series(data=data)
print(series)

0    1
1    2
2    3
3    4
4    5
dtype: int64


In [None]:
# Build a Series from a one-dimensional NumPy array.
# As with a list, the default 0..n-1 index is used when none is given.
arr1 = np.array([1, 2, 3, 4, 5])
series = pd.Series(data=arr1)
print(series)

0    1
1    2
2    3
3    4
4    5
dtype: int64


In [21]:
# Create a Series from a dictionary: the keys become the index labels
# and the values become the data.
# The mapping is deliberately not named `dict`; that name would shadow the
# built-in type for every cell that runs afterwards in the same kernel.
letter_values = {
    'a': 1,
    'b': 2,
    'c': 3
}
series = pd.Series(letter_values)
print(series)

a    1
b    2
c    3
dtype: int64


In [25]:
# Build a Series with explicit index labels instead of the default positions.
# The i-th label in `index` is attached to the i-th value in `arr2`.
arr2 = [2, 4, 6, 8, 10]
index = [1, 2, 3, 4, 5]

series = pd.Series(data=arr2, index=index)
print(series)

1     2
2     4
3     6
4     8
5    10
dtype: int64


## DataFrame
### A DataFrame is a two-dimensional table whose columns can hold different (heterogeneous) data types.

In [26]:
# Create a DataFrame from a dictionary of equal-length lists:
# each key becomes a column name and each list supplies that column's values.
data = {
    'Name': ['Gagan', 'Sachin', 'Hasibul', 'Amit Kumar', 'Amit Patel'],
    'Role': [
        'Backend Developer',
        'Frontend Developer',
        'Backend Developer',
        'App Developer',
        'Frontend Developer',
    ],
    # Every row shares the same organization.
    'Organization': ['Primberry'] * 5,
}

data_frame = pd.DataFrame(data=data)
print(data_frame)

         Name                Role Organization
0       Gagan   Backend Developer    Primberry
1      Sachin  Frontend Developer    Primberry
2     Hasibul   Backend Developer    Primberry
3  Amit Kumar       App Developer    Primberry
4  Amit Patel  Frontend Developer    Primberry


In [27]:
# Create a DataFrame from a list of dictionaries:
# each dictionary is one row, and its keys name the columns.
data = [
    {'Name': 'Gagan Chauhan', 'Age': 23},
    {'Name': 'Sanjeev Chauhan', 'Age': 45},
]

data_frame = pd.DataFrame(data=data)
print(data_frame)

              Name  Age
0    Gagan Chauhan   23
1  Sanjeev Chauhan   45


In [None]:
# Load the CSV file into a DataFrame and preview its first two rows.
file = pd.read_csv('data_file.csv')
file.head(n=2)

Unnamed: 0,Series_reference,Period,Data_value,Suppressed,STATUS,UNITS,Magnitude,Subject,Group,Series_title_1,Series_title_2,Series_title_3,Series_title_4,Series_title_5
0,BDCQ.SEA1AA,2011.06,80078.0,,F,Number,0,Business Data Collection - BDC,Industry by employment variable,Filled jobs,"Agriculture, Forestry and Fishing",Actual,,
1,BDCQ.SEA1AA,2011.09,78324.0,,F,Number,0,Business Data Collection - BDC,Industry by employment variable,Filled jobs,"Agriculture, Forestry and Fishing",Actual,,


In [31]:
# Preview the last two rows of the loaded CSV.
file.tail(n=2)

Unnamed: 0,Series_reference,Period,Data_value,Suppressed,STATUS,UNITS,Magnitude,Subject,Group,Series_title_1,Series_title_2,Series_title_3,Series_title_4,Series_title_5
24896,BDCQ.SEE3999A,2018.03,,Y,F,Number,0,Business Data Collection - BDC,Territorial authority by employment variable,Filled jobs (workplace location based),Area Outside Territorial Authority,Actual,,
24897,BDCQ.SEE3999A,2018.06,,Y,F,Number,0,Business Data Collection - BDC,Territorial authority by employment variable,Filled jobs (workplace location based),Area Outside Territorial Authority,Actual,,


### Accessing Data from a DataFrame

In [37]:
# Display the whole DataFrame (the two-row Name/Age table built earlier).
data_frame

Unnamed: 0,Name,Age
0,Gagan Chauhan,23
1,Sanjeev Chauhan,45


In [39]:
# Selecting a single column by its label returns that column as a Series.
data_frame['Name']

0      Gagan Chauhan
1    Sanjeev Chauhan
Name: Name, dtype: object

In [46]:
# .loc selects by label: a single row label returns that row as a Series.
print(data_frame.loc[0])
# Pass the row label and the column label together to read one cell in a
# single lookup, instead of chaining a second [] onto an intermediate row Series.
print(data_frame.loc[1, 'Name'])

Name    Gagan Chauhan
Age                23
Name: 0, dtype: object
Sanjeev Chauhan


In [51]:
# Access one specific element: .at takes a row label and a column label
# and returns the single value stored at that position.
print(data_frame.at[1, 'Age'])

45


In [None]:
# Data manipulation with a DataFrame
# Add a column: assigning a list to a new column label creates that column
# on data_frame itself; the list supplies one value for each of the two rows.
data_frame['Gender'] = ['Male', 'Male']
print(data_frame)

In [None]:
# Remove a column: drop() returns a new DataFrame without 'Gender'.
# The result is only displayed, not assigned, so data_frame itself
# still contains the column after this cell runs.
data_frame.drop(columns='Gender')

Unnamed: 0,Name,Age
0,Gagan Chauhan,23
1,Sanjeev Chauhan,45


In [58]:
# Remove the 'Gender' column permanently by rebinding data_frame to the
# DataFrame returned by drop(). Reassignment is used instead of inplace=True,
# which gives no performance benefit and prevents method chaining.
data_frame = data_frame.drop(columns='Gender')

In [59]:
# Display data_frame again to confirm which columns remain after the drop.
data_frame

Unnamed: 0,Name,Age
0,Gagan Chauhan,23
1,Sanjeev Chauhan,45


In [None]:
# Load the CSV file and show summary statistics for its numeric columns
# (count, mean, std, min, quartiles and max).
file = pd.read_csv('data_file.csv')
file.describe()

Unnamed: 0,Period,Data_value,Magnitude,Series_title_4,Series_title_5
count,24898.0,22059.0,24898.0,0.0,0.0
mean,2017.692457,73580.11,2.270303,,
std,4.063672,212979.8,2.909963,,
min,2011.06,1.171797,0.0,,
25%,2014.09,1935.656,0.0,,
50%,2018.03,14190.0,0.0,,
75%,2021.06,60992.0,6.0,,
max,2025.06,2321295.0,6.0,,
