# PANDAS 
pandas is a Python library that works much like Excel. Instead of hand-coding every single operation, pandas bundles most of them into a few commands and operations that help developers manipulate data more efficiently and quickly. In the long run this can make the whole programming process faster.

In [4]:
import pandas as pd

## Series

In [5]:
# A Series built from a plain list; with no index given, pandas labels the
# entries 0..3.
pd.Series(data=[4, 6, -5, 3])

0    4
1    6
2   -5
3    3
dtype: int64

In [6]:
# Same values as before, but each entry now carries an explicit string label.
pd.Series(
    data=[4, 6, -5, 3],
    index=['x1', 'x2', 'y1', 'y2'],
)

x1    4
x2    6
y1   -5
y2    3
dtype: int64

In [7]:
# A labelled Series can be queried by a single label (scalar result) or by a
# list of labels (a smaller Series).
bounding_box = pd.Series(
    data=[4, 6, -5, 3],
    index=['x1', 'x2', 'y1', 'y2'],
)
print(bounding_box.loc['x1'])
bounding_box.loc[['x1', 'y1']]

4


x1    4
y1   -5
dtype: int64

In [8]:
# Divide every entry by the sum of all entries, so each value becomes its
# fraction of the total.
bounding_box.div(bounding_box.sum())

x1    0.500
x2    0.750
y1   -0.625
y2    0.375
dtype: float64

In [9]:
# A None in the input becomes a missing value; isna() flags it element-wise.
missing_data = pd.Series(
    data=[4, 6, -5, None],
    index=['x1', 'x2', 'y1', 'y2'],
)

missing_data.isna()

x1    False
x2    False
y1    False
y2     True
dtype: bool

## DataFrame

In [10]:
import pandas as pd

# Fetch the dataset over HTTPS instead of plain HTTP so the download is
# encrypted and cannot be silently altered in transit.
# header=None: treat the first line as data rather than as column names.
# usecols=[0]: keep only the first column of the file.
df = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
    header=None,
    usecols=[0],
)
df

Unnamed: 0,0
0,39
1,50
2,38
3,53
4,28
...,...
32556,27
32557,40
32558,58
32559,22


In [11]:
# Column-oriented input: each key becomes a column and each list holds that
# column's values, one entry per row.
data = dict(
    name=['Maria', 'Carla', 'Juan', 'Ana', 'Sergio'],
    age=[15, 33, 12, 21, 45],
    gender=[True, True, False, True, False],
)

pd.DataFrame(data=data)

Unnamed: 0,name,age,gender
0,Maria,15,True
1,Carla,33,True
2,Juan,12,False
3,Ana,21,True
4,Sergio,45,False


## Read from csv

In [13]:
# NOTE: 'data/iris.data' is a relative path, so it is looked up from the
# notebook's working directory; the recorded run failed with
# FileNotFoundError because the file was not there.
pd.read_csv('data/iris.data', header=None).head(n=5)

FileNotFoundError: [Errno 2] File data/iris.data does not exist: 'data/iris.data'

## Index

In [14]:
# No index was supplied, so the frame gets a RangeIndex covering its five rows.
df = pd.DataFrame(data=data)
df.index

RangeIndex(start=0, stop=5, step=1)

In [15]:
# Replace the default integer index with one single-letter label per row.
df = pd.DataFrame(data=data)
df.index = list('mcjas')
df

Unnamed: 0,name,age,gender
m,Maria,15,True
c,Carla,33,True
j,Juan,12,False
a,Ana,21,True
s,Sergio,45,False


## Indexing

In [16]:
# .loc selects by label and .iloc by position; label 'm' is the first row, so
# both lookups return the same row and the comparison is True in every column.
a = pd.DataFrame(data, index=list('mcjas'))

a.loc['m'].eq(a.iloc[0])

name      True
age       True
gender    True
Name: m, dtype: bool

In [17]:
# Label-based slice: unlike positional slicing, the end label 'c' is included.
a.loc[slice('m', 'c')]

Unnamed: 0,name,age,gender
m,Maria,15,True
c,Carla,33,True


## Dropping

In [18]:
# Show the frame without its `name` column; the result is only displayed and
# `df` is not reassigned.
df.drop(columns='name')

Unnamed: 0,age,gender
m,15,True
c,33,True
j,12,False
a,21,True
s,45,False


In [19]:
# Show the frame without the rows labelled 'm' and 'c'; `a` is not reassigned.
a.drop(index=['m', 'c'])

Unnamed: 0,name,age,gender
j,Juan,12,False
a,Ana,21,True
s,Sergio,45,False


## Duplicates

In [20]:
# Rows 1 and 2 are identical; drop_duplicates keeps the first occurrence and
# discards the later copy.
df = pd.DataFrame(dict(f1=[1, 2, 2, 1], f2=[0, 1, 1, 1]))
print(df)
df.drop_duplicates(keep='first')

   f1  f2
0   1   0
1   2   1
2   2   1
3   1   1


Unnamed: 0,f1,f2
0,1,0
1,2,1
3,1,1


## Filtering

In [21]:
# Boolean-mask filtering: keep only the rows whose `gender` flag is False.
# `~` inverts the boolean column directly, which avoids comparing against the
# literal False (`== False`).
df = pd.DataFrame(data, index=['m', 'c', 'j', 'a', 's'])
df[~df['gender']]

Unnamed: 0,name,age,gender
j,Juan,12,False
s,Sergio,45,False


## Custom element-wise operations

In [22]:
def _negate(flag):
    """Return the logical opposite of a single value."""
    return not flag


# apply() calls the function once per element. The column is overwritten with
# its own negation, so re-running this cell flips the values back again.
df['gender'] = df['gender'].apply(_negate)
df

Unnamed: 0,name,age,gender
m,Maria,15,False
c,Carla,33,False
j,Juan,12,True
a,Ana,21,False
s,Sergio,45,True


## Sorting

In [26]:
df.sort_index(by = 'age')

TypeError: sort_index() got an unexpected keyword argument 'by'

## Statistics

In [24]:
# Summary statistics (count, mean, std, min, quartiles, max); the recorded
# output covers only the numeric `age` column.
df.describe()

Unnamed: 0,age
count,5.0
mean,25.2
std,13.682105
min,12.0
25%,15.0
50%,21.0
75%,33.0
max,45.0


## Counts

In [31]:
# Count how many rows hold each distinct value of the `gender` column.
df.gender.value_counts()

False    3
True     2
Name: gender, dtype: int64