## Pandas -> Fast data manipulation 

In [2]:
import numpy as np 
import pandas as pd

# Series

In [3]:
# diffrent data types 1.Object(string) 2.int 3.float
# index   key   value
#   0     'a'     2
#   1     'm'     7
#   2     'c'     1
#   3     'w'     3
#   4     'q'     5

s = pd.Series(['a','b','c'],index=['first','second','third'])
s

first     a
second    b
third     c
dtype: object

## index and values

In [4]:
# index of a series
s.index

Index(['first', 'second', 'third'], dtype='object')

In [5]:
# values of a series
s.values

array(['a', 'b', 'c'], dtype=object)

In [6]:
# set the index to diffrenet values
s = pd.Series(['Football','Basket','Rugby'],index=['UK','USA','Canda'],name = 'Sports')
s

UK       Football
USA        Basket
Canda       Rugby
Name: Sports, dtype: object

## Create Series With Dictionary

In [7]:
ls = {'UK':'FootBall','USA':'Basket','Canda':'Rugby'}
ls = pd.Series(ls)
(ls)

UK       FootBall
USA        Basket
Canda       Rugby
dtype: object

## loc and iloc

In [8]:
ls.iloc[:2]

UK     FootBall
USA      Basket
dtype: object

In [9]:
# add element in series
ls.loc['Egypt'] = 'Karate'
ls

UK       FootBall
USA        Basket
Canda       Rugby
Egypt      Karate
dtype: object

### Vectorization

In [10]:
# create random series

rand = pd.Series(np.random.randint(0,1000,100000))
rand

0        372
1        324
2        108
3        670
4        943
        ... 
99995    815
99996    622
99997    922
99998    685
99999    434
Length: 100000, dtype: int32

###  calculate time in for loop

In [11]:
%%timeit -n 100

summary = 0
for i in rand:
    summary+=i

15 ms ± 353 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


### calculate time using vectorization

In [12]:
%%timeit -n 100

summary = np.sum(rand)

276 µs ± 7.11 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


# DataFrame

In [13]:
# create DataFrame with students info

s1 = pd.Series({'Name':'Ahmed','Age':20,'GPA':3.9})
s2 = pd.Series({'Name':'mohamed','Age':22,'GPA':3.5})
s3 = pd.Series({'Name':'Ali','Age':23,'GPA':3.8})
s4 = pd.Series({'Name':'Moaaz','Age':21,'GPA':3.35})
s5 = pd.Series({'Name':'Yehia','Age':21,'GPA':3.33})
s6 = pd.Series({'Name':'Khaled','Age':23,'GPA':3.4})

df = pd.DataFrame([s1,s2,s3,s4,s5,s6],columns=['Name','Age','GPA'])
df

Unnamed: 0,Name,Age,GPA
0,Ahmed,20,3.9
1,mohamed,22,3.5
2,Ali,23,3.8
3,Moaaz,21,3.35
4,Yehia,21,3.33
5,Khaled,23,3.4


In [14]:
# select coloumns
df['GPA']


0    3.90
1    3.50
2    3.80
3    3.35
4    3.33
5    3.40
Name: GPA, dtype: float64

In [15]:
# select rows
df.loc[1]

Name    mohamed
Age          22
GPA         3.5
Name: 1, dtype: object

In [16]:
# select element
df.loc[2,'GPA']

3.8

In [17]:
df['Age'].apply(lambda x: x-10)

0    10
1    12
2    13
3    11
4    11
5    13
Name: Age, dtype: int64

In [53]:
# Operations on columns using lambda functions
grades = {4.0:'A',3.0:'B'}

df['Grade'] = df['GPA'].apply(lambda x: grades[np.round(x)])
df

Unnamed: 0,Name,Age,GPA,Grade
0,Ahmed,10,3.9,A
1,mohamed,12,3.5,A
2,Ali,13,3.8,A
3,Moaaz,11,3.35,B
4,Yehia,11,3.33,B
5,Khaled,13,3.4,B


In [18]:
# using np.mean with pandas
np.mean(df['GPA'],axis=0)

3.5466666666666664