In [1]:
# coding: utf-8

In [2]:
import numpy as np

In [3]:
arr = np.array([1,3,4,5,6])

In [4]:
arr

array([1, 3, 4, 5, 6])

In [5]:
arr.shape

(5,)

In [6]:
arr.dtype

dtype('int32')

In [7]:
arr = np.array([1,'st','er',3])
arr.dtype

dtype('<U11')

### Creating arrays

In [10]:
arr = np.array([[1,2,3],[2,4,6],[8,8,8]])

In [11]:
arr.shape

(3, 3)

In [12]:
arr


array([[1, 2, 3],
       [2, 4, 6],
       [8, 8, 8]])

In [13]:
arr = np.zeros((2,4))


In [14]:
arr

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.]])

In [15]:
arr = np.ones((2,4))

In [16]:
arr

array([[1., 1., 1., 1.],
       [1., 1., 1., 1.]])

In [17]:
arr = np.identity(3)

In [18]:
arr

array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])

In [19]:
arr = np.random.randn(3,4)

In [20]:
arr


array([[ 0.82292928,  0.4090167 ,  1.54427878,  0.64890718],
       [ 0.10106356, -0.27608708, -0.49987026, -0.45418089],
       [-0.81337167, -0.89337351, -1.39869748,  0.57430765]])

In [21]:
from io import BytesIO
b = BytesIO(b"2,23,33\n32,42,63.4\n35,77,12")
arr = np.genfromtxt(b, delimiter=",")
arr

array([[ 2. , 23. , 33. ],
       [32. , 42. , 63.4],
       [35. , 77. , 12. ]])

## Accessing array elements

###  Simple indexing

In [22]:
arr[1]

array([32. , 42. , 63.4])

In [23]:
arr = np.arange(12).reshape(2,2,3)

In [24]:
arr

array([[[ 0,  1,  2],
        [ 3,  4,  5]],

       [[ 6,  7,  8],
        [ 9, 10, 11]]])

In [25]:
arr[0]

array([[0, 1, 2],
       [3, 4, 5]])

In [26]:
arr = np.arange(10)

In [27]:
arr[5:]

array([5, 6, 7, 8, 9])

In [28]:
arr[5:8]

array([5, 6, 7])

In [29]:
arr[:-5]

array([0, 1, 2, 3, 4])

In [30]:
arr = np.arange(12).reshape(2,2,3)

In [31]:
arr

array([[[ 0,  1,  2],
        [ 3,  4,  5]],

       [[ 6,  7,  8],
        [ 9, 10, 11]]])

In [32]:
arr[1:2]

array([[[ 6,  7,  8],
        [ 9, 10, 11]]])

In [33]:
arr = np.arange(27).reshape(3,3,3)

In [34]:
arr

array([[[ 0,  1,  2],
        [ 3,  4,  5],
        [ 6,  7,  8]],

       [[ 9, 10, 11],
        [12, 13, 14],
        [15, 16, 17]],

       [[18, 19, 20],
        [21, 22, 23],
        [24, 25, 26]]])

In [35]:
arr[:,:,2]

array([[ 2,  5,  8],
       [11, 14, 17],
       [20, 23, 26]])

In [36]:
arr[...,2]

array([[ 2,  5,  8],
       [11, 14, 17],
       [20, 23, 26]])

### Advanced Indexing

In [37]:
arr = np.arange(9).reshape(3,3)
arr

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

In [38]:
arr[[0,1,2],[1,0,0]]


array([1, 3, 6])

### Boolean Indexing

In [39]:
import pandas as pd
import numpy as np


In [40]:
data = pd.read_csv("simplemaps-worldcities-basic.csv")

In [41]:
cities = np.array(["delhi","banglaore","mumbai","chennai","bhopal"])
city_data = np.random.randn(5,3)
city_data


array([[-0.09299425, -1.34621711,  0.75390579],
       [ 1.96347432,  0.53472954, -0.7719287 ],
       [-0.94053816,  2.35307016,  0.57715153],
       [-0.15128236, -0.43055324, -0.24972309],
       [ 1.30217324,  2.08360003, -0.9524947 ]])

In [42]:
city_data[cities =="delhi"]

array([[-0.09299425, -1.34621711,  0.75390579]])

In [43]:
city_data[city_data >0]

array([0.75390579, 1.96347432, 0.53472954, 2.35307016, 0.57715153,
       1.30217324, 2.08360003])

In [44]:
city_data[city_data >0] = 0
city_data


array([[-0.09299425, -1.34621711,  0.        ],
       [ 0.        ,  0.        , -0.7719287 ],
       [-0.94053816,  0.        ,  0.        ],
       [-0.15128236, -0.43055324, -0.24972309],
       [ 0.        ,  0.        , -0.9524947 ]])

### Operations on arrays

In [45]:
arr = np.arange(15).reshape(3,5)
arr


array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14]])

In [46]:
arr + 5

array([[ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14],
       [15, 16, 17, 18, 19]])

In [47]:
arr * 2

array([[ 0,  2,  4,  6,  8],
       [10, 12, 14, 16, 18],
       [20, 22, 24, 26, 28]])

In [48]:
arr1 = np.arange(15).reshape(5,3)
arr2 = np.arange(5).reshape(5,1)
arr2 + arr1


array([[ 0,  1,  2],
       [ 4,  5,  6],
       [ 8,  9, 10],
       [12, 13, 14],
       [16, 17, 18]])

In [49]:
arr1

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11],
       [12, 13, 14]])

In [50]:
arr2

array([[0],
       [1],
       [2],
       [3],
       [4]])

In [51]:
arr1 = np.random.randn(5,3)
arr1


array([[-0.91239918,  1.19610002,  2.24311082],
       [ 0.25323815,  0.19013471,  0.40469988],
       [-1.41735031,  0.26662742, -0.07076245],
       [ 1.1459409 ,  1.10810919,  0.80212038],
       [-0.92412545, -1.09995435, -1.14962975]])

In [52]:
np.modf(arr1)

(array([[-0.91239918,  0.19610002,  0.24311082],
        [ 0.25323815,  0.19013471,  0.40469988],
        [-0.41735031,  0.26662742, -0.07076245],
        [ 0.1459409 ,  0.10810919,  0.80212038],
        [-0.92412545, -0.09995435, -0.14962975]]), array([[-0.,  1.,  2.],
        [ 0.,  0.,  0.],
        [-1.,  0., -0.],
        [ 1.,  1.,  0.],
        [-0., -1., -1.]]))

### Linear algebra using numpy

In [53]:
A = np.array([[1,2,3],[4,5,6],[7,8,9]])
B = np.array([[9,8,7],[6,5,4],[1,2,3]])


In [54]:
A.dot(B)

array([[ 24,  24,  24],
       [ 72,  69,  66],
       [120, 114, 108]])

In [55]:
A = np.arange(15).reshape(3,5)
A.T


array([[ 0,  5, 10],
       [ 1,  6, 11],
       [ 2,  7, 12],
       [ 3,  8, 13],
       [ 4,  9, 14]])

In [56]:
np.linalg.svd(A)

(array([[-0.15425367,  0.89974393,  0.40824829],
        [-0.50248417,  0.28432901, -0.81649658],
        [-0.85071468, -0.3310859 ,  0.40824829]]),
 array([3.17420265e+01, 2.72832424e+00, 3.59372947e-16]),
 array([[-0.34716018, -0.39465093, -0.44214167, -0.48963242, -0.53712316],
        [-0.69244481, -0.37980343, -0.06716206,  0.24547932,  0.55812069],
        [-0.41088392,  0.70864929,  0.02536563, -0.53314345,  0.21001245],
        [-0.40770221,  0.39487291, -0.09099898,  0.62818809, -0.5243598 ],
        [ 0.25485938,  0.20467429, -0.8894244 ,  0.14538841,  0.28450232]]))

In [57]:
a = np.array([[7,5,-3], [3,-5,2],[5,3,-7]])
b = np.array([16,-8,0])
x = np.linalg.solve(a, b)
x


array([1., 3., 2.])

In [58]:
np.allclose(np.dot(a, x), b)

True

In [59]:
import pandas as pd
d =  [{'city':'Delhi',"data":1000},
      {'city':'Banglaore',"data":2000},
      {'city':'Mumbai',"data":1000}]
pd.DataFrame(d)


Unnamed: 0,city,data
0,Delhi,1000
1,Banglaore,2000
2,Mumbai,1000


In [61]:
df = pd.DataFrame(d)


In [64]:
import numpy as np
import pandas as pd
data = pd.read_csv("simplemaps-worldcities-basic.csv")

In [66]:
data.head(n=10)


Unnamed: 0,city,city_ascii,lat,lng,pop,country,iso2,iso3,province
0,Qal eh-ye Now,Qal eh-ye,34.983,63.1333,2997.0,Afghanistan,AF,AFG,Badghis
1,Chaghcharan,Chaghcharan,34.516701,65.250001,15000.0,Afghanistan,AF,AFG,Ghor
2,Lashkar Gah,Lashkar Gah,31.582998,64.36,201546.0,Afghanistan,AF,AFG,Hilmand
3,Zaranj,Zaranj,31.112001,61.886998,49851.0,Afghanistan,AF,AFG,Nimroz
4,Tarin Kowt,Tarin Kowt,32.633298,65.866699,10000.0,Afghanistan,AF,AFG,Uruzgan
5,Zareh Sharan,Zareh Sharan,32.85,68.416705,13737.0,Afghanistan,AF,AFG,Paktika
6,Asadabad,Asadabad,34.866,71.150005,48400.0,Afghanistan,AF,AFG,Kunar
7,Taloqan,Taloqan,36.729999,69.540004,64256.0,Afghanistan,AF,AFG,Takhar
8,Mahmud-E Eraqi,Mahmud-E Eraqi,35.016696,69.333301,7407.0,Afghanistan,AF,AFG,Kapisa
9,Mehtar Lam,Mehtar Lam,34.65,70.166701,17345.0,Afghanistan,AF,AFG,Laghman


In [67]:
data.tail()

Unnamed: 0,city,city_ascii,lat,lng,pop,country,iso2,iso3,province
7317,Mutare,Mutare,-18.970019,32.650038,216785.0,Zimbabwe,ZW,ZWE,Manicaland
7318,Kadoma,Kadoma,-18.330006,29.909947,56400.0,Zimbabwe,ZW,ZWE,Mashonaland West
7319,Chitungwiza,Chitungwiza,-18.000001,31.100003,331071.0,Zimbabwe,ZW,ZWE,Harare
7320,Harare,Harare,-17.81779,31.044709,1557406.5,Zimbabwe,ZW,ZWE,Harare
7321,Bulawayo,Bulawayo,-20.169998,28.580002,697096.0,Zimbabwe,ZW,ZWE,Bulawayo


In [68]:
series_es = data.lat

In [69]:
type(series_es)


pandas.core.series.Series

In [70]:
series_es[1:10:2]

1    34.516701
3    31.112001
5    32.850000
7    36.729999
9    34.650000
Name: lat, dtype: float64

In [71]:
series_es[:7]

0    34.983000
1    34.516701
2    31.582998
3    31.112001
4    32.633298
5    32.850000
6    34.866000
Name: lat, dtype: float64

In [72]:
series_es[:-7315]


0    34.983000
1    34.516701
2    31.582998
3    31.112001
4    32.633298
5    32.850000
6    34.866000
Name: lat, dtype: float64

In [73]:
data[:7]


Unnamed: 0,city,city_ascii,lat,lng,pop,country,iso2,iso3,province
0,Qal eh-ye Now,Qal eh-ye,34.983,63.1333,2997.0,Afghanistan,AF,AFG,Badghis
1,Chaghcharan,Chaghcharan,34.516701,65.250001,15000.0,Afghanistan,AF,AFG,Ghor
2,Lashkar Gah,Lashkar Gah,31.582998,64.36,201546.0,Afghanistan,AF,AFG,Hilmand
3,Zaranj,Zaranj,31.112001,61.886998,49851.0,Afghanistan,AF,AFG,Nimroz
4,Tarin Kowt,Tarin Kowt,32.633298,65.866699,10000.0,Afghanistan,AF,AFG,Uruzgan
5,Zareh Sharan,Zareh Sharan,32.85,68.416705,13737.0,Afghanistan,AF,AFG,Paktika
6,Asadabad,Asadabad,34.866,71.150005,48400.0,Afghanistan,AF,AFG,Kunar


In [74]:
data.iloc[:5,:4]

Unnamed: 0,city,city_ascii,lat,lng
0,Qal eh-ye Now,Qal eh-ye,34.983,63.1333
1,Chaghcharan,Chaghcharan,34.516701,65.250001
2,Lashkar Gah,Lashkar Gah,31.582998,64.36
3,Zaranj,Zaranj,31.112001,61.886998
4,Tarin Kowt,Tarin Kowt,32.633298,65.866699


In [75]:
data[data['pop'] > 10000000][data.columns[pd.Series(data.columns).str.startswith('l')]]

Unnamed: 0,lat,lng
360,-34.602502,-58.397531
1171,-23.55868,-46.62502
2068,31.216452,121.436505
3098,28.669993,77.230004
3110,19.01699,72.856989
3492,35.685017,139.751407
4074,19.442442,-99.130988
4513,24.869992,66.990009
5394,55.752164,37.615523
6124,41.104996,29.010002


In [76]:
greater_10mil = data[data['pop'] > 10000000]
greater_10mil.rename(columns={'pop':'population'}, inplace=True)
greater_10mil.where(greater_10mil.population > 15000000)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)


Unnamed: 0,city,city_ascii,lat,lng,population,country,iso2,iso3,province
360,,,,,,,,,
1171,,,,,,,,,
2068,,,,,,,,,
3098,,,,,,,,,
3110,Mumbai,Mumbai,19.01699,72.856989,15834918.0,India,IN,IND,Maharashtra
3492,Tokyo,Tokyo,35.685017,139.751407,22006299.5,Japan,JP,JPN,Tokyo
4074,,,,,,,,,
4513,,,,,,,,,
5394,,,,,,,,,
6124,,,,,,,,,


In [77]:
df = pd.DataFrame(np.random.randn(8, 3),
columns=['A', 'B', 'C'])


### Operations on dataframes

In [78]:
nparray = df.values
type(nparray)


numpy.ndarray

In [79]:
from numpy import nan
df.iloc[4,2] = nan


In [80]:
df

Unnamed: 0,A,B,C
0,-0.502403,-1.156581,-0.710037
1,0.755583,1.242302,1.849272
2,-0.582948,-2.023372,0.087031
3,-0.131867,0.093404,0.185124
4,0.651575,-0.013404,
5,-0.772821,1.317763,1.14225
6,0.722332,-0.01364,-1.798752
7,0.137319,-0.197983,-1.152022


In [81]:
df.fillna(0)

Unnamed: 0,A,B,C
0,-0.502403,-1.156581,-0.710037
1,0.755583,1.242302,1.849272
2,-0.582948,-2.023372,0.087031
3,-0.131867,0.093404,0.185124
4,0.651575,-0.013404,0.0
5,-0.772821,1.317763,1.14225
6,0.722332,-0.01364,-1.798752
7,0.137319,-0.197983,-1.152022


In [82]:
columns_numeric = ['lat','lng','pop']

In [83]:
data[columns_numeric].mean()

lat        20.662876
lng        10.711914
pop    265463.071633
dtype: float64

In [84]:
data[columns_numeric].sum()

lat    1.512936e+05
lng    7.843263e+04
pop    1.943721e+09
dtype: float64

In [85]:
data[columns_numeric].count()

lat    7322
lng    7322
pop    7322
dtype: int64

In [86]:
data[columns_numeric].median()


lat       26.792730
lng       18.617509
pop    61322.750000
dtype: float64

In [87]:
data[columns_numeric].quantile(0.8)

lat        46.852480
lng        89.900018
pop    269210.000000
Name: 0.8, dtype: float64

In [88]:
data[columns_numeric].sum(axis = 1).head()


0      3095.116300
1     15099.766702
2    201641.942998
3     49943.998999
4     10098.499997
dtype: float64

In [89]:
data[columns_numeric].describe()

Unnamed: 0,lat,lng,pop
count,7322.0,7322.0,7322.0
mean,20.662876,10.711914,265463.1
std,29.134818,79.044615,828762.2
min,-89.982894,-179.589979,-99.0
25%,-0.32471,-64.788472,17344.25
50%,26.79273,18.617509,61322.75
75%,43.575448,73.103628,200172.6
max,82.483323,179.383304,22006300.0


In [90]:
data1 = data.sample(3)

### Concatanating data frames


In [91]:
data2 = data.sample(3)
data_combine = pd.concat([data1,data2])
data_combine


Unnamed: 0,city,city_ascii,lat,lng,pop,country,iso2,iso3,province
6688,Temple,Temple,31.102093,-97.363008,58432.0,United States of America,US,USA,Texas
4379,Numan,Numan,9.460442,12.04003,45173.0,Nigeria,NG,NGA,Adamawa
4112,Er Rachidia,Er Rachidia,31.940413,-4.449972,228489.0,Morocco,MA,MAR,Meknès - Tafilalet
7159,Upata,Upata,8.020426,-62.41,53474.5,Venezuela,VE,VEN,Bolívar
4594,Ferrenafe,Ferrenafe,-6.629997,-79.800023,42270.5,Peru,PE,PER,Lambayeque
6272,Lisburn,Lisburn,54.520379,-6.670017,12899.0,United Kingdom,GB,GBR,Dungannon


In [92]:
df1 = pd.DataFrame({'col1': ['col10', 'col11', 'col12', 'col13'],
                    'col2': ['col20', 'col21', 'col22', 'col23'],
                    'col3': ['col30', 'col31', 'col32', 'col33'],
                    'col4': ['col40', 'col41', 'col42', 'col43']},
                   index=[0, 1, 2, 3])


In [93]:
df1

Unnamed: 0,col1,col2,col3,col4
0,col10,col20,col30,col40
1,col11,col21,col31,col41
2,col12,col22,col32,col42
3,col13,col23,col33,col43


In [94]:
df4 = pd.DataFrame({'col2': ['col22', 'col23', 'col26', 'col27'],
                    'Col4': ['Col42', 'Col43', 'Col46', 'Col47'],
                    'col6': ['col62', 'col63', 'col66', 'col67']},
                   index=[2, 3, 6, 7])


In [95]:
pd.concat([df1,df4], axis=1)

Unnamed: 0,col1,col2,col3,col4,col2.1,Col4,col6
0,col10,col20,col30,col40,,,
1,col11,col21,col31,col41,,,
2,col12,col22,col32,col42,col22,Col42,col62
3,col13,col23,col33,col43,col23,Col43,col63
6,,,,,col26,Col46,col66
7,,,,,col27,Col47,col67


In [97]:
data = data[['iso3','country']].drop_duplicates()


In [98]:
data.shape

(223, 2)

In [100]:
data.head()

Unnamed: 0,iso3,country
0,AFG,Afghanistan
33,ALD,Aland
34,ALB,Albania
60,DZA,Algeria
111,ASM,American Samoa


In [101]:
del(data['country'])


In [102]:
data.merge(data, 'inner').head()

Unnamed: 0,iso3
0,AFG
1,ALD
2,ALB
3,DZA
4,ASM
