## Built In Methods in Numpy

In [2]:
import numpy as np
from numpy import random

## Creating Arrays
NumPy is used to work with arrays. The array object in NumPy is called ndarray.
One can create a NumPy ndarray object by using the array() function.

In [None]:
# 1. Creating arrays

arr = np.array([1, 2, 3, 4, 5])

print(arr)

print(type(arr))

[1 2 3 4 5]
<class 'numpy.ndarray'>


## Shape Function
The shape of an array is the number of elements in each dimension.  An attribute called shape returns a tuple with each index having the number of corresponding elements

In [None]:
# How to find the shape of an array
a = np.array([[1, 2, 3, 4], [5, 6, 7, 8]])

print(a.shape)

(2, 4)


## Concatenate
The concatenate() function join  two or more arrays in a single array

In [None]:
# Using of concatenate
arr1 = np.array([1, 2, 3])

arr2 = np.array([4, 5, 6])

arr = np.concatenate((arr1, arr2))

print(arr)

[1 2 3 4 5 6]


## Split
array_split() is use for splitting arrays, we pass it the array we want to split and the number of splits

In [None]:
# Example
arr = np.array([1, 2, 3, 4, 5, 6])

new = np.array_split(arr, 2)

print(new)

[array([1, 2, 3]), array([4, 5, 6])]


## Sorting Arrays
NumPy ndarray object has a function called sort(), that will sort a specified array

In [None]:
# Example
arr = np.array([3, 2, 0, 1])

print(np.sort(arr))

[0 1 2 3]


## Generating a random integer
The random.randint() is use to generate random integers


In [None]:
#Example 
x = random.randint(50)

print(x)

23


## Normal Distribution
Use the random.normal() method to get a Normal Data Distribution
the method uses 3 parameters:
loc - (Mean) where the peak of the bell exists.
scale - (Standard Deviation) how flat the graph distribution should be.
size - The shape of the returned array.

In [None]:
#Example
x = random.normal(size=(2, 3))

print(x)

[[-0.30434931 -1.44478974  0.15642267]
 [ 0.9357505  -0.25837904  0.43983285]]


## Binomial Distribution
It describes the outcome of binary distribution
It has three parameters:
n - number of trials.
p - probability of occurence of each trial.
size - The shape of the returned array.

In [None]:
x = random.binomial(n=10, p=0.5, size=10)

print(x)

[6 2 5 4 7 5 6 5 7 3]


##Poisson Distribution
It estimates how many times an event can happen in a specified time
t has two parameters:
lam - rate or known number of occurences 
size - The shape of the returned array

In [None]:
# Example
x = random.poisson(lam=2, size=10)

print(x)

[0 0 0 4 2 1 2 1 1 3]


## numpy.linspace
The numpy.linspace(start, stop, num=50, endpoint=True, retstep=False, dtype=None, axis=0) function returns evenly spaced numbers over a specified interval defined by the first two arguments of the function

In [None]:
# Example
x = np.linspace(0,10,11)
print(x)

[ 0.  1.  2.  3.  4.  5.  6.  7.  8.  9. 10.]


## numpy.argmax
The numpy.argmax(a, axis=None, out=None) function returns the indices of the maximum values along an axis.

In [None]:
# Example
array = np.array([[1,2,3],[4,5,6]])

max_pos = np.argmax(array)
print(max_pos)


5


## numpy.log()
The numpy.log() function returns an element-wise natural logarithm of an array

In [None]:
# Example 
n = np.array([[1,2.71828],[2.71828,1]])

print(n)
print(np.log(n))

[[1.      2.71828]
 [2.71828 1.     ]]
[[0.         0.99999933]
 [0.99999933 0.        ]]


## numpy.sum()
The numpy.sum() function sums the elements of an array over a given axis.

In [None]:
# Example
x = np.array([[1,2,3],[4,5,6]])

print(np.sum(x))



21


## numpy.mean()
numpy.mean(arr, axis = None) : Compute the arithmetic mean (average) of the given data (array elements) along the specified axis

In [None]:
# Example
arr = [20, 2, 7, 1, 34]
# 1 D array
print("arr : ", arr) 
print("mean of arr : ", np.mean(arr))

# 2 D array
arr = [[14, 17, 12, 33, 44],  
       [15, 6, 27, 8, 19], 
       [23, 2, 54, 1, 4, ]] 
    
# mean of the flattened array 
print("\nmean of arr, axis = None : ", np.mean(arr)) 
    
# mean along the axis = 0 
print("\nmean of arr, axis = 0 : ", np.mean(arr, axis = 0)) 
   
# mean along the axis = 1 
print("\nmean of arr, axis = 1 : ", np.mean(arr, axis = 1))

arr :  [20, 2, 7, 1, 34]
mean of arr :  12.8

mean of arr, axis = None :  18.6

mean of arr, axis = 0 :  [17.33333333  8.33333333 31.         14.         22.33333333]

mean of arr, axis = 1 :  [24.  15.  16.8]


## numpy.std()
numpy.std(arr, axis = None) : Compute the standard deviation of the given data (array elements) along the specified axis(if any)

In [None]:
# Example
arr = [20, 2, 7, 1, 34]
  
print("arr : ", arr) 
print("std of arr : ", np.std(arr))

arr :  [20, 2, 7, 1, 34]
std of arr :  12.576167937809991


## numpy.transpose()
The numpy.transpose() function reverses or permutes the axes of an ndarray.

In [None]:
# Example
x = np.array([[1,2,3],[4,5,6]])

print(x)
print(np.transpose(x))

[[1 2 3]
 [4 5 6]]
[[1 4]
 [2 5]
 [3 6]]


## Trigonometric Functions
NumPy provides the ufuncs sin(), cos() and tan() that take values in radians and produce the corresponding sin, cos and tan values.

In [None]:
# Example
arr = np.array([np.pi/2, np.pi/3, np.pi/4, np.pi/5])

print(np.sin(arr))
print(np.cos(arr))
print(np.tan(arr))


[1.         0.8660254  0.70710678 0.58778525]
[6.12323400e-17 5.00000000e-01 7.07106781e-01 8.09016994e-01]
[1.63312394e+16 1.73205081e+00 1.00000000e+00 7.26542528e-01]


## numpy.lcm()
The reduce() method will use the ufunc, in this case the lcm() function, on each element, and reduce the array by one dimension

In [None]:
# Example
arr = np.array([3, 6, 9])

x = np.lcm.reduce(arr)

print(x)

18


## Hyperbolic Functions
NumPy provides the ufuncs sinh(), cosh() and tanh() that take values in radians and produce the corresponding sinh, cosh and tanh values

In [None]:
# Example
arr = np.array([np.pi/2, np.pi/3, np.pi/4, np.pi/5])
print(np.cosh(arr))
print(np.sinh(arr))
print(np.tanh(arr))

[2.50917848 1.60028686 1.32460909 1.20397209]
[2.3012989  1.24936705 0.86867096 0.670484  ]
[0.91715234 0.78071444 0.6557942  0.55689331]


## numpy.unique()
 numpy.unique() method to find unique elements from any array

In [None]:
# Example
arr = np.array([1, 1, 1, 2, 3, 4, 5, 5, 6, 7])

x = np.unique(arr)

print(x)

[1 2 3 4 5 6 7]


## numpy.intersect1d()
finds the intersection of a 1D array

In [None]:
# Example
arr1 = np.array([1, 2, 3, 4])
arr2 = np.array([3, 4, 5, 6])

newarr = np.intersect1d(arr1, arr2, assume_unique=True)

print(newarr)

[3 4]


# BUILT-IN PANDAS METHODS


In [6]:
import pandas as pd

### Loading a file


1.   read_csv() is use to load a csv file to a pandas data frame
2.   read_excel() is use to load an excel file to a pandas data frame



In [47]:
# Example
df = pd.read_csv( 'https://query.data.world/s/wh6j7rxy2hvrn4ml75ci62apk5hgae' ) 

  exec(code_obj, self.user_global_ns, self.user_ns)


### Head method
To see the data frame we can use df.head(). Head returns the five first rows

In [48]:
# Example
df.head()

Unnamed: 0,country,year,country_code,record,crop_land,grazing_land,forest_land,fishing_ground,built_up_land,carbon,total,QScore
0,Armenia,1992,1,AreaPerCap,0.140292,0.199546,0.097188051,0.036888,0.02932,0.0,0.5032351,3A
1,Armenia,1992,1,AreaTotHA,483000.0,687000.0,334600.0,127000.0,100943.0008,0.0,1732543.0,3A
2,Armenia,1992,1,BiocapPerCap,0.159804,0.135261,0.084003213,0.013742,0.033398,0.0,0.4262086,3A
3,Armenia,1992,1,BiocapTotGHA,550176.2427,465677.9722,289207.1078,47311.55172,114982.2793,0.0,1467355.0,3A
4,Armenia,1992,1,EFConsPerCap,0.38751,0.189462,1.26e-06,0.004165,0.033398,1.114093,1.728629,3A


### Tail Method
To see the data frame we can use df.tail() returns the last five rows

In [49]:
# Example
df.tail()

Unnamed: 0,country,year,country_code,record,crop_land,grazing_land,forest_land,fishing_ground,built_up_land,carbon,total,QScore
72181,World,2016,5001,BiocapTotGHA,3984702000.0,1504757000.0,5111762779.0,1095445000.0,472616300.0,0.0,12169280000.0,3A
72182,World,2016,5001,EFConsPerCap,0.5336445,0.1402092,0.273495,0.08974253,0.06329435,1.646235,2.746619,3A
72183,World,2016,5001,EFConsTotGHA,3984702000.0,1046937000.0,2042179333.0,670103900.0,472616300.0,12292370000.0,20508910000.0,3A
72184,World,2016,5001,EFProdPerCap,0.5336445,0.1402092,0.273495,0.08974253,0.06329435,1.646235,2.746619,3A
72185,World,2016,5001,EFProdTotGHA,3984702000.0,1046937000.0,2042179333.0,670103900.0,472616300.0,12292370000.0,20508910000.0,3A


### Shape
 df.shape, gives a total number of rows and columns of a data frame

In [7]:
# Example
df.shape

(72186, 12)

### Size
df.size returns the number of rows times number of columns in the data frame

In [13]:
# Example
df.size

866232

### Info
df.info() helps get different information such as rows from RangeIndex, Data columns and then data type of each column

In [11]:
# Example
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72186 entries, 0 to 72185
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   country         72186 non-null  object 
 1   year            72186 non-null  int64  
 2   country_code    72186 non-null  int64  
 3   record          72186 non-null  object 
 4   crop_land       51714 non-null  float64
 5   grazing_land    51714 non-null  float64
 6   forest_land     51714 non-null  object 
 7   fishing_ground  51713 non-null  float64
 8   built_up_land   51713 non-null  float64
 9   carbon          51713 non-null  float64
 10  total           72177 non-null  float64
 11  QScore          72185 non-null  object 
dtypes: float64(6), int64(2), object(4)
memory usage: 6.6+ MB


### isna
df.isna gives the total number of null values in a data, df.isna().sum() will give the total null values

In [12]:
# Example
df.isna().sum()

country               0
year                  0
country_code          0
record                0
crop_land         20472
grazing_land      20472
forest_land       20472
fishing_ground    20473
built_up_land     20473
carbon            20473
total                 9
QScore                1
dtype: int64

### dropna
df.dropna() remove all rows wit NULL values from the DataFrame.

In [14]:
# Example
df = df.dropna() 
df.isna().sum() 

country           0
year              0
country_code      0
record            0
crop_land         0
grazing_land      0
forest_land       0
fishing_ground    0
built_up_land     0
carbon            0
total             0
QScore            0
dtype: int64

### Describe

 df.describe() gives basic statistics of variables in a data frame. It will give you count, mean, standard deviation, and also 5 number summary

In [15]:
# Example
df.describe()

Unnamed: 0,year,country_code,crop_land,grazing_land,fishing_ground,built_up_land,carbon,total
count,51713.0,51713.0,51713.0,51713.0,51713.0,51713.0,51713.0,51713.0
mean,1990.516311,170.049717,19208090.0,13529010.0,10048600.0,1984737.0,30726090.0,101488400.0
std,16.055991,458.304383,168583500.0,129419300.0,108137500.0,17304880.0,398963000.0,876452600.0
min,1961.0,1.0,0.0,0.0,0.0,0.0,0.0,0.01876963
25%,1977.0,56.0,0.358108,0.1948843,0.08005409,0.03786298,0.0,1.881652
50%,1992.0,124.0,3.298635,9.984019,6.154094,0.2044437,0.0,47293.06
75%,2004.0,197.0,3610817.0,1734192.0,945000.0,386718.0,19.15736,20754270.0
max,2016.0,5001.0,3984702000.0,3417089000.0,2979605000.0,472616300.0,12571600000.0,20611820000.0


### Nunique

To get the total unique values of variables, we can use df.nunique(). It will give all the unique values a variable contains

In [16]:
# Example
df.nunique()

country             165
year                 56
country_code        165
record                8
crop_land         36868
grazing_land      40628
forest_land       49102
fishing_ground    45244
built_up_land     24774
carbon            25493
total             51585
QScore                3
dtype: int64

###  Value Counts
 count_values() returns counts of unique values in a data frame
 

In [19]:
# Example
df.carbon.value_counts()

0.000000         25863
28304.663680         3
238204.103700        3
204448.306500        3
23672.105420         3
                 ...  
1.194182             1
971752.972200        1
2.451266             1
463694.552000        1
2.590941             1
Name: carbon, Length: 25493, dtype: int64

###  Columns

To know the names of all the variables in a data frame, we can use df.columns

In [20]:
# Example
df.columns

Index(['country', 'year', 'country_code', 'record', 'crop_land',
       'grazing_land', 'forest_land', 'fishing_ground', 'built_up_land',
       'carbon', 'total', 'QScore'],
      dtype='object')

### memory_usage()
memory_usage() returns a Pandas Series having the memory usage of each column (in bytes) in a Pandas DataFrame

In [21]:
# Example
df.memory_usage()

Index             413704
country           413704
year              413704
country_code      413704
record            413704
crop_land         413704
grazing_land      413704
forest_land       413704
fishing_ground    413704
built_up_land     413704
carbon            413704
total             413704
QScore            413704
dtype: int64

### astype()
astype() is used to cast a Python object to a particular data type. It can be a very helpful function in case your data is not stored in the correct format (data type)

In [23]:
# Example
df['QScore'] = df.QScore.astype('category')

### loc[:]
loc[:] helps to access a group of rows and columns in a dataset, a slice of the dataset, as per our requirement

In [24]:
# Example
df.loc[0:4, ['country', 'year', 'country_code']]

Unnamed: 0,country,year,country_code
0,Armenia,1992,1
1,Armenia,1992,1
2,Armenia,1992,1
3,Armenia,1992,1
4,Armenia,1992,1


### to_datetime()
to_datetime() converts a Python object to datetime format. It can take an integer, floating point number, list, Pandas Series, or Pandas DataFrame as argument

In [25]:
# Example
df['carbon'] = pd.to_datetime(df['carbon'])

### drop_duplicates()
drop_duplicates() returns a Pandas DataFrame with duplicate rows removed. Even among duplicates, there is an option to keep the first occurrence (record) of the duplicate or the last

In [27]:
# Example
df.drop_duplicates(inplace=True)
print(df)

       country  year  country_code        record     crop_land  grazing_land  \
0      Armenia  1992             1    AreaPerCap  1.402924e-01  1.995463e-01   
1      Armenia  1992             1     AreaTotHA  4.830000e+05  6.870000e+05   
2      Armenia  1992             1  BiocapPerCap  1.598044e-01  1.352610e-01   
3      Armenia  1992             1  BiocapTotGHA  5.501762e+05  4.656780e+05   
4      Armenia  1992             1  EFConsPerCap  3.875102e-01  1.894622e-01   
...        ...   ...           ...           ...           ...           ...   
72181    World  2016          5001  BiocapTotGHA  3.984702e+09  1.504757e+09   
72182    World  2016          5001  EFConsPerCap  5.336445e-01  1.402092e-01   
72183    World  2016          5001  EFConsTotGHA  3.984702e+09  1.046937e+09   
72184    World  2016          5001  EFProdPerCap  5.336445e-01  1.402092e-01   
72185    World  2016          5001  EFProdTotGHA  3.984702e+09  1.046937e+09   

        forest_land  fishing_ground  bu

###  groupby()
groupby() is used to group a Pandas DataFrame by 1 or more columns, and perform some mathematical operation on it. groupby() can be used to summarize data in a simple manner

In [28]:
# Example
df.groupby(by='country').total.mean()

country
Afghanistan    9.987599e+06
Albania        1.914954e+06
Algeria        2.873060e+07
Angola         2.480199e+07
Argentina      9.668575e+07
                   ...     
Viet Nam       3.179592e+07
World          6.322370e+09
Yemen          7.738261e+06
Zambia         1.479009e+07
Zimbabwe       8.815689e+06
Name: total, Length: 165, dtype: float64

### sort_values()
sort_values() is used to sort column in a Pandas DataFrame (or a Pandas Series) by values in ascending or descending order. By specifying the inplace attribute as True, you can make a change directly in the original DataFrame.

In [29]:
# Example
df.sort_values(by='country_code', inplace=True)

### Loading a new file into pandas data frame

In [14]:
df1 = pd.read_csv( 'https://query.data.world/s/wh6j7rxy2hvrn4ml75ci62apk5hgae' ) 
df1.isna().sum() # Checking NAs in the data frame

country               0
year                  0
country_code          0
record                0
crop_land         20472
grazing_land      20472
forest_land       20472
fishing_ground    20473
built_up_land     20473
carbon            20473
total                 9
QScore                1
dtype: int64

###  fillna()
Typically in a large dataset, you will find several entries labelled NaN by Python. NaN stands for “not a number”, and represents entries that were not populated in the original data source. While populating the values in the DataFrame, Pandas makes sure that these entries can be identified separately by the user.

fillna() helps to replace all NaN values in a DataFrame or Series by imputing these missing values with more appropriate values.

In [15]:
# Example
df1.fillna(0, inplace=True) 
df1.isna().sum()

country           0
year              0
country_code      0
record            0
crop_land         0
grazing_land      0
forest_land       0
fishing_ground    0
built_up_land     0
carbon            0
total             0
QScore            0
dtype: int64

### df.insert()
inserts a column in the specified position in a data frame

In [16]:
# Example
random_col = np.random.randint(100, size=len(df1))
df1.insert(3, 'random_col', random_col)
df1.head()

Unnamed: 0,country,year,country_code,random_col,record,crop_land,grazing_land,forest_land,fishing_ground,built_up_land,carbon,total,QScore
0,Armenia,1992,1,21,AreaPerCap,0.140292,0.199546,0.097188051,0.036888,0.02932,0.0,0.5032351,3A
1,Armenia,1992,1,91,AreaTotHA,483000.0,687000.0,334600.0,127000.0,100943.0008,0.0,1732543.0,3A
2,Armenia,1992,1,21,BiocapPerCap,0.159804,0.135261,0.084003213,0.013742,0.033398,0.0,0.4262086,3A
3,Armenia,1992,1,16,BiocapTotGHA,550176.2427,465677.9722,289207.1078,47311.55172,114982.2793,0.0,1467355.0,3A
4,Armenia,1992,1,15,EFConsPerCap,0.38751,0.189462,1.26e-06,0.004165,0.033398,1.114093,1.728629,3A


### nlargest and nsmallest
This gives you the dataset with n number of largest values or smallest values of a specified column.

In [18]:
# Example
df1.nlargest(5, 'crop_land')

Unnamed: 0,country,year,country_code,random_col,record,crop_land,grazing_land,forest_land,fishing_ground,built_up_land,carbon,total,QScore
72181,World,2016,5001,40,BiocapTotGHA,3984702000.0,1504757000.0,5111762779.0,1095445000.0,472616344.3,0.0,12169280000.0,3A
72183,World,2016,5001,64,EFConsTotGHA,3984702000.0,1046937000.0,2042179333.0,670103900.0,472616344.3,12292370000.0,20508910000.0,3A
72185,World,2016,5001,22,EFProdTotGHA,3984702000.0,1046937000.0,2042179333.0,670103900.0,472616344.3,12292370000.0,20508910000.0,3A
72173,World,2015,5001,21,BiocapTotGHA,3962163000.0,1501564000.0,5121114510.0,1099186000.0,464321398.7,0.0,12148350000.0,3A
72175,World,2015,5001,94,EFConsTotGHA,3962163000.0,1048846000.0,2001968737.0,669911700.0,464321398.7,12356910000.0,20504120000.0,3A


### df.rename()
It is used to rename columns

In [19]:
# Example
df1.rename(columns = {'random_col':'random_column'})

Unnamed: 0,country,year,country_code,random_column,record,crop_land,grazing_land,forest_land,fishing_ground,built_up_land,carbon,total,QScore
0,Armenia,1992,1,21,AreaPerCap,1.402924e-01,1.995463e-01,0.097188051,3.688847e-02,2.931995e-02,0.000000e+00,5.032351e-01,3A
1,Armenia,1992,1,91,AreaTotHA,4.830000e+05,6.870000e+05,334600,1.270000e+05,1.009430e+05,0.000000e+00,1.732543e+06,3A
2,Armenia,1992,1,21,BiocapPerCap,1.598044e-01,1.352610e-01,0.084003213,1.374213e-02,3.339780e-02,0.000000e+00,4.262086e-01,3A
3,Armenia,1992,1,16,BiocapTotGHA,5.501762e+05,4.656780e+05,289207.1078,4.731155e+04,1.149823e+05,0.000000e+00,1.467355e+06,3A
4,Armenia,1992,1,15,EFConsPerCap,3.875102e-01,1.894622e-01,1.26E-06,4.164833e-03,3.339780e-02,1.114093e+00,1.728629e+00,3A
...,...,...,...,...,...,...,...,...,...,...,...,...,...
72181,World,2016,5001,40,BiocapTotGHA,3.984702e+09,1.504757e+09,5111762779.0,1.095445e+09,4.726163e+08,0.000000e+00,1.216928e+10,3A
72182,World,2016,5001,26,EFConsPerCap,5.336445e-01,1.402092e-01,0.273495,8.974253e-02,6.329435e-02,1.646235e+00,2.746619e+00,3A
72183,World,2016,5001,64,EFConsTotGHA,3.984702e+09,1.046937e+09,2042179333.0,6.701039e+08,4.726163e+08,1.229237e+10,2.050891e+10,3A
72184,World,2016,5001,36,EFProdPerCap,5.336445e-01,1.402092e-01,0.273495,8.974253e-02,6.329435e-02,1.646235e+00,2.746619e+00,3A
