# Numpy 

# 1-Accessing the array Index

In [1]:
# Indexing a 2-D NumPy array: plain slicing versus integer-array
# ("fancy") indexing.
import numpy as np

# 4x4 array; because some entries are floats the whole array is stored as floats
arr = np.array([
    [-1, 2, 0, 4],
    [4, -0.5, 6, 0],
    [2.6, 0, 7, 8],
    [3, -7, 4, 2.0],
])
print("Initial Array: ")
print(arr)

# Slice: rows 0 and 1, every second column (columns 0 and 2)
sliced_arr = arr[0:2, 0::2]
print("Array with first 2 rows and alternate columns(0 and 2):\n", sliced_arr)

# Two index lists are paired element by element, picking the entries
# at (1, 3), (1, 2), (0, 1) and (3, 0)
rows = [1, 1, 0, 3]
cols = [3, 2, 1, 0]
Index_arr = arr[rows, cols]
print("\nElements at indices (1, 3), (1, 2), (0, 1), (3, 0):\n", Index_arr)

Initial Array: 
[[-1.   2.   0.   4. ]
 [ 4.  -0.5  6.   0. ]
 [ 2.6  0.   7.   8. ]
 [ 3.  -7.   4.   2. ]]
Array with first 2 rows and alternate columns(0 and 2):
 [[-1.  0.]
 [ 4.  6.]]

Elements at indices (1, 3), (1, 2), (0, 1), (3, 0):
 [0. 6. 2. 3.]


# 2-Basic Array Operations

In [2]:
# Element-wise arithmetic on NumPy arrays: scalar operands, a reduction,
# and array-with-array addition.
import numpy as np

a = np.array([[1, 2], [3, 4]])   # Array 1
b = np.array([[4, 3], [2, 1]])   # Array 2

# A scalar operand is applied to every element
print("Adding 1 to every element:", a + 1)
print("\nSubtracting 2 from each element:", b - 2)

# Unary operation: collapse the whole array into one total
print("\nSum of all array elements: ", a.sum())

# Binary operation: add the two arrays position by position
print("\nArray sum:\n", a + b)

Adding 1 to every element: [[2 3]
 [4 5]]

Subtracting 2 from each element: [[ 2  1]
 [ 0 -1]]

Sum of all array elements:  10

Array sum:
 [[5 5]
 [5 5]]


# 3-Data Types in Numpy

In [3]:
# How NumPy picks an array's dtype, and how to set it explicitly.
import numpy as np

# Only integers given -> an integer dtype is inferred
x = np.array([1, 2])
print("Integer Datatype: ")
print(x.dtype)

# Floats given -> a float dtype is inferred
x = np.array([1.0, 2.0])
print("\nFloat Datatype: ")
print(x.dtype)

# The dtype argument overrides the inference
x = np.array([1, 2], dtype=np.int64)
print("\nForcing a Datatype: ")
print(x.dtype)

Integer Datatype: 
int32

Float Datatype: 
float64

Forcing a Datatype: 
int64


# 4- Math Operations on DataType array

In Numpy arrays, basic mathematical operations are performed element-wise on the array. These operations are applied both as operator overloads and as functions. Many useful functions are provided in Numpy for performing computations on Arrays such as sum: for addition of Array elements, T: for Transpose of elements, etc.

In [4]:
# Element-wise maths on float64 arrays: addition, a full reduction,
# square roots and the transpose.
import numpy as np

# Two 2x2 operands stored as float64
arr1 = np.array([[4, 7], [2, 6]], dtype=np.float64)
arr2 = np.array([[3, 6], [2, 8]], dtype=np.float64)

# Element-wise addition, using the function form
Sum = np.add(arr1, arr2)
print("Addition of Two Arrays: ")
print(Sum)

# Total of all elements of arr1
Sum1 = np.sum(arr1)
print("\nAddition of Array elements: ")
print(Sum1)

# Element-wise square root
Sqrt = np.sqrt(arr1)
print("\nSquare root of Array1 elements: ")
print(Sqrt)

# `.T` is an attribute (not a call) that gives the transposed array
Trans_arr = arr1.T
print("\nTranspose of Array: ")
print(Trans_arr)

Addition of Two Arrays: 
[[ 7. 13.]
 [ 4. 14.]]

Addition of Array elements: 
19.0

Square root of Array1 elements: 
[[2.         2.64575131]
 [1.41421356 2.44948974]]

Transpose of Array: 
[[4. 2.]
 [7. 6.]]


# Numpy | Iterating Over Array

In [5]:
# Visit every element of a 2-D array with nditer.
import numpy as geek

# arange(12) produces 0..11; reshape lays the values out as 3 rows x 4 columns
a = geek.arange(12).reshape(3, 4)

print('Original array is:')
print(a)
print()

print('Modified array is:')

# nditer hands back the elements one at a time
for x in geek.nditer(a):
    print(x, end=" ")

Original array is:
[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]]

Modified array is:
0 1 2 3 4 5 6 7 8 9 10 11 

In [6]:
# Visit the elements of a 2-D array in column-major ('F') order.
import numpy as geek

# 0, 5, ..., 55 laid out as 3 rows x 4 columns
a = geek.arange(0, 60, 5).reshape(3, 4)

print('Original array is:')
print(a)
print()

print('Modified array in F-style order:')

# order='F' walks down each column before moving on to the next one
for x in geek.nditer(a, order='F'):
    print(x, end=" ")

Original array is:
[[ 0  5 10 15]
 [20 25 30 35]
 [40 45 50 55]]

Modified array in F-style order:
0 20 40 5 25 45 10 30 50 15 35 55 

In [7]:
# Modify array values in place while iterating with nditer.
import numpy as geek

# 0..11 laid out as 3 rows x 4 columns
a = geek.arange(12).reshape(3, 4)
print('Original array is:')
print(a)
print()

# Operands are read-only by default, so 'readwrite' is requested.  A writable
# nditer is used as a context manager so that any pending write-back to `a`
# is completed when the block exits.
with geek.nditer(a, op_flags=['readwrite']) as it:
    for x in it:
        # x[...] assigns into the element itself instead of rebinding x
        x[...] = 5 * x
print('Modified array is:')
print(a)

Original array is:
[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]]

Modified array is:
[[ 0  5 10 15]
 [20 25 30 35]
 [40 45 50 55]]


In [8]:
# Iterate with the external_loop flag, which yields 1-D chunks
# instead of single elements.
import numpy as geek

# 0..11 laid out as 3 rows x 4 columns
a = geek.arange(12).reshape(3, 4)

print('Original array is:')
print(a)
print()

print('Modified array is:')
# In the recorded run, C order delivers all twelve values as one array
for x in geek.nditer(a, order='C', flags=['external_loop']):
    print(x)

Original array is:
[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]]

Modified array is:
[ 0  1  2  3  4  5  6  7  8  9 10 11]


In [9]:
# Bitwise AND of two integers with numpy.bitwise_and.
import numpy as geek

in_num1, in_num2 = 10, 11

print("Input  number1 : ", in_num1)
print("Input  number2 : ", in_num2)

# 10 is 0b1010 and 11 is 0b1011; the bits set in both give 0b1010 = 10
out_num = geek.bitwise_and(in_num1, in_num2)
print("bitwise_and of 10 and 11 : ", out_num)

Input  number1 :  10
Input  number2 :  11
bitwise_and of 10 and 11 :  10


In [10]:
import numpy as np

# Put the separator '-' between the characters of the string
joined = np.char.join('-', 'geeksk')
print(joined)

g-e-e-k-s-k


# Pandas 
Pandas generally provides two data structures for manipulating data. They are:

    Series
    DataFrame 

# Series

Pandas Series is a one-dimensional labeled array capable of holding data of any type (integer, string, float, python objects, etc.). The axis labels are collectively called index. Pandas Series is nothing but a column in an excel sheet. Labels need not be unique but must be a hashable type. The object supports both integer and label-based indexing and provides a host of methods for performing operations involving the index

In [11]:
import pandas as pd
import numpy as np

# Source values: a 1-D NumPy array of single-character strings
data = np.array(list('geeks'))

# Wrap the array in a Series; the labels default to 0..n-1
ser = pd.Series(data, dtype='object')
print(ser)


0    g
1    e
2    e
3    k
4    s
dtype: object


# DataFrame

Pandas DataFrame is two-dimensional size-mutable, potentially heterogeneous tabular data structure with labeled axes (rows and columns). A Data frame is a two-dimensional data structure, i.e., data is aligned in a tabular fashion in rows and columns. Pandas DataFrame consists of three principal components, the data, rows, and columns.

In [12]:
# Build a DataFrame from a dict of equal-length lists: every key becomes a
# column and the rows receive the default 0..n-1 index.
import pandas as pd

data = {
    'Name': ['Tom', 'nick', 'krish', 'jack'],
    'Age': [20, 21, 19, 18],
}

df = pd.DataFrame(data)

print(df)

    Name  Age
0    Tom   20
1   nick   21
2  krish   19
3   jack   18


In [13]:
# Select several columns of a DataFrame with the indexing operator.
import pandas as pd

# Employee records, one list per column
data = {
    'Name': ['Jai', 'Princi', 'Gaurav', 'Anuj'],
    'Age': [27, 24, 22, 32],
    'Address': ['Delhi', 'Kanpur', 'Allahabad', 'Kannauj'],
    'Qualification': ['Msc', 'MA', 'MCA', 'Phd'],
}

df = pd.DataFrame(data)

# A list of labels inside the brackets picks those columns, in that order
print(df[['Name', 'Qualification']])

     Name Qualification
0     Jai           Msc
1  Princi            MA
2  Gaurav           MCA
3    Anuj           Phd


In [14]:
# Load the NBA roster CSV, using the "Name" column as the row index.
import pandas as pd
 
# NOTE(review): absolute path on the author's machine; this cell only runs
# where that exact file exists. Consider a path relative to the notebook.
data = pd.read_csv("C:/Users/rzouga/Downloads/Github/MinicourseDatascience/nba.csv", index_col ="Name")
 
# Look up two rows by index label; neither result is displayed by this cell
first = data.loc["Avery Bradley"]
second = data.loc["R.J. Hunter"]
# Last expression of the cell: the first three rows are shown as its output
data.head(3)
# (a print of `first` and `second` was left disabled here by the author)

Unnamed: 0_level_0,Team,Number,Position,Age,Height,Weight,College,Salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,


In [15]:
# Select one column with the indexing operator; the recorded output shows
# it comes back as a Series named "Age" indexed by player name
first = data["Age"]
 
print(first)

Name
Avery Bradley    25.0
Jae Crowder      25.0
John Holland     27.0
R.J. Hunter      22.0
Jonas Jerebko    29.0
                 ... 
Shelvin Mack     26.0
Raul Neto        24.0
Tibor Pleiss     26.0
Jeff Withey      26.0
NaN               NaN
Name: Age, Length: 458, dtype: float64


# Indexing a DataFrame using .loc[ ] :
This function selects data by the label of the rows and columns. The df.loc indexer selects data in a different way than just the indexing operator. It can select subsets of rows or columns. It can also simultaneously select subsets of rows and columns. 

In [16]:
# Retrieve two rows by index label with .loc; each row comes back as a
# Series whose entries are the columns of that row
first = data.loc["Avery Bradley"]
second = data.loc["R.J. Hunter"]
 
# Print both rows, separated by blank lines
print(first, "\n\n\n", second)

Team        Boston Celtics
Number                   0
Position                PG
Age                     25
Height                 6-2
Weight                 180
College              Texas
Salary         7.73034e+06
Name: Avery Bradley, dtype: object 


 Team        Boston Celtics
Number                  28
Position                SG
Age                     22
Height                 6-5
Weight                 185
College      Georgia State
Salary         1.14864e+06
Name: R.J. Hunter, dtype: object


# Indexing a DataFrame using .iloc[ ] :
This function allows us to retrieve rows and columns by position. In order to do that, we’ll need to specify the positions of the rows that we want, and the positions of the columns that we want as well. The df.iloc indexer is very similar to df.loc but only uses integer locations to make its selections.

In [17]:
# Retrieve a row by integer position with .iloc; positions start at 0,
# so 3 is the fourth row (R.J. Hunter in the recorded output)
row2 = data.iloc[3] 
print(row2)

Team        Boston Celtics
Number                  28
Position                SG
Age                     22
Height                 6-5
Weight                 185
College      Georgia State
Salary         1.14864e+06
Name: R.J. Hunter, dtype: object


# Checking for missing values using isnull() and notnull() :
In order to check for missing values in a Pandas DataFrame, we use the functions isnull() and notnull(). Both functions help in checking whether a value is NaN or not. These functions can also be used on a Pandas Series in order to find null values in the series.

In [18]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric
# columns. In the recorded output Salary has a lower count than the other
# columns, i.e. some salaries are missing.
data.describe()

Unnamed: 0,Number,Age,Weight,Salary
count,457.0,457.0,457.0,446.0
mean,17.678337,26.938731,221.522976,4842684.0
std,15.96609,4.404016,26.368343,5229238.0
min,0.0,19.0,161.0,30888.0
25%,5.0,24.0,200.0,1044792.0
50%,13.0,26.0,220.0,2839073.0
75%,25.0,30.0,240.0,6500000.0
max,99.0,40.0,307.0,25000000.0


In [19]:
# Detect missing values with isnull().
import pandas as pd
import numpy as np

# Test scores, with np.nan marking the missing entries.  The variable is
# called `scores` rather than `dict` so that the built-in dict type is not
# shadowed for the rest of the notebook session.
scores = {'First Score': [100, 90, np.nan, 95],
          'Second Score': [30, 45, 56, np.nan],
          'Third Score': [np.nan, 40, 80, 98]}

# Build the DataFrame from the dictionary: one column per key
df = pd.DataFrame(scores)

# Boolean frame of the same shape: True wherever a value is missing
df.isnull()

Unnamed: 0,First Score,Second Score,Third Score
0,False,False,True
1,False,False,False
2,True,False,False
3,False,True,False


# Filling missing values using fillna(), replace() and interpolate() :
In order to fill null values in a dataset, we use the fillna(), replace() and interpolate() functions; these functions replace NaN values with some value of their own. All of these functions help in filling null values in the datasets of a DataFrame. The interpolate() function is also used to fill NA values in the DataFrame, but it uses various interpolation techniques to fill the missing values rather than hard-coding the value.

In [20]:
# Replace missing values with a constant using fillna().
import pandas as pd
import numpy as np

# Test scores, with np.nan marking the missing entries.  The variable is
# called `scores` rather than `dict` so that the built-in dict type is not
# shadowed for the rest of the notebook session.
scores = {'First Score': [100, 90, np.nan, 95],
          'Second Score': [30, 45, 56, np.nan],
          'Third Score': [np.nan, 40, 80, 98]}

# Build the DataFrame from the dictionary: one column per key
df = pd.DataFrame(scores)

# fillna(0) returns a new frame in which every NaN is replaced by 0;
# df itself still contains the NaN values
df.fillna(0)

Unnamed: 0,First Score,Second Score,Third Score
0,100.0,30.0,0.0
1,90.0,45.0,40.0
2,0.0,56.0,80.0
3,95.0,0.0,98.0


# Dropping missing values using dropna() :
In order to drop null values from a DataFrame, we use the dropna() function. This function drops rows or columns of the dataset that contain null values, in different ways.

In [21]:
# Drop the rows that contain missing values; in the recorded output only
# row 1 (the one row with no NaN) remains. The call is not assigned, so
# `df` keeps all of its rows.
df.dropna()

Unnamed: 0,First Score,Second Score,Third Score
1,90.0,45.0,40.0


# Iterating over rows :
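In order to iterate over a DataFrame we can use three functions: iteritems(), iterrows() and itertuples(). iterrows() and itertuples() iterate over the rows, while iteritems() iterates over the columns as (label, Series) pairs (it is called items() in recent pandas; the old name was removed in pandas 2.0).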

**iterrows()**: Iterate over the rows of a DataFrame as (index, Series) pairs....

      for index, row in df.iterrows():
     
          print(row['c1'], row['c2'])

**itertuples()**: Iterate over the rows of a DataFrame as tuples of the values. This is a lot faster than iterrows(), and is in most cases the preferable way to iterate over the values of a DataFrame.

     for row in df.itertuples(index=True, name='Pandas'):
          print(row.c1, row.c2)
 



    How to iterate over rows in a DataFrame in Pandas?

**Answer: DON'T!**

Iteration in Pandas is an anti-pattern and is something you should only do when you have exhausted every other option. You should not use any function with "iter" in its name for more than a few thousand rows or you will have to get used to a lot of waiting.

Do you want to print a DataFrame? Use DataFrame.to_string().

Do you want to compute something? In that case, search for methods in this order (list modified from here):

    Vectorization
    Cython routines
    List Comprehensions (vanilla for loop)
    DataFrame.apply(): i)  Reductions that can be performed in Cython, ii) Iteration in Python space
    DataFrame.itertuples() and iteritems() (named items() in pandas 2.0 and later)
    DataFrame.iterrows()
    
**Next Best Thing: List Comprehensions**

List comprehensions should be your next port of call if 1) there is no vectorized solution available, 2) performance is important, but not important enough to go through the hassle of cythonizing your code, and 3) you're trying to perform an elementwise transformation on your data. There is a good amount of evidence to suggest that list comprehensions are sufficiently fast (and even sometimes faster) for many common Pandas tasks.

The formula is simple,

### Iterating over one column - `f` is some function that processes your data
result = [f(x) for x in df['col']]
### Iterating over two columns, use `zip`
result = [f(x, y) for x, y in zip(df['col1'], df['col2'])]
### Iterating over multiple columns - `same data type`
result = [f(row[0], ..., row[n]) for row in df[['col1', ...,'coln']].to_numpy()]
###  Iterating over multiple columns - `differing data type`
result = [f(row[0], ..., row[n]) for row in zip(df['col1'], ..., df['coln'])]


In [30]:
# Render the whole DataFrame as one plain-text string.
# NOTE(review): the recorded output shows the tom/krish/nick/juli frame that
# is only created in a later cell of this notebook, so this cell appears to
# have been executed out of order.
df.to_string()

'    Name  Age\n0    tom   25\n1  krish   30\n2   nick   26\n3   juli   22'

In [22]:
# Iterate over the rows of a DataFrame with iterrows().
import pandas as pd

# Student records, one list per column.  The variable is called `students`
# rather than `dict` so that the built-in dict type is not shadowed for the
# rest of the notebook session.
students = {'name': ["aparna", "pankaj", "sudhir", "Geeku"],
            'degree': ["MBA", "BCA", "M.Tech", "MBA"],
            'score': [90, 40, 80, 98]}

# Build the DataFrame from the dictionary
df = pd.DataFrame(students)

# iterrows() yields (index label, row) pairs, where the row is a Series
# keyed by column name
for i, j in df.iterrows():
    print(i, j)
    print()

0 name      aparna
degree       MBA
score         90
Name: 0, dtype: object

1 name      pankaj
degree       BCA
score         40
Name: 1, dtype: object

2 name      sudhir
degree    M.Tech
score         80
Name: 2, dtype: object

3 name      Geeku
degree      MBA
score        98
Name: 3, dtype: object



In [23]:
# Gather the column labels of the DataFrame into a plain list
columns = list(df.columns)

# For every column, print the value stored in the row labelled 2
# (the third row under the default 0..n-1 index)
for i in columns:
    print(df.loc[2, i])

sudhir
M.Tech
80


# Creating a Pandas DataFrame

In [24]:
# Build a DataFrame from two parallel lists by zipping them into row tuples.
import pandas as pd

Name = ['tom', 'krish', 'nick', 'juli']   # first column
Age = [25, 30, 26, 22]                    # second column

# Pair the i-th name with the i-th age: one (name, age) tuple per row
list_of_tuples = [(who, years) for who, years in zip(Name, Age)]

# Each tuple becomes a row; `columns` supplies the column labels
df = pd.DataFrame(list_of_tuples, columns=['Name', 'Age'])

# Last expression of the cell: displayed as its output
df

Unnamed: 0,Name,Age
0,tom,25
1,krish,30
2,nick,26
3,juli,22


In [37]:
# Load the NBA roster with the default integer index (no index_col given)
data = pd.read_csv("C:/Users/rzouga/Downloads/Github/MinicourseDatascience/nba.csv")

# How many leading entries to keep
n = 9

# The "Name" column on its own
series = data["Name"]

# head(n) returns the first n entries
top = series.head(n)

# Last expression of the cell: displayed as its output
top

0    Avery Bradley
1      Jae Crowder
2     John Holland
3      R.J. Hunter
4    Jonas Jerebko
5     Amir Johnson
6    Jordan Mickey
7     Kelly Olynyk
8     Terry Rozier
Name: Name, dtype: object

In [26]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric
# columns of `data`
data.describe()

Unnamed: 0,Number,Age,Weight,Salary
count,457.0,457.0,457.0,446.0
mean,17.678337,26.938731,221.522976,4842684.0
std,15.96609,4.404016,26.368343,5229238.0
min,0.0,19.0,161.0,30888.0
25%,5.0,24.0,200.0,1044792.0
50%,13.0,26.0,220.0,2839073.0
75%,25.0,30.0,240.0,6500000.0
max,99.0,40.0,307.0,25000000.0


In [27]:
# describe() on the text column "Name": the recorded output reports count,
# unique, top and freq rather than numeric statistics
desc = data["Name"].describe() 
  
# Last expression of the cell: displayed as its output
desc 

count           457
unique          457
top       Jon Leuer
freq              1
Name: Name, dtype: object

In [28]:
# Describe only the complete rows, with custom percentiles and dtypes.

# Percentiles to report (the recorded output also lists the 50% row)
perc = [.20, .40, .60, .80]

# Column dtypes to include in the summary
include = ['object', 'float', 'int']

# dropna() returns a new frame without the rows that have missing values.
# It is used as a temporary here instead of dropna(inplace=True), so `data`
# keeps all of its rows and re-running this or other cells still sees the
# full data set.
desc = data.dropna().describe(percentiles=perc, include=include)

# Last expression of the cell: displayed as its output
desc

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
count,364,364,364.0,364,364.0,364,364.0,364,364.0
unique,364,30,,5,,17,,115,
top,Darren Collison,New Orleans Pelicans,,SG,,6-9,,Kentucky,
freq,1,16,,87,,49,,22,
mean,,,16.82967,,26.615385,,219.785714,,4620311.0
std,,,14.994162,,4.233591,,24.793099,,5119716.0
min,,,0.0,,19.0,,161.0,,55722.0
20%,,,4.0,,23.0,,195.0,,947276.0
40%,,,9.0,,25.0,,212.0,,1638754.0
50%,,,12.0,,26.0,,220.0,,2515440.0


In [32]:
# Select several columns of a DataFrame with the indexing operator.
import pandas as pd

# Employee records, one list per column.  Bound to `employee_data` rather
# than `data`: the later cell that calls data.drop(["Team", "Weight"], ...)
# expects `data` to still be the NBA DataFrame, and a plain dict has no
# drop() method.
employee_data = {'Name': ['Jai', 'Princi', 'Gaurav', 'Anuj'],
                 'Age': [27, 24, 22, 32],
                 'Address': ['Delhi', 'Kanpur', 'Allahabad', 'Kannauj'],
                 'Qualification': ['Msc', 'MA', 'MCA', 'Phd']}

# Convert the dictionary into a DataFrame
df = pd.DataFrame(employee_data)

# A list of labels inside the brackets picks those columns, in that order
print(df[['Name', 'Qualification']])

     Name Qualification
0     Jai           Msc
1  Princi            MA
2  Gaurav           MCA
3    Anuj           Phd


In [33]:
# New values for the column, one per row of df
address = ['Delhi', 'Bangalore', 'Chennai', 'Patna'] 
  
# Assigning a list to df['Address'] stores it under that label; the
# recorded output shows the previous Address values replaced by the new ones
df['Address'] = address 
  
# Show the frame after the assignment
print(df) 

     Name  Age    Address Qualification
0     Jai   27      Delhi           Msc
1  Princi   24  Bangalore            MA
2  Gaurav   22    Chennai           MCA
3    Anuj   32      Patna           Phd


In [38]:
# Remove the "Team" and "Weight" columns (axis = 1 means columns).
# NOTE(review): inplace = True mutates `data` itself, so later cells see the
# frame without these columns and a second run of this cell would look for
# columns that are already gone.
data.drop(["Team", "Weight"], axis = 1, inplace = True) 
  
# Print the frame after the drop (pandas abbreviates the middle rows)
print(data) 

              Name  Number Position   Age Height            College     Salary
0    Avery Bradley     0.0       PG  25.0    6-2              Texas  7730337.0
1      Jae Crowder    99.0       SF  25.0    6-6          Marquette  6796117.0
2     John Holland    30.0       SG  27.0    6-5  Boston University        NaN
3      R.J. Hunter    28.0       SG  22.0    6-5      Georgia State  1148640.0
4    Jonas Jerebko     8.0       PF  29.0   6-10                NaN  5000000.0
..             ...     ...      ...   ...    ...                ...        ...
453   Shelvin Mack     8.0       PG  26.0    6-3             Butler  2433333.0
454      Raul Neto    25.0       PG  24.0    6-1                NaN   900000.0
455   Tibor Pleiss    21.0        C  26.0    7-3                NaN  2900000.0
456    Jeff Withey    24.0        C  26.0    7-0             Kansas   947276.0
457            NaN     NaN      NaN   NaN    NaN                NaN        NaN

[458 rows x 7 columns]


In [39]:
# One-row frame holding the record to insert (row label 0).  It still
# carries 'Team' and 'Weight', so those columns come back in the result;
# the recorded output shows them empty for the existing rows.
new_row = pd.DataFrame(
    {
        'Name': 'Geeks', 'Team': 'Boston', 'Number': 3,
        'Position': 'PG', 'Age': 33, 'Height': '6-2',
        'Weight': 189, 'College': 'MIT', 'Salary': 99999,
    },
    index=[0],
)

# Put the new row first and renumber all rows from 0
df = pd.concat([new_row, data], ignore_index=True)

# Show the first five rows
df.head(5)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Geeks,Boston,3.0,PG,33.0,6-2,189.0,MIT,99999.0
1,Avery Bradley,,0.0,PG,25.0,6-2,,Texas,7730337.0
2,Jae Crowder,,99.0,SF,25.0,6-6,,Marquette,6796117.0
3,John Holland,,30.0,SG,27.0,6-5,,Boston University,
4,R.J. Hunter,,28.0,SG,22.0,6-5,,Georgia State,1148640.0


In [41]:
# Load the NBA roster with the "Name" column as the row index
data = pd.read_csv("C:/Users/rzouga/Downloads/Github/MinicourseDatascience/nba.csv", index_col='Name')

# Remove three players by index label.  drop() returns a new frame, which is
# rebound to `data` rather than mutated in place.  "R.J. Hunter" used to be
# listed twice; the repeated label added nothing.
data = data.drop(index=["Avery Bradley", "John Holland", "R.J. Hunter"])

# Last expression of the cell: displayed as its output
data

Unnamed: 0_level_0,Team,Number,Position,Age,Height,Weight,College,Salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0
Amir Johnson,Boston Celtics,90.0,PF,29.0,6-9,240.0,,12000000.0
Jordan Mickey,Boston Celtics,55.0,PF,21.0,6-8,235.0,LSU,1170960.0
Kelly Olynyk,Boston Celtics,41.0,C,25.0,7-0,238.0,Gonzaga,2165160.0
...,...,...,...,...,...,...,...,...
Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2433333.0
Raul Neto,Utah Jazz,25.0,PG,24.0,6-1,179.0,,900000.0
Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,,2900000.0
Jeff Withey,Utah Jazz,24.0,C,26.0,7-0,231.0,Kansas,947276.0


# Pandas GroupBy


**Groupby** is a pretty simple concept. We can create a grouping of categories and apply a function to the categories. It’s a simple concept, but it’s an extremely valuable technique that’s widely used in data science. In real data science projects you’ll be dealing with large amounts of data and trying things over and over, so for efficiency we use the Groupby concept. Groupby is really important because its ability to aggregate data efficiently, both in performance and in the amount of code required, is magnificent. Groupby mainly refers to a process involving one or more of the following steps:

    Splitting : It is a process in which we split data into group by applying some conditions on datasets.
    Applying : It is a process in which we apply a function to each group independently
    Combining : It is a process in which we combine different datasets after applying groupby and results into a data structure


In [42]:

# Build the employee DataFrame used by the groupby examples.
import pandas as pd

# Employee records, one list per column; several names occur twice so that
# grouping by Name produces groups with more than one row
data1 = {
    'Name': ['Jai', 'Anuj', 'Jai', 'Princi',
             'Gaurav', 'Anuj', 'Princi', 'Abhi'],
    'Age': [27, 24, 22, 32,
            33, 36, 27, 32],
    'Address': ['Nagpur', 'Kanpur', 'Allahabad', 'Kannuaj',
                'Jaunpur', 'Kanpur', 'Allahabad', 'Aligarh'],
    'Qualification': ['Msc', 'MA', 'MCA', 'Phd',
                      'B.Tech', 'B.com', 'Msc', 'MA'],
}

# One column per dictionary key
df = pd.DataFrame(data1)

print(df)


     Name  Age    Address Qualification
0     Jai   27     Nagpur           Msc
1    Anuj   24     Kanpur            MA
2     Jai   22  Allahabad           MCA
3  Princi   32    Kannuaj           Phd
4  Gaurav   33    Jaunpur        B.Tech
5    Anuj   36     Kanpur         B.com
6  Princi   27  Allahabad           Msc
7    Abhi   32    Aligarh            MA


In [43]:

# Group the rows by a single key column.
  
# The result of this first call is neither assigned nor displayed
df.groupby('Name') 
# .groups maps every group key to the index labels of its rows
print(df.groupby('Name').groups) 


{'Abhi': Int64Index([7], dtype='int64'), 'Anuj': Int64Index([1, 5], dtype='int64'), 'Gaurav': Int64Index([4], dtype='int64'), 'Jai': Int64Index([0, 2], dtype='int64'), 'Princi': Int64Index([3, 6], dtype='int64')}


In [54]:
# Group the rows of df by the value in the Name column
gk = df.groupby('Name') 
# first() gives the first entry of every group, one row per Name;
# as the last expression it is displayed as the cell output
gk.first() 

Unnamed: 0_level_0,Age,Address,Qualification
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Abhi,32,Aligarh,MA
Anuj,24,Kanpur,MA
Gaurav,33,Jaunpur,B.Tech
Jai,27,Nagpur,Msc
Princi,32,Kannuaj,Phd


In [55]:
# Group the employee records by two key columns at once.
import pandas as pd

# Employee records, one list per column
data1 = {
    'Name': ['Jai', 'Anuj', 'Jai', 'Princi',
             'Gaurav', 'Anuj', 'Princi', 'Abhi'],
    'Age': [27, 24, 22, 32,
            33, 36, 27, 32],
    'Address': ['Nagpur', 'Kanpur', 'Allahabad', 'Kannuaj',
                'Jaunpur', 'Kanpur', 'Allahabad', 'Aligarh'],
    'Qualification': ['Msc', 'MA', 'MCA', 'Phd',
                      'B.Tech', 'B.com', 'Msc', 'MA'],
}

df = pd.DataFrame(data1)

# With a list of keys every distinct (Name, Qualification) pair is a group;
# .groups maps each pair to the index labels of its rows
keys = ['Name', 'Qualification']
print(df.groupby(keys).groups)
 


{('Abhi', 'MA'): Int64Index([7], dtype='int64'), ('Anuj', 'B.com'): Int64Index([5], dtype='int64'), ('Anuj', 'MA'): Int64Index([1], dtype='int64'), ('Gaurav', 'B.Tech'): Int64Index([4], dtype='int64'), ('Jai', 'MCA'): Int64Index([2], dtype='int64'), ('Jai', 'Msc'): Int64Index([0], dtype='int64'), ('Princi', 'Msc'): Int64Index([6], dtype='int64'), ('Princi', 'Phd'): Int64Index([3], dtype='int64')}


# Applying function to group

After splitting the data into groups, we apply a function to each group. To do that we perform one of the following operations:

    **Aggregation**: It is a process in which we compute a summary statistic (or statistics) about each group. For example, compute group sums or means
    
    **Transformation**: It is a process in which we perform some group-specific computations and return a like-indexed object. For example, filling NAs within groups with a value derived from each group
    
    **Filtration**: It is a process in which we discard some groups, according to a group-wise computation that evaluates to True or False. For example, filtering out data based on the group sum or mean

 

In [58]:
# Sum the ages per name, with the group keys in sorted order.
import pandas as pd

# Names and ages only, so the sum below has exactly one numeric column
data1 = {'Name': ['Jai', 'Anuj', 'Jai', 'Princi',
                  'Gaurav', 'Anuj', 'Princi', 'Abhi'],
         'Age': [27, 24, 22, 32,
                 33, 36, 27, 32]}

# Build a frame from data1 (the dictionary was previously defined but never
# used).  It gets its own name so the notebook-wide `df`, which later cells
# still group by Qualification, is not replaced.
ages = pd.DataFrame(data1)

# groupby sorts the group keys by default (sort=True), so the totals come
# back ordered alphabetically by Name
ages.groupby(['Name']).sum()


Unnamed: 0_level_0,Age
Name,Unnamed: 1_level_1
Abhi,32
Anuj,60
Gaurav,33
Jai,49
Princi,59


In [59]:
# Same aggregation, but with sort=False: the group keys are left in order of
# first appearance (Jai, Anuj, Princi, ... in the recorded output) instead of
# being sorted, which is the variant that can give a speed-up.
# numeric_only=True states explicitly that only the numeric columns are
# summed, so the text columns are left out of the result.
df.groupby(['Name'], sort=False).sum(numeric_only=True)


Unnamed: 0_level_0,Age
Name,Unnamed: 1_level_1
Jai,49
Anuj,60
Princi,59
Gaurav,33
Abhi,32


In [60]:
# Walk through the groups of a GroupBy object: iteration yields the group
# key together with the sub-frame holding that group's rows.

grp = df.groupby('Name')
for key, members in grp:
    # key, then the rows of the group, then one empty line
    print(key, members, "", sep="\n")


Abhi
   Name  Age  Address Qualification
7  Abhi   32  Aligarh            MA

Anuj
   Name  Age Address Qualification
1  Anuj   24  Kanpur            MA
5  Anuj   36  Kanpur         B.com

Gaurav
     Name  Age  Address Qualification
4  Gaurav   33  Jaunpur        B.Tech

Jai
  Name  Age    Address Qualification
0  Jai   27     Nagpur           Msc
2  Jai   22  Allahabad           MCA

Princi
     Name  Age    Address Qualification
3  Princi   32    Kannuaj           Phd
6  Princi   27  Allahabad           Msc



In [61]:
# Walk through the groups formed by two key columns: here the group key is
# a (Name, Qualification) tuple.

grp = df.groupby(['Name', 'Qualification'])
for key, members in grp:
    # key, then the rows of the group, then one empty line
    print(key, members, "", sep="\n")


('Abhi', 'MA')
   Name  Age  Address Qualification
7  Abhi   32  Aligarh            MA

('Anuj', 'B.com')
   Name  Age Address Qualification
5  Anuj   36  Kanpur         B.com

('Anuj', 'MA')
   Name  Age Address Qualification
1  Anuj   24  Kanpur            MA

('Gaurav', 'B.Tech')
     Name  Age  Address Qualification
4  Gaurav   33  Jaunpur        B.Tech

('Jai', 'MCA')
  Name  Age    Address Qualification
2  Jai   22  Allahabad           MCA

('Jai', 'Msc')
  Name  Age Address Qualification
0  Jai   27  Nagpur           Msc

('Princi', 'Msc')
     Name  Age    Address Qualification
6  Princi   27  Allahabad           Msc

('Princi', 'Phd')
     Name  Age  Address Qualification
3  Princi   32  Kannuaj           Phd



In [62]:
# Select a single group by its key.

grp = df.groupby('Name') 
# get_group returns the rows of the requested group (here the 'Jai' rows)
grp.get_group('Jai') 


Unnamed: 0,Name,Age,Address,Qualification
0,Jai,27,Nagpur,Msc
2,Jai,22,Allahabad,MCA


In [63]:
# Select a single group from a grouping on two columns.

grp = df.groupby(['Name', 'Qualification']) 
# With two grouping columns the key is passed as a (Name, Qualification) tuple
grp.get_group(('Jai', 'Msc')) 


Unnamed: 0,Name,Age,Address,Qualification
0,Jai,27,Nagpur,Msc


In [64]:
# Aggregate every group to the sum of its numeric columns.

grp1 = df.groupby('Name')

# The built-in sum() aggregation replaces aggregate(np.sum).
# numeric_only=True states explicitly that the text columns are left out,
# which matches the recorded output (Age only).
grp1.sum(numeric_only=True)


Unnamed: 0_level_0,Age
Name,Unnamed: 1_level_1
Abhi,32
Anuj,60
Gaurav,33
Jai,49
Princi,59


In [72]:
# Total every numeric column per Name; numeric_only=True states explicitly
# that the text columns are left out of the result.
# NOTE(review): the recorded output contains a Score column, which `df` only
# has after the later cell that adds 'Score' to data1 — this cell appears to
# have been executed out of order.
df.groupby('Name').sum(numeric_only=True)

Unnamed: 0_level_0,Age,Score
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Abhi,32,53
Anuj,60,84
Gaurav,33,47
Jai,49,58
Princi,59,97


In [65]:
# Aggregate groups formed by two key columns to the sum of their numeric
# columns.
grp1 = df.groupby(['Name', 'Qualification'])

# The built-in sum() aggregation replaces aggregate(np.sum).
# numeric_only=True states explicitly that the text columns are left out,
# which matches the recorded output (Age only).
grp1.sum(numeric_only=True)


Unnamed: 0_level_0,Unnamed: 1_level_0,Age
Name,Qualification,Unnamed: 2_level_1
Abhi,MA,32
Anuj,B.com,36
Anuj,MA,24
Gaurav,B.Tech,33
Jai,MCA,22
Jai,Msc,27
Princi,Msc,27
Princi,Phd,32


In [66]:
# Apply several aggregations at once by passing a list of function names.

grp = df.groupby('Name')

# The aggregations are named by string, which selects pandas' own
# implementations.  'std' is the sample standard deviation shown in the
# recorded output (8.485281 for the ages 24 and 36; empty for groups with a
# single row).  The result columns are labelled sum, mean and std.
grp['Age'].agg(['sum', 'mean', 'std'])


Unnamed: 0_level_0,sum,mean,std
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Abhi,32,32.0,
Anuj,60,30.0,8.485281
Gaurav,33,33.0,
Jai,49,24.5,3.535534
Princi,59,29.5,3.535534


In [67]:
# Same aggregation written as one chained expression.  The aggregations are
# named by string, which selects pandas' own implementations ('std' is the
# sample standard deviation shown in the recorded output).
df.groupby('Name')['Age'].agg(['sum', 'mean', 'std'])

Unnamed: 0_level_0,sum,mean,std
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Abhi,32,32.0,
Anuj,60,30.0,8.485281
Gaurav,33,33.0,
Jai,49,24.5,3.535534
Princi,59,29.5,3.535534


In [70]:
# Employee records, now with a numeric 'Score' column in addition to 'Age'
data1 = {'Name': ['Jai', 'Anuj', 'Jai', 'Princi',
                  'Gaurav', 'Anuj', 'Princi', 'Abhi'],
         'Age': [27, 24, 22, 32,
                 33, 36, 27, 32],
         'Address': ['Nagpur', 'Kanpur', 'Allahabad', 'Kannuaj',
                     'Jaunpur', 'Kanpur', 'Allahabad', 'Aligarh'],
         'Qualification': ['Msc', 'MA', 'MCA', 'Phd',
                           'B.Tech', 'B.com', 'Msc', 'MA'],
         'Score': [23, 34, 35, 45, 47, 50, 52, 53]}

# Convert the dictionary into a DataFrame
df = pd.DataFrame(data1)

# Several columns are selected from a GroupBy with a *list* of labels (double
# brackets); the bare form ['Age', 'Score'] is a tuple key, which pandas
# deprecated and later removed.  The aggregations are named by string, which
# selects pandas' own implementations ('std' is the sample standard deviation
# shown in the recorded output).  The result has one (column, statistic) pair
# per output column.
df.groupby('Name')[['Age', 'Score']].agg(['sum', 'mean', 'std'])

  


Unnamed: 0_level_0,Age,Age,Age,Score,Score,Score
Unnamed: 0_level_1,sum,mean,std,sum,mean,std
Name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Abhi,32,32.0,,53,53.0,
Anuj,60,30.0,8.485281,84,42.0,11.313708
Gaurav,33,33.0,,47,47.0,
Jai,49,24.5,3.535534,58,29.0,8.485281
Princi,59,29.5,3.535534,97,48.5,4.949747


In [71]:

# Use a different aggregation per column by passing a dictionary that maps
# column name -> aggregation name
grp = df.groupby('Name') 
  
# Age is summed and Score gets its standard deviation; the recorded output
# leaves Score empty for names that occur only once
grp.agg({'Age' : 'sum', 'Score' : 'std'}) 


Unnamed: 0_level_0,Age,Score
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Abhi,32,
Anuj,60,11.313708
Gaurav,33,
Jai,49,8.485281
Princi,59,4.949747


In [73]:
# Group-wise transformation with transform().
grp = df.groupby('Name')
# Standardise a column within its group (subtract the group mean, divide by
# the group standard deviation) and scale the result by 10
sc = lambda x: (x - x.mean()) / x.std()*10
# Only the numeric columns are transformed: the formula is not defined for
# the text columns (Address, Qualification).  The result keeps the row index
# of df; the recorded output is empty for names that occur only once.
grp[['Age', 'Score']].transform(sc)


Unnamed: 0,Age,Score
0,7.071068,-7.071068
1,-7.071068,-7.071068
2,-7.071068,7.071068
3,7.071068,-7.071068
4,,
5,7.071068,7.071068
6,-7.071068,7.071068
7,,


In [74]:
# Filter whole groups with filter(): the function receives each group and
# the group is kept when it returns True
grp = df.groupby('Name') 
# Keep only the names that occur at least twice; the recorded output no
# longer contains the single-row groups (rows 4 and 7)
grp.filter(lambda x: len(x) >= 2) 


Unnamed: 0,Name,Age,Address,Qualification,Score
0,Jai,27,Nagpur,Msc,23
1,Anuj,24,Kanpur,MA,34
2,Jai,22,Allahabad,MCA,35
3,Princi,32,Kannuaj,Phd,45
5,Anuj,36,Kanpur,B.com,50
6,Princi,27,Allahabad,Msc,52


# Example Groupby, complete exercise:


In [84]:
# Load the NBA roster with the default integer index.
# NOTE(review): absolute path on the author's machine; this cell only runs
# where that exact file exists.
data = pd.read_csv("C:/Users/rzouga/Downloads/Github/MinicourseDatascience/nba.csv")
# Group the rows by the value in the "Team" column
gk = data.groupby('Team')

# first() gives the first entry of every group; head(3) limits the
# displayed result to the first three teams
gk.first().head(3) 


Unnamed: 0_level_0,Name,Number,Position,Age,Height,Weight,College,Salary
Team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Atlanta Hawks,Kent Bazemore,24.0,SF,26.0,6-5,201.0,Old Dominion,2000000.0
Boston Celtics,Avery Bradley,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
Brooklyn Nets,Bojan Bogdanovic,44.0,SG,27.0,6-8,216.0,Oklahoma State,3425510.0


In [80]:
# Rows that belong to the "Boston Celtics" group; head(3) shows the first
# three of them
gk.get_group('Boston Celtics').head(3)

Unnamed: 0,Name,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,30.0,SG,27.0,6-5,205.0,Boston University,


In [87]:
# Group by "Team" first and, within each team, by "Position": every
# distinct (Team, Position) pair forms one group
gkk = data.groupby(['Team', 'Position']) 

# First entry of every (Team, Position) group; displayed as the cell output
gkk.first() 


Unnamed: 0_level_0,Unnamed: 1_level_0,Name,Number,Age,Height,Weight,College,Salary
Team,Position,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Atlanta Hawks,C,Al Horford,15.0,30.0,6-10,245.0,Florida,12000000.0
Atlanta Hawks,PF,Kris Humphries,43.0,31.0,6-9,235.0,Minnesota,1000000.0
Atlanta Hawks,PG,Dennis Schroder,17.0,22.0,6-1,172.0,Wake Forest,1763400.0
Atlanta Hawks,SF,Kent Bazemore,24.0,26.0,6-5,201.0,Old Dominion,2000000.0
Atlanta Hawks,SG,Tim Hardaway Jr.,10.0,24.0,6-6,205.0,Michigan,1304520.0
...,...,...,...,...,...,...,...,...
Washington Wizards,C,Marcin Gortat,13.0,32.0,6-11,240.0,North Carolina State,11217391.0
Washington Wizards,PF,Drew Gooden,90.0,34.0,6-10,250.0,Kansas,3300000.0
Washington Wizards,PG,Ramon Sessions,7.0,30.0,6-3,190.0,Nevada,2170465.0
Washington Wizards,SF,Jared Dudley,1.0,30.0,6-7,225.0,Boston College,4375000.0


In [1]:
import pandas as pd

# Build a frame holding 72 consecutive hourly timestamps that start at
# 2011-01-01 00:00. The lowercase "h" alias is used because the uppercase
# "H" alias is deprecated in recent pandas releases.
rng = pd.DataFrame()
rng['date'] = pd.date_range('1/1/2011', periods=72, freq='h')

# Derive one column per calendar/clock component through the .dt accessor
rng['year'] = rng['date'].dt.year
rng['month'] = rng['date'].dt.month
rng['day'] = rng['date'].dt.day
rng['hour'] = rng['date'].dt.hour
rng['minute'] = rng['date'].dt.minute

# Display the first rows together with the derived feature columns
# (only the last expression of a cell is rendered, so this is the single preview)
rng.head(3)


Unnamed: 0,date,year,month,day,hour,minute
0,2011-01-01 00:00:00,2011,1,1,0,0
1,2011-01-01 01:00:00,2011,1,1,1,0
2,2011-01-01 02:00:00,2011,1,1,2,0


In [4]:
# Capture the current date and time as a timezone-naive pandas Timestamp
t = pd.Timestamp.now(tz=None)
t


Timestamp('2020-10-23 10:55:06.743577')

In [6]:
# Pass the Timestamp through pd.to_datetime.
# NOTE: the recorded output of this cell is still a pandas Timestamp with the
# same value, so no visible type conversion takes place here.
pd.to_datetime(t) 


Timestamp('2020-10-23 10:55:06.743577')

In [10]:
# Read the individual date/time components of the Timestamp.
# A notebook cell renders only the value of its last expression, so the
# components are gathered into a single tuple to show all of them at once:
# (year, month, day, hour, minute, second)
t.year, t.month, t.day, t.hour, t.minute, t.second


(2020, 10, 23, 10, 55, 6)

In [11]:
import pandas as pd

# Short link to the UFO sightings CSV. HTTPS is used so the download is not
# transferred in clear text.
url = 'https://bit.ly/uforeports'

# read_csv accepts a URL and downloads the file before parsing it
df = pd.read_csv(url)
df.head()


Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
0,Ithaca,,TRIANGLE,NY,6/1/1930 22:00
1,Willingboro,,OTHER,NJ,6/30/1930 20:00
2,Holyoke,,OVAL,CO,2/15/1931 14:00
3,Abilene,,DISK,KS,6/1/1931 13:00
4,New York Worlds Fair,,LIGHT,NY,4/18/1933 19:00


In [12]:
# Parse the text stored in the "Time" column into datetime values
# and write the result back to the same column
df['Time'] = pd.to_datetime(df['Time'])

df.head()


Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
0,Ithaca,,TRIANGLE,NY,1930-06-01 22:00:00
1,Willingboro,,OTHER,NJ,1930-06-30 20:00:00
2,Holyoke,,OVAL,CO,1931-02-15 14:00:00
3,Abilene,,DISK,KS,1931-06-01 13:00:00
4,New York Worlds Fair,,LIGHT,NY,1933-04-18 19:00:00


In [13]:
# Show the dtype of every column; after the conversion in the previous cell
# the recorded output lists "Time" as datetime64[ns]
df.dtypes 


City                       object
Colors Reported            object
Shape Reported             object
State                      object
Time               datetime64[ns]
dtype: object

In [15]:
# Extract the hour-of-day component of each timestamp (first five rows)
df['Time'].dt.hour.head()


0    22
1    20
2    14
3    13
4    19
Name: Time, dtype: int64

In [18]:
# Look up the weekday name of each timestamp (first five rows)
df['Time'].dt.day_name().head()

0     Sunday
1     Monday
2     Sunday
3     Monday
4    Tuesday
Name: Time, dtype: object

In [19]:
# Extract the ordinal day within the year of each timestamp (first five rows)
df['Time'].dt.dayofyear.head()


0    152
1    181
2     46
3    152
4    108
Name: Time, dtype: int64

# Python | Pandas Working With Text Data

In [20]:
# Import pandas package
import pandas as pd

# Employee records, one list of values per column
data = {
    'Name': ['Jai', 'Princi', 'Gaurav', 'Anuj'],
    'Age': [27, 24, 22, 32],
    'Address': ['Delhi', 'Kanpur', 'Allahabad', 'Kannauj'],
    'Qualification': ['Msc', 'MA', 'MCA', 'Phd'],
}

# Build the DataFrame and replace the "Name" column with its
# lowercase form in one chained expression
df = pd.DataFrame(data).assign(Name=lambda frame: frame["Name"].str.lower())

print(df)

     Name  Age    Address Qualification
0     jai   27      Delhi           Msc
1  princi   24     Kanpur            MA
2  gaurav   22  Allahabad           MCA
3    anuj   32    Kannauj           Phd


In [27]:
# Load the NBA roster CSV into a DataFrame.
# A relative path is used so the notebook does not depend on one user's home
# directory. Assumes nba.csv sits in the notebook's working directory (the
# original path pointed into the MinicourseDatascience folder) - TODO confirm.
data = pd.read_csv("nba.csv")

# Overwrite the "Team" column with its uppercase form
data["Team"] = data["Team"].str.upper()

# Preview the first three rows
data.head(3)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,BOSTON CELTICS,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,BOSTON CELTICS,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,BOSTON CELTICS,30.0,SG,27.0,6-5,205.0,Boston University,


In [26]:
# importing pandas module
import pandas as pd

# Employee records for this example. They are bound to their own name so the
# NBA DataFrame that the CSV-loading cell stores in `data` is not overwritten
# before the later cell that calls data["Age"].replace(...).
employee_data = {'Name': ['Jai', 'Princi', 'Gaurav', 'Anuj'],
                 'Age': [27, 24, 22, 32],
                 'Address': ['Nagpur', 'Kanpur', 'Allahabad', 'Knnuaj'],
                 'Qualification': ['Msc', 'MA', 'MCA', 'Phd']}

# Convert the dictionary into DataFrame
df = pd.DataFrame(employee_data)

# Remove rows that contain missing values (this sample has none)
# before running the string operation
df = df.dropna()

# Split every address at the first "a" and keep only the part in front of it.
# The split is not expanded into a two-column frame, because a multi-column
# frame cannot be assigned to the single "Address" column.
df["Address"] = df["Address"].str.split("a", n=1).str[0]

# df display
print(df)

     Name  Age Address Qualification
0     Jai   27       N           Msc
1  Princi   24       K            MA
2  Gaurav   22     All           MCA
3    Anuj   32    Knnu           Phd


In [31]:
# importing pandas module
import pandas as pd

# Overwrite the "Age" column, replacing the numeric value 25.0 with a text label
data["Age"] = data["Age"].replace(25.0, "Twenty five")

# Boolean mask that marks the rows whose age is now "Twenty five".
# It is deliberately not called `filter`, so the built-in filter() stays usable.
age_is_twenty_five = data["Age"] == "Twenty five"

# Keep only the matching rows. dropna() additionally discards matching rows
# that have a missing value in any column.
data[age_is_twenty_five].dropna().head(3)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,BOSTON CELTICS,0.0,PG,Twenty five,6-2,180.0,Texas,7730337.0
1,Jae Crowder,BOSTON CELTICS,99.0,SF,Twenty five,6-6,235.0,Marquette,6796117.0
7,Kelly Olynyk,BOSTON CELTICS,41.0,C,Twenty five,7-0,238.0,Gonzaga,2165160.0


In [32]:
# importing pandas module
import pandas as pd

# Employee records, one list of values per column
data = {
    'Name': ['Jai', 'Princi', 'Gaurav', 'Anuj'],
    'Age': [27, 24, 22, 32],
    'Address': ['Nagpur', 'Kanpur', 'Allahabad', 'Kannuaj'],
    'Qualification': ['Msc', 'MA', 'MCA', 'Phd'],
}

# Build the DataFrame from the dictionary
df = pd.DataFrame(data)

# Independent copy of the address column
new = df["Address"].copy()

# Overwrite "Name" with "<name>, <address>" by joining the two
# string columns element-wise
df["Name"] = df["Name"].str.cat(others=new, sep=", ")

# display
print(df)

                Name  Age    Address Qualification
0        Jai, Nagpur   27     Nagpur           Msc
1     Princi, Kanpur   24     Kanpur            MA
2  Gaurav, Allahabad   22  Allahabad           MCA
3      Anuj, Kannuaj   32    Kannuaj           Phd


In [33]:
# importing pandas module
import pandas as pd

# Three short strings, each a letter followed by a digit
s = pd.Series(['a1', 'b2', 'c3'])

# Two capture groups: a letter that must be "a" or "b", then one digit.
# Every group becomes a column; entries that do not match yield NaN.
n = s.str.extract(r'([ab])(\d)', expand=True)

print(n)

     0    1
0    a    1
1    b    2
2  NaN  NaN


In [34]:
# Same extraction with named capture groups: the group names
# ("Geeks" and "For") become the column labels of the result
n = s.str.extract(r'(?P<Geeks>[ab])(?P<For>\d)', expand=True)

print(n)

  Geeks  For
0     a    1
1     b    2
2   NaN  NaN
