In [1]:
import csv
import numpy as np

In [16]:
# Read the CSV file into a list of rows; each row is a list of strings.
# Fields are separated by ','. The header row is kept, so it is wines[0].
# newline='' is what the csv module documentation asks for when opening a file for csv.reader,
# so that newlines embedded inside quoted fields are handled by the reader itself.
with open('winequality-red.csv', 'r', newline='') as f:
    wines = list(csv.reader(f, delimiter=','))

In [17]:
#confirm the container type: the csv.reader rows collected with list() form a plain Python list
type(wines)

list

In [18]:
#peek at the first three rows: the header row followed by two data rows (every value is still a string)
wines[:3]

[['fixed acidity',
  'volatile acidity',
  'citric acid',
  'residual sugar',
  'chlorides',
  'free sulfur dioxide',
  'total sulfur dioxide',
  'density',
  'pH',
  'sulphates',
  'alcohol',
  'quality'],
 ['7.4',
  '0.7',
  '0',
  '1.9',
  '0.076',
  '11',
  '34',
  '0.9978',
  '3.51',
  '0.56',
  '9.4',
  '5'],
 ['7.8',
  '0.88',
  '0',
  '2.6',
  '0.098',
  '25',
  '67',
  '0.9968',
  '3.2',
  '0.68',
  '9.8',
  '5']]

In [21]:
# Average wine quality using plain Python (no NumPy yet).
# Skip the header row (index 0), read the last field of every remaining row as a float, then take the mean.
qualities = list(map(float, (row[-1] for row in wines[1:])))
average_quality = sum(qualities) / len(qualities)
print("Average quality of wines =", average_quality)

Average quality of wines = 5.6360225140712945


With NumPy, we work with multidimensional arrays.  A 2-dimensional array is also known as a matrix, and is something you should be familiar with. In fact, it’s just a different way of thinking about a list of lists. A matrix has rows and columns. By specifying a row number and a column number, we’re able to extract an element from a matrix. 

In [24]:
# Convert the list of lists into a 2-D NumPy array of floats.
# wines[1:] drops the header row; dtype=float parses the numeric strings into float64 values.
# The old alias np.float was deprecated in NumPy 1.20 and removed in 1.24; the builtin float is equivalent.
# NOTE: this rebinds `wines` from a list to an array, so re-running this cell on its own would
# slice off the first data row instead of the header.
wines = np.array(wines[1:], dtype=float)
wines

array([[  7.4  ,   0.7  ,   0.   , ...,   0.56 ,   9.4  ,   5.   ],
       [  7.8  ,   0.88 ,   0.   , ...,   0.68 ,   9.8  ,   5.   ],
       [  7.8  ,   0.76 ,   0.04 , ...,   0.65 ,   9.8  ,   5.   ],
       ..., 
       [  6.3  ,   0.51 ,   0.13 , ...,   0.75 ,  11.   ,   6.   ],
       [  5.9  ,   0.645,   0.12 , ...,   0.71 ,  10.2  ,   5.   ],
       [  6.   ,   0.31 ,   0.47 , ...,   0.66 ,  11.   ,   6.   ]])

In [25]:
#get the dimensions of the array as a (number of rows, number of columns) tuple
wines.shape

(1599, 12)

In [27]:
# Create a 3x4 array filled with zeros.
# np.zeros initialises every element to 0.0, so the array is zero-filled rather than uninitialised.
empty_array = np.zeros(shape=(3, 4), dtype=float)
empty_array

array([[ 0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.]])

In [46]:
#create a 4x4 numpy 2d array of random numbers (np.random.rand draws uniformly from [0, 1))
print(np.random.rand(4,4))
#similarly create a 1d array with 4 random numbers
#no seed is set in this cell, so the printed values change on every run
print(np.random.rand(4))

[[ 0.82703479  0.76655068  0.9902614   0.27930781]
 [ 0.87107552  0.53851953  0.46679055  0.11197686]
 [ 0.16011837  0.24681891  0.88815951  0.28904628]
 [ 0.56119932  0.81842323  0.67745728  0.70792732]]
[ 0.18588318  0.73764738  0.10122692  0.66757099]


In [31]:
#read a csv file directly to numpy array
#Use the genfromtxt function to read in the winequality-red.csv file.
#Specify the keyword argument delimiter="," so that the fields are parsed properly.
#Specify the keyword argument skip_header=1 so that the header row is skipped.
wines = np.genfromtxt('winequality-red.csv',delimiter=',',skip_header=1)
wines

array([[  7.4  ,   0.7  ,   0.   , ...,   0.56 ,   9.4  ,   5.   ],
       [  7.8  ,   0.88 ,   0.   , ...,   0.68 ,   9.8  ,   5.   ],
       [  7.8  ,   0.76 ,   0.04 , ...,   0.65 ,   9.8  ,   5.   ],
       ..., 
       [  6.3  ,   0.51 ,   0.13 , ...,   0.75 ,  11.   ,   6.   ],
       [  5.9  ,   0.645,   0.12 , ...,   0.71 ,  10.2  ,   5.   ],
       [  6.   ,   0.31 ,   0.47 , ...,   0.66 ,  11.   ,   6.   ]])

NumPy is zero-indexed, meaning that the index of the first row is 0, and the index of the first column is 0. If we want to work with the fourth row, we’d use index 3; if we want to work with the second row, we’d use index 1, and so on.

In [33]:
#select the first three items (rows 0, 1 and 2) from the fourth column (column index 3)
print(wines[0:3,3])
#a slice start of 0 can be omitted, so this prints the same three items
print(wines[:3,3])

[ 1.9  2.6  2.3]
[ 1.9  2.6  2.3]


In [38]:
#select the entire 4th column (column index 3)
print(wines[:,3])
#select the entire 5th row (row index 4)
print(wines[4,:])
#we can overwrite an entire column: this sets every value in column 11 (quality) to 50, in place
#NOTE: the original quality values are lost until the data is loaded again
wines[:,11] = 50

[ 1.9  2.6  2.3 ...,  2.3  2.   3.6]
[  7.4      0.7      0.       1.9      0.076   11.      34.       0.9978
   3.51     0.56     9.4      5.    ]


In [43]:
#insert a column before column 3. All the values in this column will be 0
#np.insert returns a new array and leaves wines unchanged; this first result is neither stored nor displayed
np.insert(wines,3,values=0,axis=1)
#instead of 0s, we can insert a vector as well. Note that each column in the numpy array will be a 1d array, or a vector
np.insert(wines,3,values = wines[:,3],axis = 1)

array([[  7.40000000e+00,   7.00000000e-01,   0.00000000e+00, ...,
          5.60000000e-01,   9.40000000e+00,   5.00000000e+01],
       [  7.80000000e+00,   8.80000000e-01,   0.00000000e+00, ...,
          6.80000000e-01,   9.80000000e+00,   5.00000000e+01],
       [  7.80000000e+00,   7.60000000e-01,   4.00000000e-02, ...,
          6.50000000e-01,   9.80000000e+00,   5.00000000e+01],
       ..., 
       [  6.30000000e+00,   5.10000000e-01,   1.30000000e-01, ...,
          7.50000000e-01,   1.10000000e+01,   5.00000000e+01],
       [  5.90000000e+00,   6.45000000e-01,   1.20000000e-01, ...,
          7.10000000e-01,   1.02000000e+01,   5.00000000e+01],
       [  6.00000000e+00,   3.10000000e-01,   4.70000000e-01, ...,
          6.60000000e-01,   1.10000000e+01,   5.00000000e+01]])

In [45]:
#store the 4th row of wines (row index 3) into a 1d array
#(despite its name, third_wine holds the fourth row because indexing is zero-based)
third_wine = wines[3,:]
print(third_wine)
#get the 5th element (index 4)
print(third_wine[4])

[ 11.2     0.28    0.56    1.9     0.075  17.     60.      0.998   3.16
   0.58    9.8    50.   ]
0.075


In [3]:
#converting from float to int
print("datatype of wines array =",wines.dtype)
#astype returns a new array; fractional parts are truncated when casting to int
int_wines = wines.astype(int)
#.dtype.name gives the dtype as a plain string; the exact integer width depends on the platform
print("datatype of int_wines array =",int_wines.dtype.name)

datatype of wines array = float64
datatype of int_wines array = int32


In [15]:
#split a 1d array into n equal chunks (here 12 elements into 4 chunks of 3)
array1 =  [500, 505, 490, 810, 450, 678, 234, 897, 430, 560, 1023, 640]
array_split = np.array_split(array1,4)
print(array_split)
#np.array_split returns a list of arrays, so convert it to a single 2d array.
#All chunks have the same length here, which is what makes the result a regular (4, 3) array.
#dtype=int replaces the alias np.int, which was deprecated in NumPy 1.20 and removed in 1.24.
array_split = np.array(array_split,dtype = int)
print("Dimensions of the n-dimensional numpy array: ",array_split.shape)
#print the 3rd element of the 2nd row (row index 1, column index 2)
print("3rd element of 2nd row: ",array_split[1,2])

[array([500, 505, 490]), array([810, 450, 678]), array([234, 897, 430]), array([ 560, 1023,  640])]
Dimensions of the n-dimensional numpy array:  (4, 3)
3rd element of 2nd row:  678


In [22]:
#Note the difference in the following two operations
#add 10 points to each quality score: this builds a new array and leaves wines unchanged
print(wines[:,11]+10)
#the following performs the same addition but modifies the quality column in place
#(re-running this cell adds another 10 each time)
wines[:,11] += 10
print(wines[:,11])

[ 15.  15.  15. ...,  16.  15.  16.]
[ 15.  15.  15. ...,  16.  15.  16.]


In [32]:
#multiply two columns of an array elementwise (column 10 = alcohol, column 11 = quality)
print(wines[:,10] * wines[:,11])
#calculate the sum of all the elements of a column (truncated to an int for display)
print("total quality score for all wines: ",int(wines[:,11].sum()))

[ 47.  49.  49. ...,  66.  51.  66.]
total quality score for all wines:  9012


Elementwise operations normally require the arrays you’re operating on to have exactly the same shape. When the shapes differ, NumPy performs broadcasting to try to match up elements. Essentially, broadcasting involves a few steps:

    The last dimension of each array is compared.
        If the dimension lengths are equal, or one of the dimensions is of length 1, then we keep going.
        If the dimension lengths aren’t equal, and none of the dimensions have length 1, then there’s an error.
    Continue checking dimensions until the shortest array is out of dimensions.

For example, the following two shapes are compatible:

A: (50,3)
B: (3,)

This is because the length of the trailing dimension of array A is 3, and the length of the trailing dimension of array B is 3. They’re equal, so that dimension is okay. Array B is then out of dimensions, so we’re okay, and the arrays are compatible for mathematical operations.

The following two shapes are also compatible:

A: (1,2)
B: (50,2)

The last dimension matches, and A is of length 1 in the first dimension.

These two arrays don’t match:

A: (50,50)
B: (49,49)

The lengths of the dimensions aren’t equal, and neither array has a dimension of length 1.

In [30]:
# Broadcasting demo: a (2, 2) matrix plus a (2,) vector.
# The trailing dimensions agree (2 == 2), so the vector is added to every row of the matrix.
array1 = np.array([[1, 2], [3, 4]])
array2 = np.array([4, 5])
print(array1.shape)
print(array2.shape)
print(len(array2))
print(array1 + array2)

(2, 2)
(2,)
2
[[5 7]
 [7 9]]


In [35]:
#axis 0 runs down the rows, axis 1 runs across the columns.
#axis = 0 collapses the rows, so we get one sum per column (the total of that column over all rows).
#The resulting sums are printed in the form of a 1d array
print(wines.sum(axis = 0))
#similarly, axis = 1 collapses the columns, so we get one sum per row
print(wines.sum(axis = 1))

[ 13303.1        843.985      433.29      4059.55       139.859    25384.
  74302.        1593.79794   5294.47      1052.38     16666.35      9012.     ]
[  74.5438   123.0548    99.699   ...,  100.48174  105.21547   92.49249]


There are several other methods that behave like the sum method, including:

    numpy.ndarray.mean – finds the mean of an array.
    numpy.ndarray.std – finds the standard deviation of an array.
    numpy.ndarray.min – finds the minimum value in an array.
    numpy.ndarray.max – finds the maximum value in an array.


In [37]:
# for example, find the max. element in each row (axis = 1 gives one value per row)
wines.max(axis = 1)

array([ 34.,  67.,  54., ...,  40.,  44.,  42.])

In [38]:
#check which elements in the 5th row (row index 4) are greater than 5
#the comparison is applied elementwise and returns a boolean array of the same length as the row
wines[4,:] > 5

array([ True, False, False, False, False,  True,  True, False, False,
       False,  True, False], dtype=bool)

In [41]:
# Print the first 3 wines whose quality rating (column 11) is more than 6.
# high_quality is a boolean mask with one True/False entry per row of wines.
high_quality = wines[:, 11] > 6
high_quality_wines = wines[high_quality]
print(high_quality_wines[:3])

[[  7.30000000e+00   6.50000000e-01   0.00000000e+00   1.20000000e+00
    6.50000000e-02   1.50000000e+01   2.10000000e+01   9.94600000e-01
    3.39000000e+00   4.70000000e-01   1.00000000e+01   7.00000000e+00]
 [  7.80000000e+00   5.80000000e-01   2.00000000e-02   2.00000000e+00
    7.30000000e-02   9.00000000e+00   1.80000000e+01   9.96800000e-01
    3.36000000e+00   5.70000000e-01   9.50000000e+00   7.00000000e+00]
 [  8.50000000e+00   2.80000000e-01   5.60000000e-01   1.80000000e+00
    9.20000000e-02   3.50000000e+01   1.03000000e+02   9.96900000e-01
    3.30000000e+00   7.50000000e-01   1.05000000e+01   7.00000000e+00]]


In [48]:
# Wines with a lot of alcohol (column 10 > 10) AND high quality (column 11 > 6).
# Build one boolean mask per condition, combine them elementwise with &,
# then show only the last two columns of the first 5 matching rows.
is_high_quality = wines[:, 11] > 6
has_lot_alcohol = wines[:, 10] > 10
high_quality_lot_alcohol = is_high_quality & has_lot_alcohol
print(wines[high_quality_lot_alcohol, 10:][:5])

[[ 10.5   7. ]
 [ 10.5   7. ]
 [ 13.    7. ]
 [ 10.3   7. ]
 [ 10.8   7. ]]


In [52]:
#transpose the matrix (rows become columns) and show the resulting shape
print(np.transpose(wines).shape)
#flatten the array from 2d to 1d, one row after another
print(np.ravel(wines))

(12, 1599)
[  7.4    0.7    0.   ...,   0.66  11.     6.  ]


In [54]:
# Reshape the second row of wines (row index 1, 12 values) into a 2 x 6 array.
second_row = wines[1]
print(second_row.reshape(2, 6))

[[  7.8      0.88     0.       2.6      0.098   25.    ]
 [ 67.       0.9968   3.2      0.68     9.8      5.    ]]


In [59]:
# Write the last 100 rows of wines to wines-last100.csv
last_100 = wines[-100:]
np.savetxt("wines-last100.csv", last_100, delimiter=',')
# Write the first 100 rows of wines to wines-first100.csv
first_100 = wines[:100]
np.savetxt("wines-first100.csv", first_100, delimiter=',')

In [66]:
# Take the two 100-row subsets once, then combine them in different ways.
first_100 = wines[:100]
last_100 = wines[-100:]
# Vertically: rows are stacked on top of each other
print(np.vstack((first_100, last_100)).shape)
# Horizontally: columns are placed side by side
print(np.hstack((first_100, last_100)).shape)
# concatenate does the same job: axis = 0 behaves like vstack, axis = 1 like hstack
print(np.concatenate((first_100, last_100), axis=0).shape)

(200, 12)
(100, 24)
(200, 12)
