# Pandas
## Summary
- Get to know the relevant dependencies: NumPy, pandas, Matplotlib, seaborn, and scikit-learn
- Reading CSV files
- pandas DataFrames
- Grouping, e.g. to compute the median per category

## Content
- Install Dependencies (Read CSV)
- Multi Layer Random Sampling
- Working with Pandas

## Install Dependencies

In [1]:
# Import every third-party dependency used by the notebook in one group, then
# report the installed version of each so the run can be reproduced later.
import numpy as np
import pandas as pd
import matplotlib
import seaborn as sns
import sklearn

# NOTE(review): some labels are misspelled ('Panas', 'Searborn',
# 'Sci-kitlearn'); they are kept verbatim so the recorded output still matches.
print('Numpy version: ', np.__version__)
print('Panas version: ', pd.__version__)
print('Matplotlib version: ', matplotlib.__version__)
print('Searborn version: ', sns.__version__)
print('Sci-kitlearn version: ', sklearn.__version__)

Numpy version:  1.16.2
Panas version:  0.24.2
Matplotlib version:  3.0.3
Searborn version:  0.9.0
Sci-kitlearn version:  0.20.3


In [2]:
# Read the whole text file and keep it as a list with one string per line;
# splitlines() drops the line terminators. The context manager closes the file.
with open('../data/brent.txt','r') as f:
    brent = f.read().splitlines()

In [3]:
# Preview the first five entries instead of dumping the whole list.
brent[:5]

['Brent Park', 'Church End', 'Dollis Hill', 'Harlesden', 'Kensal Green']

## Multi Layer Random Sampling 

In [4]:
# Consecutive IDs 1..1700; np.arange excludes the stop value, hence the "+ 1".
business_ids = np.arange(1, 1700 + 1)
business_ids.shape

(1700,)

In [5]:
# Split the IDs into 10 groups of 170 consecutive values: one group per row.
id_matrix = business_ids.reshape(10, 170)
id_matrix.shape

(10, 170)

In [6]:
# Show the first group (row 0) and confirm the matrix layout.
print(id_matrix[0])
print(id_matrix.shape)

[  1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18
  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36
  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54
  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72
  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90
  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107 108
 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126
 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144
 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162
 163 164 165 166 167 168 169 170]
(10, 170)


In [7]:
# Put one group per column. The matrix is rebuilt from `business_ids` rather
# than transposing `id_matrix` onto itself, so re-running this cell always
# yields the (170, 10) layout instead of flipping the matrix back and forth.
id_matrix = np.reshape(business_ids, (10, 170)).T
print(id_matrix[:,0])
print(id_matrix.shape)

[  1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18
  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36
  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54
  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72
  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90
  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107 108
 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126
 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144
 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162
 163 164 165 166 167 168 169 170]
(170, 10)


In [8]:
# Smallest and largest ID of every group; axis=0 reduces down each column.
group_min = id_matrix.min(axis=0)
group_max = id_matrix.max(axis=0)
print(group_min)
print(group_max)

[   1  171  341  511  681  851 1021 1191 1361 1531]
[ 170  340  510  680  850 1020 1190 1360 1530 1700]


In [9]:
# Fix the global NumPy seed so the draws below are reproducible.
np.random.seed(9001)
n_groups = id_matrix.shape[1]
for group in range(n_groups):
    # Draw 5 distinct IDs (no replacement) from this group's column.
    members = id_matrix[:, group]
    picks = np.random.choice(members, 5, replace=False)
    print('Group {}: {}'.format(group + 1, picks))

Group 1: [  7  37 124  41  17]
Group 2: [302 261 257 323 234]
Group 3: [464 486 463 440 474]
Group 4: [645 582 666 631 553]
Group 5: [699 738 705 760 792]
Group 6: [ 995 1000  909  869  899]
Group 7: [1042 1058 1172 1122 1153]
Group 8: [1304 1275 1343 1344 1236]
Group 9: [1401 1417 1383 1387 1474]
Group 10: [1642 1687 1545 1644 1549]


## Working with Pandas
Inspecting core functionality: `head` and `tail`, `median`, and `groupby`.

In [10]:
# Load the iris CSV (relative path) into a DataFrame.
df = pd.read_csv('../data/iris.csv')

In [11]:
# Small working sample: the first five and the last five rows of the frame.
toy_df = pd.concat([df.head(5), df.tail(5)])

In [12]:
# Display the sample frame built from the head and tail of `df`.
toy_df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica
149,5.9,3.0,5.1,1.8,virginica


In [13]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
toy_df.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,10.0,10.0,10.0,10.0
mean,5.59,3.13,3.29,1.13
std,0.807534,0.316403,1.995244,0.992248
min,4.6,2.5,1.3,0.2
25%,4.925,3.0,1.4,0.2
50%,5.5,3.05,3.25,1.0
75%,6.275,3.35,5.175,1.975
max,6.7,3.6,5.4,2.3


In [14]:
# Select a subset of columns, in the order listed, keeping every row.
toy_df.loc[:, ['petal_width', 'petal_length']]

Unnamed: 0,petal_width,petal_length
0,0.2,1.4
1,0.2,1.4
2,0.2,1.3
3,0.2,1.5
4,0.2,1.4
145,2.3,5.2
146,1.9,5.0
147,2.0,5.2
148,2.3,5.4
149,1.8,5.1


In [15]:
# Derived feature: element-wise product of petal width and petal length,
# stored as a new column on the sample frame.
toy_df['petal_area'] = toy_df['petal_width'].mul(toy_df['petal_length'])

toy_df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,petal_area
0,5.1,3.5,1.4,0.2,setosa,0.28
1,4.9,3.0,1.4,0.2,setosa,0.28
2,4.7,3.2,1.3,0.2,setosa,0.26
3,4.6,3.1,1.5,0.2,setosa,0.3
4,5.0,3.6,1.4,0.2,setosa,0.28
145,6.7,3.0,5.2,2.3,virginica,11.96
146,6.3,2.5,5.0,1.9,virginica,9.5
147,6.5,3.0,5.2,2.0,virginica,10.4
148,6.2,3.4,5.4,2.3,virginica,12.42
149,5.9,3.0,5.1,1.8,virginica,9.18


In [16]:
# Boolean Series: True where the petal area is strictly greater than 10.
petal_area_mask = toy_df['petal_area'].gt(10)
petal_area_mask

0      False
1      False
2      False
3      False
4      False
145     True
146    False
147     True
148     True
149    False
Name: petal_area, dtype: bool

In [17]:
# Boolean Series: True where the sepal width is strictly greater than 3.
sepal_width_mask = toy_df['sepal_width'].gt(3)
sepal_width_mask

0       True
1      False
2       True
3       True
4       True
145    False
146    False
147    False
148     True
149    False
Name: sepal_width, dtype: bool

In [18]:
# Keep only the rows where both conditions hold (element-wise AND of the masks).
toy_df.loc[sepal_width_mask & petal_area_mask]

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,petal_area
148,6.2,3.4,5.4,2.3,virginica,12.42


In [19]:
# Virginica rows ordered by petal area, smallest first.
# `ascending` must be a real boolean: the string 'True' only worked because a
# non-empty string is truthy (so 'False' would have sorted ascending as well),
# and stricter pandas versions reject non-boolean values outright.
toy_df[toy_df['species'] == 'virginica'].sort_values('petal_area', axis=0, ascending=True)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,petal_area
149,5.9,3.0,5.1,1.8,virginica,9.18
146,6.3,2.5,5.0,1.9,virginica,9.5
147,6.5,3.0,5.2,2.0,virginica,10.4
145,6.7,3.0,5.2,2.3,virginica,11.96
148,6.2,3.4,5.4,2.3,virginica,12.42


In [20]:
# Setosa rows ordered by petal area, smallest first.
# `ascending` must be a real boolean: the string 'True' only worked because a
# non-empty string is truthy (so 'False' would have sorted ascending as well),
# and stricter pandas versions reject non-boolean values outright.
toy_df[toy_df['species'] == 'setosa'].sort_values('petal_area', axis=0, ascending=True)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,petal_area
2,4.7,3.2,1.3,0.2,setosa,0.26
0,5.1,3.5,1.4,0.2,setosa,0.28
1,4.9,3.0,1.4,0.2,setosa,0.28
4,5.0,3.6,1.4,0.2,setosa,0.28
3,4.6,3.1,1.5,0.2,setosa,0.3


In [21]:
# Median petal area per species: group the rows, pick the column, aggregate.
toy_df.groupby('species')['petal_area'].agg('median')

species
setosa        0.28
virginica    10.40
Name: petal_area, dtype: float64