In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from numpy.random import rand
from sklearn.metrics.pairwise import euclidean_distances

<p>Panda has series and dataframes</p>

<h1> Series </h1>
<p> 1D Cell</p>
Position on labels are not the same things


In [5]:
s = pd.Series([2,4,-12, 0, 2])
s

0     2
1     4
2   -12
3     0
4     2
dtype: int64

In [9]:
s.shape

(5,)

In [10]:
s.dtype

dtype('int64')

In [11]:
#Convert series into numpy arrays
s.values

array([  2,   4, -12,   0,   2])

In [13]:
#Indexing
s[3]

np.int64(0)

In [15]:
# Boolean indexing
# Give a list of bools 
# Very powerful and common

s[[True, True, False, False, True]]

0    2
1    4
4    2
dtype: int64

In [16]:
# Element Wise
s * 2

0     4
1     8
2   -24
3     0
4     4
dtype: int64

In [17]:
# Goes through S and compares each element
# Produces a series of datastructure

s > 0

0     True
1     True
2    False
3    False
4     True
dtype: bool

In [18]:
# This is a series/list of booleans
s[s > 0]

0    2
1    4
4    2
dtype: int64

In [None]:
# Could do
s.sum()
s.mean()


In [24]:
# Couldn't mean
s.unique()


array([  2,   4, -12,   0])

In [20]:
s.value_counts()

 2     2
 4     1
-12    1
 0     1
Name: count, dtype: int64

In [23]:
# Copy and convert data from integers to floats
s.astype(float)

0     2.0
1     4.0
2   -12.0
3     0.0
4     2.0
dtype: float64

<h1> DataFrame </h1>
<p> 2D Data Structure </p>

In [70]:
df = pd.read_csv("../datasets/dataset_stop_and_searchA.csv")

In [43]:
df

Unnamed: 0,Gender,Age,Suspect-ethnicity,Officer-ethnicity,Object-of-search,Outcome
0,Male,,Other ethnic group - Not stated,Black,Offensive weapons,A no further action disposal
1,Male,18-24,Other ethnic group - Not stated,Asian,Controlled drugs,A no further action disposal
2,Male,18-24,Other ethnic group - Not stated,Asian,Controlled drugs,A no further action disposal
3,Male,18-24,Other ethnic group - Not stated,Asian,Controlled drugs,Community resolution
4,Male,18-24,Other ethnic group - Not stated,Asian,Controlled drugs,Community resolution
5,Male,18-24,Other ethnic group - Not stated,White,Controlled drugs,A no further action disposal
6,Male,18-24,White - Any other White background,White,Controlled drugs,A no further action disposal
7,Male,25-34,Asian/Asian British - Indian,Asian,Stolen goods,Arrest
8,Male,18-24,White - Any other White background,White,Controlled drugs,A no further action disposal
9,Male,,Other ethnic group - Any other ethnic group,Black,Controlled drugs,A no further action disposal


In [27]:
df.columns

Index(['Gender', 'Age', 'Suspect-ethnicity', 'Officer-ethnicity',
       'Object-of-search', 'Outcome'],
      dtype='object')

In [28]:
df.index

RangeIndex(start=0, stop=40, step=1)

In [29]:
df.shape

(40, 6)

In [30]:
# Strings = objects (Assuming its Java)
df.dtypes

Gender               object
Age                  object
Suspect-ethnicity    object
Officer-ethnicity    object
Object-of-search     object
Outcome              object
dtype: object

In [32]:
# 36 is made up of strings and numbers, strings and NaN's
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40 entries, 0 to 39
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Gender             40 non-null     object
 1   Age                36 non-null     object
 2   Suspect-ethnicity  40 non-null     object
 3   Officer-ethnicity  40 non-null     object
 4   Object-of-search   40 non-null     object
 5   Outcome            40 non-null     object
dtypes: object(6)
memory usage: 2.0+ KB


In [34]:
df.describe(include="all")

Unnamed: 0,Gender,Age,Suspect-ethnicity,Officer-ethnicity,Object-of-search,Outcome
count,40,36,40,40,40,40
unique,2,4,10,4,4,3
top,Male,18-24,Other ethnic group - Not stated,Black,Controlled drugs,A no further action disposal
freq,37,22,12,16,22,24


In [36]:
# Extract 1 Column (Returns a series)
df["Suspect-ethnicity"]

0                       Other ethnic group - Not stated
1                       Other ethnic group - Not stated
2                       Other ethnic group - Not stated
3                       Other ethnic group - Not stated
4                       Other ethnic group - Not stated
5                       Other ethnic group - Not stated
6                    White - Any other White background
7                          Asian/Asian British - Indian
8                    White - Any other White background
9           Other ethnic group - Any other ethnic group
10                   White - Any other White background
11    Black/African/Caribbean/Black British - Any ot...
12      Black/African/Caribbean/Black British - African
13                   White - Any other White background
14                   White - Any other White background
15    Black/African/Caribbean/Black British - Caribbean
16                      Other ethnic group - Not stated
17    Black/African/Caribbean/Black British - Ca

In [37]:
df["Suspect-ethnicity"].unique()

array(['Other ethnic group - Not stated',
       'White - Any other White background',
       'Asian/Asian British - Indian',
       'Other ethnic group - Any other ethnic group',
       'Black/African/Caribbean/Black British - Any other Black/African/Caribbean background',
       'Black/African/Caribbean/Black British - African',
       'Black/African/Caribbean/Black British - Caribbean',
       'Mixed/Multiple ethnic groups - White and Black African',
       'White - English/Welsh/Scottish/Northern Irish/British',
       'Asian/Asian British - Any other Asian background'], dtype=object)

In [38]:
df["Suspect-ethnicity"].value_counts()

Suspect-ethnicity
Other ethnic group - Not stated                                                         12
White - Any other White background                                                       7
Black/African/Caribbean/Black British - Any other Black/African/Caribbean background     5
Asian/Asian British - Any other Asian background                                         4
White - English/Welsh/Scottish/Northern Irish/British                                    4
Black/African/Caribbean/Black British - Caribbean                                        3
Black/African/Caribbean/Black British - African                                          2
Other ethnic group - Any other ethnic group                                              1
Asian/Asian British - Indian                                                             1
Mixed/Multiple ethnic groups - White and Black African                                   1
Name: count, dtype: int64

In [39]:
df[["Suspect-ethnicity", "Officer-ethnicity"]]

Unnamed: 0,Suspect-ethnicity,Officer-ethnicity
0,Other ethnic group - Not stated,Black
1,Other ethnic group - Not stated,Asian
2,Other ethnic group - Not stated,Asian
3,Other ethnic group - Not stated,Asian
4,Other ethnic group - Not stated,Asian
5,Other ethnic group - Not stated,White
6,White - Any other White background,White
7,Asian/Asian British - Indian,Asian
8,White - Any other White background,White
9,Other ethnic group - Any other ethnic group,Black


In [41]:
# Extract Rows/Position

df.iloc[4]

Gender                                          Male
Age                                            18-24
Suspect-ethnicity    Other ethnic group - Not stated
Officer-ethnicity                              Asian
Object-of-search                    Controlled drugs
Outcome                         Community resolution
Name: 4, dtype: object

In [42]:
df.loc[4]

Gender                                          Male
Age                                            18-24
Suspect-ethnicity    Other ethnic group - Not stated
Officer-ethnicity                              Asian
Object-of-search                    Controlled drugs
Outcome                         Community resolution
Name: 4, dtype: object

In [44]:
# If something is deleted the position is changed but not the index
# OR if it is shuffled


In [45]:
# Extract rows on condition
df["Officer-ethnicity"] == "Black"

0      True
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9      True
10    False
11     True
12     True
13    False
14    False
15     True
16    False
17    False
18     True
19     True
20     True
21    False
22     True
23     True
24    False
25     True
26    False
27    False
28     True
29    False
30     True
31    False
32    False
33    False
34     True
35     True
36    False
37    False
38    False
39     True
Name: Officer-ethnicity, dtype: bool

In [46]:
# Boolean indexing
# Inside is a series of boolean arrays
df[df["Officer-ethnicity"] == "Black"]

Unnamed: 0,Gender,Age,Suspect-ethnicity,Officer-ethnicity,Object-of-search,Outcome
0,Male,,Other ethnic group - Not stated,Black,Offensive weapons,A no further action disposal
9,Male,,Other ethnic group - Any other ethnic group,Black,Controlled drugs,A no further action disposal
11,Female,18-24,Black/African/Caribbean/Black British - Any ot...,Black,Stolen goods,A no further action disposal
12,Male,10-17,Black/African/Caribbean/Black British - African,Black,Offensive weapons,A no further action disposal
15,Male,25-34,Black/African/Caribbean/Black British - Caribbean,Black,Offensive weapons,A no further action disposal
18,Male,25-34,Black/African/Caribbean/Black British - Any ot...,Black,Controlled drugs,A no further action disposal
19,Male,25-34,Other ethnic group - Not stated,Black,Controlled drugs,A no further action disposal
20,Male,18-24,Mixed/Multiple ethnic groups - White and Black...,Black,Controlled drugs,A no further action disposal
22,Female,18-24,White - English/Welsh/Scottish/Northern Irish/...,Black,Controlled drugs,Community resolution
23,Male,18-24,Black/African/Caribbean/Black British - African,Black,Controlled drugs,Arrest


In [50]:
# Extract rows with multiple requirements
df[(df["Officer-ethnicity"] == "Black") & (df["Object-of-search"] == "Stolen goods")]

Unnamed: 0,Gender,Age,Suspect-ethnicity,Officer-ethnicity,Object-of-search,Outcome
11,Female,18-24,Black/African/Caribbean/Black British - Any ot...,Black,Stolen goods,A no further action disposal
30,Male,18-24,Black/African/Caribbean/Black British - Any ot...,Black,Stolen goods,Arrest


In [71]:
# How to delete rows from DF
df = df[ df["Gender"] == "Female"].copy()

In [61]:
df

Unnamed: 0,Gender,Age,Suspect-ethnicity,Officer-ethnicity,Object-of-search,Outcome
11,Female,18-24,Black/African/Caribbean/Black British - Any ot...,Black,Stolen goods,A no further action disposal
22,Female,18-24,White - English/Welsh/Scottish/Northern Irish/...,Black,Controlled drugs,Community resolution
27,Female,18-24,Asian/Asian British - Any other Asian background,Asian,Controlled drugs,A no further action disposal


In [62]:
# Position
df.iloc[0]

Gender                                                          Female
Age                                                              18-24
Suspect-ethnicity    Black/African/Caribbean/Black British - Any ot...
Officer-ethnicity                                                Black
Object-of-search                                          Stolen goods
Outcome                                   A no further action disposal
Name: 11, dtype: object

In [64]:
# Error, no index 0
df.loc[0]

KeyError: 0

In [66]:
df.reset_index(drop=True, inplace=True)

In [67]:
df

Unnamed: 0,Gender,Age,Suspect-ethnicity,Officer-ethnicity,Object-of-search,Outcome
0,Female,18-24,Black/African/Caribbean/Black British - Any ot...,Black,Stolen goods,A no further action disposal
1,Female,18-24,White - English/Welsh/Scottish/Northern Irish/...,Black,Controlled drugs,Community resolution
2,Female,18-24,Asian/Asian British - Any other Asian background,Asian,Controlled drugs,A no further action disposal


In [72]:
# Deleting Columns
df = df[["Gender", "Age", "Object-of-search", "Outcome"]].copy() 

In [73]:
df

Unnamed: 0,Gender,Age,Object-of-search,Outcome
11,Female,18-24,Stolen goods,A no further action disposal
22,Female,18-24,Controlled drugs,Community resolution
27,Female,18-24,Controlled drugs,A no further action disposal


In [76]:
# Drop is more efficient
df.drop("Age", axis=1, inplace=True)

KeyError: "['Age'] not found in axis"

In [75]:
df

Unnamed: 0,Gender,Object-of-search,Outcome
11,Female,Stolen goods,A no further action disposal
22,Female,Controlled drugs,Community resolution
27,Female,Controlled drugs,A no further action disposal


In [None]:
# Dropping NaNs
dropna