In [1]:
import math
import numpy as np
import pandas as pd

# Mathematik und Anwendung

OK, in bestimmten mathematischen "Räumen" haben wir einen Abstand. Der "Raum" gibt unter anderem die Dimension vor. Was machen wir damit aber nun in der Anwendung?!

Wenn wir uns unsere Daten anschauen:

In [2]:
# Location of the bank-transactions CSV.
# NOTE(review): this is a machine-specific path under the user's home directory.
file_name = '~/neuefische/data-fish/data/bank_transactions.csv'

# Read the raw transactions into a DataFrame and preview the first rows.
data = pd.read_csv(filepath_or_buffer=file_name)
data.head()

Unnamed: 0,step,customer,age,gender,merchant,category,amount,fraud
0,0,C1093826151,4,M,M348934600,es_transportation,4.55,0
1,0,C352968107,2,M,M348934600,es_transportation,39.68,0
2,0,C2054744914,4,F,M1823072687,es_transportation,26.89,0
3,0,C1760612790,3,M,M348934600,es_transportation,17.25,0
4,0,C757503768,5,M,M348934600,es_transportation,35.72,0


Was ich ja machen will, ist, die Unterschiede zwischen einzelnen Zeilen zu erkennen, oder anders gesagt, Muster zwischen den Zeilen zu erkennen. Wie aber messe ich mathematisch den Abstand zwischen zwei Zeilen in Bezug auf das Geschlecht?!

**Antwort: Wir kodieren die Kategorien in verschiedenen Dimensionen!**

# One Hot Encoding

In [3]:
# One-hot encode the categorical columns and append the indicator columns
# to the original frame (column-wise concat, original columns stay in place).
# The dtype is pinned to uint8: pandas >= 2.0 would otherwise produce bool
# indicators, which no longer matches the 0/1 tables and the uint8 dtypes
# shown in this notebook.
data_one_hot = pd.concat(
    [data, pd.get_dummies(data[['age', 'gender', 'category']], dtype='uint8')],
    axis=1,
    sort=False,
)

In [4]:
# Überblick über die Spalten:
# data_one_hot.columns

In [5]:
# Preview the first five rows, now including the one-hot indicator columns.
data_one_hot.head(n=5)

Unnamed: 0,step,customer,age,gender,merchant,category,amount,fraud,age_0,age_1,...,category_es_home,category_es_hotelservices,category_es_hyper,category_es_leisure,category_es_otherservices,category_es_sportsandtoys,category_es_tech,category_es_transportation,category_es_travel,category_es_wellnessandbeauty
0,0,C1093826151,4,M,M348934600,es_transportation,4.55,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0,C352968107,2,M,M348934600,es_transportation,39.68,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,0,C2054744914,4,F,M1823072687,es_transportation,26.89,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,0,C1760612790,3,M,M348934600,es_transportation,17.25,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0,C757503768,5,M,M348934600,es_transportation,35.72,0,0,0,...,0,0,0,0,0,0,0,1,0,0


---

## Der Abstand

In [6]:
def euklid_d(x,y):
    """Return the Euclidean distance between two vectors.

    Parameters
    ----------
    x, y : array-like
        Numeric vectors of the same length (numpy arrays, lists, tuples, ...).

    Returns
    -------
    float
        ``sqrt(sum((x - y) ** 2))``.

    Notes
    -----
    Both inputs are converted to float before subtracting. Unsigned integer
    arrays (e.g. the uint8 columns produced by one-hot encoding) would
    otherwise wrap around on subtraction and squaring and yield a wrong
    distance without any error.
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    d = x - y  # element-wise difference
    squared_sum = (d ** 2).sum()  # sum of the element-wise squares
    return math.sqrt(squared_sum)  # its square root is the Euclidean distance

In [7]:
# Names of all one-hot indicator columns, grouped by the source column
# they were derived from.
one_hot_columns = [
    # age
    'age_0', 'age_1', 'age_2', 'age_3', 'age_4', 'age_5', 'age_6', 'age_U',
    # gender
    'gender_E', 'gender_F', 'gender_M', 'gender_U',
    # category
    'category_es_barsandrestaurants',
    'category_es_contents',
    'category_es_fashion',
    'category_es_food',
    'category_es_health',
    'category_es_home',
    'category_es_hotelservices',
    'category_es_hyper',
    'category_es_leisure',
    'category_es_otherservices',
    'category_es_sportsandtoys',
    'category_es_tech',
    'category_es_transportation',
    'category_es_travel',
    'category_es_wellnessandbeauty',
]

In [8]:
# Show only the indicator columns for row labels 0 through 2
# (.loc slices by label and includes the end label).
data_one_hot.loc[0:2, one_hot_columns]

Unnamed: 0,age_0,age_1,age_2,age_3,age_4,age_5,age_6,age_U,gender_E,gender_F,...,category_es_home,category_es_hotelservices,category_es_hyper,category_es_leisure,category_es_otherservices,category_es_sportsandtoys,category_es_tech,category_es_transportation,category_es_travel,category_es_wellnessandbeauty
0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,0,0,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0


In [9]:
# Rows 0 and 2 as plain integer vectors. get_dummies produces unsigned
# integers, so the values are cast to signed int before computing differences.
x, y = (
    data_one_hot.loc[row_label, one_hot_columns].values.astype('int')
    for row_label in (0, 2)
)

In [10]:
# Euclidean distance between the one-hot vectors of row 0 and row 2.
euklid_d(x, y)

1.4142135623730951

**MEGA!**

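**Wir können Ähnlichkeiten zwischen Zeilen berechnen!**

Zeile 0 und Zeile 2 unterscheiden sich nur im Geschlecht (M vs. F), also in genau zwei One-Hot-Dimensionen (`gender_M` und `gender_F`). Daher ist der Abstand $\sqrt{1^2 + 1^2} = \sqrt{2} \approx 1{,}414$.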

---

In welchem Datenformat (dtype) die Werte abgespeichert sind, kann man mit `.info()` auf einem DataFrame herausfinden.

In [11]:
# Print a per-column summary of the frame (non-null counts and dtypes).
data_one_hot.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 594643 entries, 0 to 594642
Data columns (total 35 columns):
step                              594643 non-null int64
customer                          594643 non-null object
age                               594643 non-null object
gender                            594643 non-null object
merchant                          594643 non-null object
category                          594643 non-null object
amount                            594643 non-null float64
fraud                             594643 non-null int64
age_0                             594643 non-null uint8
age_1                             594643 non-null uint8
age_2                             594643 non-null uint8
age_3                             594643 non-null uint8
age_4                             594643 non-null uint8
age_5                             594643 non-null uint8
age_6                             594643 non-null uint8
age_U                             594643 n