In [None]:
# Mount Google Drive inside the Colab runtime so that files stored on the
# drive can be read from paths under /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
from pathlib import Path
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.ensemble import IsolationForest
import plotly.express as px
from sklearn.decomposition import PCA

In [None]:
# Location of the CSV file holding the live evaluation entries.
# Adjust this path to wherever the file is saved on your own Google Drive.
data_path = Path('/content/gdrive/MyDrive/CSIRO_workshop/Dataanalysis_workshop.csv')

In [None]:
# Load the workshop entries from the CSV at data_path into a DataFrame.
df = pd.read_csv(data_path)

In [None]:
# Display the loaded entries as read from the CSV.
df

Unnamed: 0,State,Year of Birth (YYYY),Country of Birth,highest academic degree,Identity as aboriginal/torres strait islander,Citizen/PR,First Language,gender
0,VIC,1997,Germany,Master Degree,No,No,german,woman
1,WA,1972,USA,Master Degree,No,Yes,english,woman
2,NSW,1995,Australia,Bachelor Degree,No,Yes,english,woman
3,WA,1999,Israel,High School Diploma,No,Yes,hebrew,man
4,WA,2000,Australia,High School Diploma,No,Yes,english,man
5,WA,1990,Australia,Master Degree,No,Yes,english,man
6,WA,2001,Germany,High School Diploma,No,Yes,german,man
7,WA,1992,Australia,High School Diploma,No,Yes,english,man
8,WA,1994,Australia,Bachelor Degree,No,Yes,english,man


In [None]:
# Ordinal-encode every column of df in one call. fit_transform is given the
# whole frame, so numeric columns (the birth year) are also replaced by
# category codes, and the result is a plain NumPy array without column labels.
# NOTE(review): df_transformed is reassigned by the following cell, so on a
# top-to-bottom run this result is overwritten before anything reads it —
# confirm which of the two encodings is the intended one.
le = OrdinalEncoder()
df_transformed = le.fit_transform(df)

In [None]:
# Encode only the text (object-dtype) columns as integer-valued codes; numeric
# columns such as the birth year keep their original values.
#
# OrdinalEncoder is scikit-learn's encoder for feature columns (LabelEncoder is
# documented as an encoder for target values). Fitting it once over all
# categorical columns also keeps each column's mapping in `le.categories_`,
# whereas refitting a single encoder per column discards all but the last one.
categorical_cols = df.select_dtypes(include='O').columns
le = OrdinalEncoder()
df_transformed = df.copy()
# Skip encoding when there is nothing to encode: the encoder cannot be fitted
# on a frame with zero columns.
if len(categorical_cols) > 0:
    encoded = le.fit_transform(df[categorical_cols])
    # Categories are sorted within each column, so the codes are the same ones
    # a per-column LabelEncoder would assign; they are stored as floats
    # (OrdinalEncoder's default output dtype).
    for position, col in enumerate(categorical_cols):
        df_transformed[col] = encoded[:, position]

In [None]:
# Display the encoded feature matrix that the models below are fitted on.
# NOTE(review): the saved output under this cell is a NumPy array, while the
# cell directly above builds a DataFrame copy of df — the stored output looks
# stale; re-run the notebook top to bottom to refresh it.
df_transformed

array([[1., 5., 1., 2., 0., 0., 1., 1.],
       [2., 0., 3., 2., 0., 1., 0., 1.],
       [0., 4., 0., 0., 0., 1., 0., 1.],
       [2., 6., 2., 1., 0., 1., 2., 0.],
       [2., 7., 0., 1., 0., 1., 0., 0.],
       [2., 1., 0., 2., 0., 1., 0., 0.],
       [2., 8., 1., 1., 0., 1., 1., 0.],
       [2., 2., 0., 1., 0., 1., 0., 0.],
       [2., 3., 0., 0., 0., 1., 0., 0.]])

In [None]:
# Fit an isolation forest on the encoded features and label every row in the
# same step. Despite its name, `clf` holds the per-row labels (-1 or 1, as seen
# in the 'anomaly' column further down), not the fitted estimator.
# The fixed random_state keeps the labels reproducible between runs.
isolation_forest = IsolationForest(random_state=0)
clf = isolation_forest.fit_predict(df_transformed)

In [None]:
# Attach the isolation-forest labels to the original rows as a new 'anomaly'
# column (df is modified in place).
df['anomaly'] = clf

In [None]:
# Display the entries together with their 'anomaly' label.
df

Unnamed: 0,State,Year of Birth (YYYY),Country of Birth,highest academic degree,Identity as aboriginal/torres strait islander,Citizen/PR,First Language,gender,anomaly
0,VIC,1997,Germany,Master Degree,No,No,german,woman,-1
1,WA,1972,USA,Master Degree,No,Yes,english,woman,-1
2,NSW,1995,Australia,Bachelor Degree,No,Yes,english,woman,-1
3,WA,1999,Israel,High School Diploma,No,Yes,hebrew,man,-1
4,WA,2000,Australia,High School Diploma,No,Yes,english,man,1
5,WA,1990,Australia,Master Degree,No,Yes,english,man,1
6,WA,2001,Germany,High School Diploma,No,Yes,german,man,1
7,WA,1992,Australia,High School Diploma,No,Yes,english,man,1
8,WA,1994,Australia,Bachelor Degree,No,Yes,english,man,1


In [None]:
# Project the encoded features onto their first two principal components so
# the rows can be drawn on a 2-D scatter plot.
pca = PCA(n_components=2)
pca_result = pca.fit_transform(df_transformed)

# Store each component as its own column on df (one column of pca_result per
# name, in order).
for column_name, component in zip(('pca-one', 'pca-two'), pca_result.T):
    df[column_name] = component

# Report how much of the total variance each of the two components captures.
print(f'Explained variation per principal component: {pca.explained_variance_ratio_}')

Explained variation per principal component: [0.72673848 0.14813161]


In [None]:
# Display the entries with the 'anomaly' label and the two PCA coordinates.
df

Unnamed: 0,State,Year of Birth (YYYY),Country of Birth,highest academic degree,Identity as aboriginal/torres strait islander,Citizen/PR,First Language,gender,anomaly,pca-one,pca-two
0,VIC,1997,Germany,Master Degree,No,No,german,woman,-1,-0.965266,0.772141
1,WA,1972,USA,Master Degree,No,Yes,english,woman,-1,4.236012,2.000666
2,NSW,1995,Australia,Bachelor Degree,No,Yes,english,woman,-1,-0.067963,-1.445862
3,WA,1999,Israel,High School Diploma,No,Yes,hebrew,man,-1,-2.147357,1.597724
4,WA,2000,Australia,High School Diploma,No,Yes,english,man,1,-2.941025,-0.721377
5,WA,1990,Australia,Master Degree,No,Yes,english,man,1,3.037246,-0.544903
6,WA,2001,Germany,High School Diploma,No,Yes,german,man,1,-4.016732,0.49121
7,WA,1992,Australia,High School Diploma,No,Yes,english,man,1,1.967444,-0.898167
8,WA,1994,Australia,Bachelor Degree,No,Yes,english,man,1,0.897641,-1.251432


In [None]:
# Cast the -1 / 1 labels to strings so Plotly treats them as two discrete
# categories (one colour each) instead of a continuous colour scale.
df["anomaly"] = df["anomaly"].astype(str)
fig = px.scatter(
    df,
    x="pca-one",
    y="pca-two",
    color="anomaly",
    hover_data=['State', 'Country of Birth', 'highest academic degree', 'Identity as aboriginal/torres strait islander', 'Citizen/PR'],
    # Pin each colour to its label. A plain colour sequence is handed out in
    # order of first appearance, so the meaning of red/blue would flip whenever
    # the first row happens to be an inlier.
    color_discrete_map={"-1": "indianred", "1": "mediumblue"},
    title="Isolation Forest labels in PCA space (-1 = anomaly, 1 = inlier)",
)

fig.show()