In this notebook we train a classifier on the number-of-invariants data and use it to make a conjecture about the number of invariants on certain Hurwitz curves.

In [22]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

First, we can just look at the data (we are going to use pandas).

In [23]:
# Load the group-action feature table from CSV and display it.
raw_data = pd.read_csv("group_action_features_invariants_data.csv")
raw_data

Unnamed: 0,g,go,L,p2,ni,nic,nos,nes,mro,df,di
0,2,2,0,1,1,1,0,6,2,3,4
1,2,3,0,0,0,0,4,0,3,1,0
2,2,4,0,2,1,1,0,4,4,1,2
3,2,4,0,2,3,3,0,5,2,2,2
4,2,5,1,0,0,0,3,0,5,0,0
...,...,...,...,...,...,...,...,...,...,...,...
1321,11,8,0,3,5,3,0,9,2,6,6
1322,11,8,0,3,5,3,0,9,2,6,6
1323,11,8,0,3,1,1,0,6,4,3,-1
1324,11,8,0,3,1,1,0,6,4,3,-1


In [24]:
# The model inputs are every column except `di`, the dimension of the space of
# invariants (the final column of the table), which is what we want to predict.
# Dropping the target by name rather than by position keeps this correct even
# if the column order of the CSV changes.
group_input = raw_data.drop(columns="di")
group_input

Unnamed: 0,g,go,L,p2,ni,nic,nos,nes,mro,df
0,2,2,0,1,1,1,0,6,2,3
1,2,3,0,0,0,0,4,0,3,1
2,2,4,0,2,1,1,0,4,4,1
3,2,4,0,2,3,3,0,5,2,2
4,2,5,1,0,0,0,3,0,5,0
...,...,...,...,...,...,...,...,...,...,...
1321,11,8,0,3,5,3,0,9,2,6
1322,11,8,0,3,5,3,0,9,2,6
1323,11,8,0,3,1,1,0,6,4,3
1324,11,8,0,3,1,1,0,6,4,3


In [25]:
# We use two encodings of the target `di`:
#   1. group_output_1: boolean, True when there is a unique invariant (di == 0);
#   2. group_output_2: three classes, -1 for no invariants (di < 0),
#      0 for a unique invariant (di == 0) and 1 for many invariants (di > 0).
group_output_1 = raw_data["di"].eq(0)

def classify_invs(di):
    """Map a `di` value to its three-way class label.

    Returns 0 when `di` equals zero, -1 when `di` is negative, and 1 for
    every other value.
    """
    if di == 0:
        return 0
    return -1 if di < 0 else 1
# Three-class target: apply the sign-style labelling to every `di` value.
group_output_2 = raw_data["di"].apply(classify_invs)

In [26]:
# Input the sample data for the Hurwitz curves. Each row lists the feature
# values in the same order as the columns of `group_input`.
samples = [[3, 168, 1, 3, 21, 1, 2, 1, 7, 0],
[7, 504, 1, 3, 63, 1, 2, 1, 7, 0],
[14, 1092, 1, 2, 91, 1, 2, 1, 7, 0],
[118, 9828, 1, 2, 351, 1, 2, 1, 7, 0],
[146, 12180, 1, 2, 435, 1, 2, 1, 7, 0],
[411, 34440, 1, 3, 861, 1, 2, 1, 7, 0],
[474, 39732, 1, 2, 903, 1, 2, 1, 7, 0],
[2091, 175560, 1, 3, 1463, 1, 2, 1, 7, 0],
[2131, 178920, 1, 3, 2485, 1, 2, 1, 7, 0],
[3404, 285852, 1, 2, 3403, 1, 2, 1, 7, 0],
[5433, 456288, 1, 5, 4753, 1, 2, 1, 7, 0],
[7201, 604800, 1, 7, 2835, 2, 2, 1, 7, 0],
[8589, 721392, 1, 4, 6441, 1, 2, 1, 7, 0],
[11626, 976500, 1, 2, 7875, 1, 2, 1, 7, 0]]

# Build the sample frame once, labelled with the training column names, so the
# fitted pipeline receives the same feature names at prediction time.
samples_df = pd.DataFrame(samples, columns=group_input.columns)

# For each output encoding, train a random forest classifier.
outputs = []
for group_output in [group_output_1, group_output_2]:
    # A fixed seed makes the scores and predictions reproducible across runs.
    rf_pipeline = make_pipeline(StandardScaler(), RandomForestClassifier(random_state=0))

    # Print the mean score over 5-fold cross-validation.
    rf_cv = cross_val_score(rf_pipeline, X=group_input, y=group_output, cv=5)
    print(rf_cv.mean())

    rf_pipeline.fit(X=group_input, y=group_output)

    # Predict every Hurwitz curve in one call. Each prediction is kept as a
    # one-element array so the table displayed later looks the same as when
    # the samples were predicted one at a time.
    predictions = rf_pipeline.predict(samples_df)
    output_i = [predictions[i:i + 1] for i in range(len(predictions))]
    outputs.append(output_i)
    print()

0.9366491700950489

0.8506568307561355



In [27]:
# Display the Hurwitz predictions: one row per output encoding (row 0 is the
# boolean `di == 0` target, row 1 the three-class target) and one column per
# sample curve, in the order the samples were listed.
pd.DataFrame(outputs)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,[True],[False],[True],[True],[True],[False],[True],[False],[False],[True],[False],[False],[False],[True]
1,[0],[-1],[0],[0],[0],[-1],[0],[-1],[-1],[0],[-1],[-1],[-1],[0]


In [28]:
# For the first output (the boolean target, which had the higher
# cross-validation score), look at the feature importances.
# A fresh, seeded pipeline is built here so this cell does not depend on
# whichever pipeline the training loop happened to leave behind.
rf_pipeline = make_pipeline(StandardScaler(), RandomForestClassifier(random_state=0))
rf_pipeline.fit(X=group_input, y=group_output_1)

# make_pipeline names each step after its lower-cased class name.
FI = rf_pipeline.named_steps["randomforestclassifier"].feature_importances_
Fs = group_input.columns

# (feature, importance) pairs, sorted from least to most important.
sorted(zip(Fs, FI), key=lambda v: v[1])

[('L', 0.008478517513147925),
 ('df', 0.03076210704220071),
 ('mro', 0.03205776484534006),
 ('g', 0.03643602688095254),
 ('go', 0.045163522234040625),
 ('ni', 0.08006595785016433),
 ('nic', 0.08411129410583575),
 ('p2', 0.11052923122232702),
 ('nos', 0.1631743191039947),
 ('nes', 0.4092212592019963)]

In [29]:
# Now repeat the analysis with the Hurwitz curves removed from the input data.
# NOTE: this cell rebinds `raw_data`, `group_input`, `group_output_1` and
# `group_output_2`, so from here on they refer to the reduced data set.
raw_data = pd.read_csv("group_action_features_invariants_data_sans_Hurwitz.csv")
group_input = raw_data.drop(columns="di")

group_output_1 = raw_data.di == 0
# `classify_invs` is the function defined in an earlier cell; it is reused
# here instead of being defined a second time.
group_output_2 = raw_data.di.apply(classify_invs)

# Sample frame labelled with the training column names, built once.
samples_df = pd.DataFrame(samples, columns=group_input.columns)

# Keep only the two rows produced by the full-data run, so that re-running
# this cell does not keep appending duplicate rows to `outputs`.
del outputs[2:]

for group_output in [group_output_1, group_output_2]:
    # A fixed seed makes the scores and predictions reproducible across runs.
    rf_pipeline = make_pipeline(StandardScaler(), RandomForestClassifier(random_state=0))

    # Print the mean score over 5-fold cross-validation.
    rf_cv = cross_val_score(rf_pipeline, X=group_input, y=group_output, cv=5)
    print(rf_cv.mean())

    rf_pipeline.fit(X=group_input, y=group_output)

    # Predict every Hurwitz curve in one call, keeping each prediction as a
    # one-element array to match the rows already stored in `outputs`.
    predictions = rf_pipeline.predict(samples_df)
    output_i = [predictions[i:i + 1] for i in range(len(predictions))]
    outputs.append(output_i)
    print()

0.9282475700400228

0.8466809605488852



In [30]:
# Display the Hurwitz predictions again, with the earlier rows kept for
# comparison: rows 0-1 come from the full data set, rows 2-3 from the data set
# with the Hurwitz curves removed (boolean target first, then three-class).
pd.DataFrame(outputs)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,[True],[False],[True],[True],[True],[False],[True],[False],[False],[True],[False],[False],[False],[True]
1,[0],[-1],[0],[0],[0],[-1],[0],[-1],[-1],[0],[-1],[-1],[-1],[0]
2,[False],[False],[True],[True],[True],[False],[True],[False],[False],[True],[False],[True],[False],[True]
3,[0],[0],[0],[0],[0],[-1],[0],[-1],[-1],[0],[-1],[-1],[-1],[0]
