In [1]:
import os
import pandas as pd
import numpy as np


main_publisher = 'MIT'

script_dir = os.path.dirname(os.path.realpath('__file__'))
path = os.path.join(script_dir, '../Data/' + main_publisher + '/')

df_chapters = pd.read_csv(path + 'chapters.csv', delimiter = ',')
df_chapters_embeddings = pd.read_csv(path + 'embeddings_chapters.csv', delimiter = '|', index_col=0)
df_concepts = pd.read_csv(path + 'concepts_bis.csv', delimiter = '|')
df_concepts_embeddings = pd.read_csv(path + 'embeddings_concepts_bis.csv', delimiter = '|', index_col=0)
df_classes = pd.read_csv(path + 'classes_bis.csv', delimiter = '|')
df_classes_embeddings = pd.read_csv(path + 'embeddings_classes_bis.csv', delimiter = '|', index_col=0)
df_precedences_episodes = pd.read_csv(path + 'precedences_episodes.csv', delimiter = '|')
df_precedences_series = pd.read_csv(path + 'precedences_series.csv', delimiter = '|')

df_concepts['Concept'] = df_concepts['Concept'].apply(lambda x : x.split('/')[-1])

df_classes = df_classes.dropna()
print(f'{df_chapters["Cid"].isna().sum().sum():04d} NaN values in chapters.')
print(f'{df_concepts.isna().sum().sum():04d} Nan values in concepts.')
print(f'{df_classes.isna().sum().sum():04d} Nan values in classes.')
print(f'{df_precedences_episodes.isna().sum().sum():04d} Nan values in episdes precedences.')
print(f'{df_precedences_series.isna().sum().sum():04d} Nan values in series precedences.')

0000 NaN values in chapters.
0000 Nan values in concepts.
0000 Nan values in classes.
0000 Nan values in episdes precedences.
0000 Nan values in series precedences.


In [2]:
from utils import *

unique_oer_id = id_mapper(df_chapters['Cid'], 'OER')
unique_concept_id =  id_mapper(df_concepts['Concept'], 'Concept')
unique_class_id =  id_mapper(df_classes['Class'], 'Class')

In [3]:
oer_covers_concept_subject = edge_construction(df1 = df_concepts, df2 = unique_oer_id, col = 'mappedID', 
                                       how = 'left', right_on = 'OER')
oer_covers_concept_pr = edge_construction(df1 = df_concepts, df2 = unique_oer_id, col = 'PR', 
                                          how = 'right', right_on = 'OER')
oer_covers_concept_object = edge_construction(df1 = df_concepts, df2 = unique_concept_id, col = 'mappedID', 
                                       how = 'left', right_on = 'Concept')

oer_before_oer_ep_subject = edge_construction(df1 = df_precedences_episodes, df2 = unique_oer_id, col = 'mappedID', 
                                   how = 'left', left_on = 'Before', right_on = 'OER')
oer_before_oer_ep_object = edge_construction(df1 = df_precedences_episodes, df2 = unique_oer_id, col = 'mappedID', 
                                   how = 'left', left_on = 'After', right_on = 'OER')
oer_before_oer_sr_subject = edge_construction(df1 = df_precedences_series, df2 = unique_oer_id, col = 'mappedID', 
                                   how = 'left', left_on = 'Before', right_on = 'OER')
oer_before_oer_sr_object = edge_construction(df1 = df_precedences_series, df2 = unique_oer_id, col = 'mappedID', 
                                   how = 'left', left_on = 'After', right_on = 'OER')

concept_belongs_class_subject = edge_construction(df1 = df_classes, df2 = unique_concept_id, col = 'mappedID', 
                                   how = 'left', left_on = 'Concept', right_on = 'Concept')
concept_belongs_class_object = edge_construction(df1 = df_classes, df2 = unique_class_id, col = 'mappedID', 
                                   how = 'left', left_on = 'Class', right_on = 'Class')

oer_covers_concept = torch.stack([oer_covers_concept_subject, oer_covers_concept_object], dim = 0).long()
oer_covers_concept_rev = torch.stack([oer_covers_concept_object, oer_covers_concept_subject], dim = 0).long()
oer_before_oer_ep = torch.stack([oer_before_oer_ep_subject, oer_before_oer_ep_object], dim = 0).long()
oer_before_oer_sr = torch.stack([oer_before_oer_sr_subject, oer_before_oer_sr_object], dim = 0).long()
concept_belongs_class = torch.stack([concept_belongs_class_subject, concept_belongs_class_object], dim = 0).long()
concept_belongs_class_rev = torch.stack([concept_belongs_class_object, concept_belongs_class_subject], dim = 0).long()
print(oer_covers_concept.shape)
print(oer_covers_concept_rev.shape)
print(oer_before_oer_ep.shape)
print(oer_before_oer_sr.shape)
print(concept_belongs_class.shape)
print(concept_belongs_class_rev.shape)

torch.Size([2, 24347])
torch.Size([2, 24347])
torch.Size([2, 2928])
torch.Size([2, 1407])
torch.Size([2, 72048])
torch.Size([2, 72048])


In [4]:
import fasttext

model_fasttext = fasttext.load_model("cc.en.300.bin")



In [5]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

stop_words = set(stopwords.words('english'))

sentence_embeddings = []
for sentences in df_chapters.Text.values:
    word_embs = []
    sentences = re.sub(r'[^\w\s]', '', sentences)
    words = word_tokenize(sentences)
    words = [w for w in words if not w.lower() in stop_words]
    for word in words:
        word_embedding = model_fasttext.get_word_vector(word)
        word_embs.append(word_embedding)
    if word_embs:
        word_embs = sum(word_embs) / len(word_embs)
    else:
        # Handle the case when none of the words are in the model's vocabulary
        word_embs = None
    sentence_embeddings.append(word_embs)

v1 = sentence_embeddings[0]
v2 = sentence_embeddings[1]
cos_sim = np.dot(v1, v2) / (np.linalg.norm(v1, 2) * np.linalg.norm(v2, 2))
print(cos_sim)
print(len(sentence_embeddings))

0.94540006
4407


In [6]:
i = 0
chapters_embeddings_tmp = {}
chapters_r = range(len(df_chapters['Cid'].unique()))
chapters_embeddings_bis = np.zeros(shape=(len(chapters_r), 768))
for r in chapters_r:
    chapters_embeddings_tmp[r] = [float(f) for f in sentence_embeddings[r]]
    for a in range(len(chapters_embeddings_tmp[r])):
            chapters_embeddings_bis[i][a] = chapters_embeddings_tmp[r][a]
    i += 1
chapters_embeddings = torch.from_numpy(chapters_embeddings_bis).to(torch.float32)

In [7]:
#chapters_embeddings_tmp = {}
concepts_embeddings_tmp = {} 
classes_embeddings_tmp = {}

#chapters_r = range(len(df_chapters['Cid'].unique()))
concepts_c = range(len(df_concepts['Concept'].unique()))
classes_c = range(len(df_classes['Class'].unique()))

#chapters_embeddings = np.zeros(shape=(len(chapters_r), 768))
concepts_embeddings = np.zeros(shape=(len(concepts_c), 768))
classes_embeddings = np.zeros(shape=(len(classes_c), 768))


'''i = 0
for r in chapters_r:
    chapters_embeddings_tmp[r] = list(filter(None, df_chapters_embeddings['BERT'][r].strip("[]\n").replace("'","").split(" ")))
    chapters_embeddings_tmp[r] = [float(f) for f in chapters_embeddings_tmp[r]]
    for a in range(len(chapters_embeddings_tmp[r])):
            chapters_embeddings[i][a] = chapters_embeddings_tmp[r][a]
    i += 1'''

i = 0
for r in concepts_c:
    concepts_embeddings_tmp[r] = list(filter(None, df_concepts_embeddings['BERT'][r].strip("[]\n").replace("'","").split(" ")))
    concepts_embeddings_tmp[r] = [float(f) for f in concepts_embeddings_tmp[r]]
    for a in range(len(concepts_embeddings_tmp[r])):
            concepts_embeddings[i][a] = concepts_embeddings_tmp[r][a]
    i += 1   

i = 0
for r in classes_c:
    classes_embeddings_tmp[r] = list(filter(None, df_classes_embeddings['BERT'][r].strip("[]\n").replace("'","").split(" ")))
    classes_embeddings_tmp[r] = [float(f) for f in classes_embeddings_tmp[r]]
    for a in range(len(classes_embeddings_tmp[r])):
            classes_embeddings[i][a] = classes_embeddings_tmp[r][a]
    i += 1

#chapters_embeddings = torch.from_numpy(chapters_embeddings).to(torch.float32)
concepts_embeddings = torch.from_numpy(concepts_embeddings).to(torch.float32)
classes_embeddings = torch.from_numpy(classes_embeddings).to(torch.float32)

In [8]:
from torch_geometric.data import HeteroData
import torch_geometric.transforms as T

data = HeteroData()
data['OER'].node_id = torch.tensor(unique_oer_id['mappedID'].values)
data['OER'].x = chapters_embeddings
data['Concept'].node_id = torch.tensor(unique_concept_id['mappedID'].values)
data['Concept'].x = concepts_embeddings
data['Class'].node_id = torch.tensor(unique_class_id['mappedID'].values)
data['Class'].x = classes_embeddings
data['OER', 'covers', 'Concept'].edge_index = oer_covers_concept
data['Concept', 'rev_covers', 'OER'].edge_index = oer_covers_concept_rev

data['OER', 'covers', 'Concept'].edge_attr = oer_covers_concept_pr
print(oer_before_oer_ep.shape)
data['OER', 'before_sr', 'OER'].edge_index = oer_before_oer_sr
data['OER', 'before_ep', 'OER'].edge_index = oer_before_oer_ep
data['Concept', 'belongs', 'Class'].edge_index = concept_belongs_class
data['Class', 'rev_belongs', 'Concept'].edge_index = concept_belongs_class_rev

#data = T.ToUndirected()(data)
data.validate()
print(data)

torch.Size([2, 2928])
HeteroData(
  [1mOER[0m={
    node_id=[4407],
    x=[4407, 768]
  },
  [1mConcept[0m={
    node_id=[7970],
    x=[7970, 768]
  },
  [1mClass[0m={
    node_id=[302],
    x=[302, 768]
  },
  [1m(OER, covers, Concept)[0m={
    edge_index=[2, 24347],
    edge_attr=[24732]
  },
  [1m(Concept, rev_covers, OER)[0m={ edge_index=[2, 24347] },
  [1m(OER, before_sr, OER)[0m={ edge_index=[2, 1407] },
  [1m(OER, before_ep, OER)[0m={ edge_index=[2, 2928] },
  [1m(Concept, belongs, Class)[0m={ edge_index=[2, 72048] },
  [1m(Class, rev_belongs, Concept)[0m={ edge_index=[2, 72048] }
)


  from .autonotebook import tqdm as notebook_tqdm


In [9]:
import random

def seed_everything(seed=0):                                                  
       random.seed(seed)                                                            
       torch.manual_seed(seed)                                                      
       torch.cuda.manual_seed_all(seed)                                             
       np.random.seed(seed)                                                         
       os.environ['PYTHONHASHSEED'] = str(seed)                                     
       torch.backends.cudnn.deterministic = True                                    
       torch.backends.cudnn.benchmark = False



In [10]:
agnostic = False
if agnostic:
    num_val = 0.5
    num_test = 0.5
else:
    num_val = 0.1
    num_test = 0.1
seed_everything()
transform = T.RandomLinkSplit(
    num_val = num_val,
    num_test = num_test,
    disjoint_train_ratio = 0.0,
    neg_sampling_ratio = 0.8,
    add_negative_train_samples = True,
    edge_types=('OER', 'before_sr', 'OER')
)

train_data, val_data, test_data = transform(data)
print(f'{len(train_data["OER", "before_sr", "OER"].edge_label.detach().numpy())}\t Edges for training')
print(f'{len(val_data["OER", "before_sr", "OER"].edge_label.detach().numpy())}\t Edges for validation')
print(f'{len(test_data["OER", "before_sr", "OER"].edge_label.detach().numpy())}\t Edges for testing')
print(train_data["OER", "before_sr", "OER"].edge_label_index[0][:5])
print(val_data["OER", "before_sr", "OER"].edge_label_index[0][:5])
print(test_data["OER", "before_sr", "OER"].edge_label_index[0][:5])
print(train_data["OER", "before_sr", "OER"].edge_label_index[1][:5])
print(val_data["OER", "before_sr", "OER"].edge_label_index[1][:5])
print(test_data["OER", "before_sr", "OER"].edge_label_index[1][:5])

2028	 Edges for training
252	 Edges for validation
252	 Edges for testing
tensor([1247, 2703, 4364, 4040, 3726])
tensor([1706,  134, 2279, 3486, 3509])
tensor([3269, 2685,  776, 3197, 2997])
tensor([1248, 2704, 4365, 4041, 3727])
tensor([1707,  135, 2280, 3487, 3510])
tensor([3270, 2686,  777, 3198, 2998])


In [11]:
seed_everything()
cross_val_data = {}
cross_val_data["OER", "before_sr", "OER"] = {}
print(len(train_data["OER", "before_sr", "OER"].edge_label_index[0]) + 
      len(val_data["OER", "before_sr", "OER"].edge_label_index[0]) +
      len(test_data["OER", "before_sr", "OER"].edge_label_index[0]))

cross_val_data["OER", "before_sr", "OER"]["edge_label"] = torch.cat(
    [train_data["OER", "before_sr", "OER"].edge_label, 
     val_data["OER", "before_sr", "OER"].edge_label,
     test_data["OER", "before_sr", "OER"].edge_label], 
    dim = 0).long()
cross_val_data["OER", "before_sr", "OER"]["edge_label_index"] = torch.cat(
    [train_data["OER", "before_sr", "OER"].edge_label_index, 
     val_data["OER", "before_sr", "OER"].edge_label_index,
     test_data["OER", "before_sr", "OER"].edge_label_index], 
    dim = 1).long()
cross_val_data["OER", "before_sr", "OER"]["edge_index"] = torch.cat(
    [train_data["OER", "before_sr", "OER"].edge_index, 
     val_data["OER", "before_sr", "OER"].edge_index,
     test_data["OER", "before_sr", "OER"].edge_index],
    dim = 1).long()
print(len(cross_val_data["OER", "before_sr", "OER"]["edge_label"]))
print(len(cross_val_data["OER", "before_sr", "OER"]["edge_label_index"][0]))
print(len(cross_val_data["OER", "before_sr", "OER"]["edge_index"][0]))

2532
2532
2532
3521


In [12]:
seed_everything()
num_samples = len(cross_val_data["OER", "before_sr", "OER"]["edge_label"])
shuffled_index = np.arange(num_samples)
np.random.shuffle(shuffled_index)
cross_val_data["OER", "before_sr", "OER"]["edge_label_index"][0] = cross_val_data["OER", "before_sr", "OER"]["edge_label_index"][0][shuffled_index]
cross_val_data["OER", "before_sr", "OER"]["edge_label_index"][1] = cross_val_data["OER", "before_sr", "OER"]["edge_label_index"][1][shuffled_index]
cross_val_data["OER", "before_sr", "OER"]["edge_label"] = cross_val_data["OER", "before_sr", "OER"]["edge_label"][shuffled_index]
print(cross_val_data["OER", "before_sr", "OER"]["edge_label_index"][0])
print(cross_val_data["OER", "before_sr", "OER"]["edge_label_index"][1])
print(cross_val_data["OER", "before_sr", "OER"]["edge_label"])

tensor([  54, 3196, 3431,  ..., 3746, 2287, 3773])
tensor([1322, 4084, 3432,  ..., 3747, 2288, 3159])
tensor([0, 0, 1,  ..., 1, 1, 0])


In [13]:
cv = 5
chunk_size = int(len(cross_val_data["OER", "before_sr", "OER"]["edge_label"]) / cv)
cross_val_chunks = []
for n in range(cv):
    cross_val_chunk = {}
    cross_val_chunk["OER", "before_sr", "OER"] = {}
    cross_val_chunk["OER", "before_sr", "OER"]["edge_label"] = {}
    begin = n * chunk_size
    if n == cv - 1:
        end = len(cross_val_data["OER", "before_sr", "OER"]["edge_label"])
    else :
        end = (n+1) * chunk_size
    cross_val_chunk["OER", "before_sr", "OER"]["edge_label"] = cross_val_data["OER", "before_sr", "OER"]["edge_label"][begin : end]
    cross_val_chunk["OER", "before_sr", "OER"]["edge_label_index"] = []
    cross_val_chunk["OER", "before_sr", "OER"]["edge_label_index"].append(cross_val_data["OER", "before_sr", "OER"]["edge_label_index"][0][begin : end])
    cross_val_chunk["OER", "before_sr", "OER"]["edge_label_index"].append(cross_val_data["OER", "before_sr", "OER"]["edge_label_index"][1][begin : end])
    cross_val_chunks.append(cross_val_chunk)
for n in range(cv):
    print(len(cross_val_chunks[n]["OER", "before_sr", "OER"]["edge_label"]))
    print(len(cross_val_chunks[n]["OER", "before_sr", "OER"]["edge_label_index"][0]))
    print(len(cross_val_chunks[n]["OER", "before_sr", "OER"]["edge_label_index"][1]))
    print("------------------------------")

506
506
506
------------------------------
506
506
506
------------------------------
506
506
506
------------------------------
506
506
506
------------------------------
508
508
508
------------------------------


In [14]:
cross_val_data_train = []
cross_val_data_test = []
for n in range(cv) :
    cross_val_data_train_chunk = {}
    cross_val_data_train_chunk["OER", "before_sr", "OER"] = {}
    cross_val_data_train_chunk["OER", "before_sr", "OER"]["edge_label"] = torch.cat(
    [cross_val_chunks[(n+1)%cv]["OER", "before_sr", "OER"]["edge_label"], 
     cross_val_chunks[(n+2)%cv]["OER", "before_sr", "OER"]["edge_label"],
     cross_val_chunks[(n+3)%cv]["OER", "before_sr", "OER"]["edge_label"],
     cross_val_chunks[(n+4)%cv]["OER", "before_sr", "OER"]["edge_label"]
     ], 
    dim = 0).long()
    cross_val_data_test_chunk = {}
    cross_val_data_test_chunk["OER", "before_sr", "OER"] = {}
    cross_val_data_test_chunk["OER", "before_sr", "OER"]["edge_label"] = cross_val_chunks[n]["OER", "before_sr", "OER"]["edge_label"]

    cross_val_data_train_chunk["OER", "before_sr", "OER"]["edge_label_index"] = []
    cross_val_data_train_chunk["OER", "before_sr", "OER"]["edge_label_index"].append(torch.cat(
    [cross_val_chunks[(n+1)%cv]["OER", "before_sr", "OER"]["edge_label_index"][0], 
     cross_val_chunks[(n+2)%cv]["OER", "before_sr", "OER"]["edge_label_index"][0],
     cross_val_chunks[(n+3)%cv]["OER", "before_sr", "OER"]["edge_label_index"][0],
     cross_val_chunks[(n+4)%cv]["OER", "before_sr", "OER"]["edge_label_index"][0]
     ], 
    dim = 0).long())
    cross_val_data_train_chunk["OER", "before_sr", "OER"]["edge_label_index"].append(torch.cat(
    [cross_val_chunks[(n+1)%cv]["OER", "before_sr", "OER"]["edge_label_index"][1], 
     cross_val_chunks[(n+2)%cv]["OER", "before_sr", "OER"]["edge_label_index"][1],
     cross_val_chunks[(n+3)%cv]["OER", "before_sr", "OER"]["edge_label_index"][1],
     cross_val_chunks[(n+4)%cv]["OER", "before_sr", "OER"]["edge_label_index"][1]
     ], 
    dim = 0).long())
    cross_val_data_test_chunk["OER", "before_sr", "OER"]["edge_label_index"] = []
    cross_val_data_test_chunk["OER", "before_sr", "OER"]["edge_label_index"].append(
        cross_val_chunks[n]["OER", "before_sr", "OER"]["edge_label_index"][0])
    cross_val_data_test_chunk["OER", "before_sr", "OER"]["edge_label_index"].append(
        cross_val_chunks[n]["OER", "before_sr", "OER"]["edge_label_index"][1])
    
    cross_val_data_train.append(cross_val_data_train_chunk)
    cross_val_data_test.append(cross_val_data_test_chunk)

len(cross_val_data_train)

5

In [15]:
hidden_channels = [16, 32, 64, 128]
num_layers = [1, 2, 4, 6]
epochs = [50, 100, 150, 200]
learning_rates = [0.001, 0.01, 0.1]
entity_features = 768

parameters = [{
    "epochs" : e,
    "hidden_channels" : h,
    "num_layers" : n,
    "lr" : l,
    "entity_features" : entity_features
} for h in hidden_channels
    for l in learning_rates
    for n in num_layers
    for e in epochs]

assert len(parameters) == len(hidden_channels) * len(num_layers) * len(epochs) * len(learning_rates), "ERROR constructing parameters variable"
print("Number of parameter combinations : ", len(parameters))

Number of parameter combinations :  192


In [16]:
hidden_channels_selected = 32
entity_features = 768
num_layers_selected = 6
epochs_selected = 300 #300 the best
learning_rates_selected = 0.001

selected_params = [{
    'epochs': epochs_selected, 
    'hidden_channels': hidden_channels_selected, 
    'num_layers': num_layers_selected, #8 is too much => generated NaN values in node attributes
    'lr': learning_rates_selected, 
    'entity_features': entity_features
}]

In [17]:
X_train_index = []
X_train = []
X_test_index = []
X_test = []
Y_train = []
Y_test = []
for n in range(cv):
    X_train_index.append(cross_val_data_train[n]["OER", "before_sr", "OER"]["edge_label_index"])
    X_train.append(np.concatenate([chapters_embeddings[X_train_index[n][0]],chapters_embeddings[X_train_index[n][1]]], axis=1))
    Y_train.append(cross_val_data_train[n]["OER", "before_sr", "OER"]["edge_label"])
    X_test_index.append(cross_val_data_test[n]["OER", "before_sr", "OER"]["edge_label_index"])
    X_test.append(np.concatenate([chapters_embeddings[X_test_index[n][0]],chapters_embeddings[X_test_index[n][1]]], axis=1))
    Y_test.append(cross_val_data_test[n]["OER", "before_sr", "OER"]["edge_label"])

print(X_train[0].shape, Y_train[0].shape)
print(X_test[0].shape, Y_test[0].shape)

(2026, 1536) torch.Size([2026])
(506, 1536) torch.Size([506])


In [24]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline
from itertools import product
from sklearn.svm import SVC
import pandas as pd

def classify_cv_predefined(X_train, Y_train, X_test, Y_test, cv):

    df_results = pd.DataFrame()
    for n in range(cv):
        
        X_train_batch = X_train[n]
        Y_train_batch = Y_train[n]
        X_test_batch = X_test[n]
        Y_test_batch = Y_test[n]
        
        '''print("Linear Regression :")
        results = {}
        logistic_regression = LogisticRegression(max_iter=300).fit(X_train_batch, Y_train_batch)
        Y_pred = logistic_regression.predict(X_test_batch)
        accuracy = accuracy_score(Y_test_batch, Y_pred)
        results['Model'] = "Linear Regression"
        results['Accuracy'] = accuracy
        results['Params'] = "None"
        df_results = pd.concat([df_results, pd.DataFrame([results])], ignore_index = True)'''

        print("SVM :")
        param_svm = {
            'C': [1, 10, 100], 
            'gamma': [0.1, 0.01, 0.001], 
            'kernel': ['linear']
        }
        param_svm = list(product(*param_svm.values()))
        for params in param_svm:
            results = {}
            C, gamma, kernel = params
            svm = make_pipeline(StandardScaler(), SVC(kernel = 'linear')).fit(X_train_batch, Y_train_batch)
            Y_pred = svm.predict(X_test_batch)
            accuracy = accuracy_score(Y_test_batch, Y_pred)
            results['Model'] = "SVM"
            results['Accuracy'] = accuracy
            results['Params'] = str(params)
            df_results = pd.concat([df_results, pd.DataFrame([results])], ignore_index = True)

        '''print("RBF :")
        param_rbf = {
            'C': [1, 10, 100], 
            'gamma': [0.1, 0.01, 0.001], 
            'kernel': ['rbf']
        }
        param_rbf = list(product(*param_rbf.values()))
        for params in param_rbf:
            results = {}
            C, gamma, kernel = params
            rbf = make_pipeline(StandardScaler(), SVC(gamma = gamma, C = C, kernel = kernel)).fit(X_train_batch, Y_train_batch)
            Y_pred = rbf.predict(X_test_batch)
            accuracy = accuracy_score(Y_test_batch, Y_pred)
            results['Model'] = "RBF"
            results['Accuracy'] = accuracy
            results['Params'] = str(params)
            df_results = pd.concat([df_results, pd.DataFrame([results])], ignore_index = True)'''

        '''print("Random Forest :")
        param_random_forest = {
            'random_state' : [0],
            'max_depth': [1, 2, 5, 10], 
            'n_estimators' : [50, 100, 150, 200],
            'criterion': ['gini', 'entropy', 'log_loss']            
        }
        param_random_forest = list(product(*param_random_forest.values()))
        for params in param_random_forest:
            results = {}
            random_state, max_depth, n_estimators, criterion = params
            rf = RandomForestClassifier(n_estimators = n_estimators, criterion = criterion, random_state = random_state, max_depth = max_depth ).fit(X_train_batch, Y_train_batch)
            Y_pred = rf.predict(X_test_batch)
            accuracy = accuracy_score(Y_test_batch, Y_pred)
            results['Model'] = "RF"
            results['Accuracy'] = accuracy
            results['Params'] = str(params)
            df_results = pd.concat([df_results, pd.DataFrame([results])], ignore_index = True)'''

    return df_results

In [25]:
#from classification import classify_cv_predefined

df_fasttext = classify_cv_predefined(X_train, Y_train, X_test, Y_test, cv)

SVM :
SVM :
SVM :
SVM :
SVM :


In [29]:
df_fasttext[(df_fasttext['Model'] == "SVM")].sort_values(by = ['Accuracy'], ascending = False).head(20)

Unnamed: 0,Model,Accuracy,Params
44,SVM,0.614173,"(100, 0.001, 'linear')"
43,SVM,0.614173,"(100, 0.01, 'linear')"
42,SVM,0.614173,"(100, 0.1, 'linear')"
41,SVM,0.614173,"(10, 0.001, 'linear')"
40,SVM,0.614173,"(10, 0.01, 'linear')"
39,SVM,0.614173,"(10, 0.1, 'linear')"
38,SVM,0.614173,"(1, 0.001, 'linear')"
37,SVM,0.614173,"(1, 0.01, 'linear')"
36,SVM,0.614173,"(1, 0.1, 'linear')"
31,SVM,0.600791,"(10, 0.01, 'linear')"


In [32]:
len(df_fasttext)

5

In [31]:
df_fasttext_best = df_fasttext[(df_fasttext['Model'] == "SVM") & (df_fasttext['Params'] == "(100, 0.001, 'linear')")]
print(round(df_fasttext_best["Accuracy"].values.min(), 2))
print(round(df_fasttext_best["Accuracy"].values.mean(), 2))
print(round(df_fasttext_best["Accuracy"].values.max(), 2))

0.57
0.59
0.61
