# Importation des bibliothèques

In [2]:
import networkx as nx
import random
import string
from node2vec import Node2Vec
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn.neural_network import MLPClassifier

# Recherche de la composante

In [3]:
import os

# Shell helper to locate a pyspark install (kept commented out):
# !find /usr/local -name "pyspark"

# Point Spark and Java at the local installation, but only when the variables
# are not already exported: a value coming from the environment wins, so the
# notebook is not tied to a single machine's absolute paths.
os.environ.setdefault("SPARK_HOME", "/home/anyes/spark")
os.environ.setdefault("JAVA_HOME", "/usr")

In [4]:
# Main imports
import findspark

# Initialise the Spark environment BEFORE importing pyspark: findspark makes
# the Spark installation importable, so on a fresh kernel where pyspark is not
# otherwise installed the imports below would fail if this ran after them.
findspark.init()

from pyspark.sql import SparkSession
from pyspark import SparkConf

# for dataframe and udf
# NOTE: the wildcard imports are kept because other cells rely on the names
# they expose (e.g. `count`); be aware that wildcard imports can shadow
# Python builtins and each other (later imports win on name collisions).
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.types import *
from datetime import *

import pandas as pd

# Start spark session
# --------------------------
def start_spark():
    """Create (or fetch) the local Spark session used by this notebook.

    Builds a SparkConf with app name "PLDAC" on a ``local[*]`` master, 6G of
    executor and driver memory, an in-memory catalog and the GraphFrames
    package, then applies two session-level SQL settings and prints the
    application id.

    Returns:
        The SparkSession obtained from ``getOrCreate()``.
    """
    settings = [
        ("spark.executor.memory", "6G"),
        ("spark.driver.memory", "6G"),
        ("spark.sql.catalogImplementation", "in-memory"),
        ("spark.jars.packages", "graphframes:graphframes:0.8.3-spark3.5-s_2.12"),
    ]

    conf = SparkConf().setAppName("PLDAC").setMaster("local[*]")
    for key, value in settings:
        conf = conf.set(key, value)

    session = SparkSession.builder.config(conf=conf).getOrCreate()
    context = session.sparkContext
    context.setLogLevel("ERROR")

    # Auto-broadcast join threshold set to -1
    session.conf.set("spark.sql.autoBroadcastJoinThreshold", "-1")

    # Adjust the query execution environment to the size of the cluster (4 cores)
    session.conf.set("spark.sql.shuffle.partitions", "4")

    print("session started, its id is ", context.applicationId)
    return session


spark = start_spark()

24/04/23 23:58:43 WARN Utils: Your hostname, anyes-Latitude-5480 resolves to a loopback address: 127.0.1.1; using 192.168.1.45 instead (on interface wlp2s0)
24/04/23 23:58:43 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


:: loading settings :: url = jar:file:/home/anyes/spark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/anyes/.ivy2/cache
The jars for the packages stored in: /home/anyes/.ivy2/jars
graphframes#graphframes added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-0d6a75ef-1254-4790-8676-444e36026062;1.0
	confs: [default]
	found graphframes#graphframes;0.8.3-spark3.5-s_2.12 in spark-packages
	found org.slf4j#slf4j-api;1.7.16 in central
:: resolution report :: resolve 787ms :: artifacts dl 22ms
	:: modules in use:
	graphframes#graphframes;0.8.3-spark3.5-s_2.12 from spark-packages in [default]
	org.slf4j#slf4j-api;1.7.16 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   2   |   0   |   0   |   0   ||   2   |   0   |
	----------------------------------

session started, its id is  local-1713909531059


In [5]:
# Get the Spark session (an already running session is returned as-is)
spark = SparkSession.builder.appName("Proteines_graph").getOrCreate()

parquet_folder = "./local/data/component_9/graph"

# Glob matching every Snappy-compressed Parquet file in that folder
parquet_files = f"{parquet_folder}/*.snappy.parquet"

# Build the DataFrame from the matched files
reader = spark.read.format("parquet").option("compression", "snappy")
df = reader.load(parquet_files)

# Display the first 10 rows of the DataFrame
df.show(10)

                                                                                

[Stage 2:>                                                          (0 + 1) / 1]

In [5]:
# Number of rows per component, exposed in a column named "count"
df_grouped = df.groupBy("component_id").count()
df_grouped.show(10)

[Stage 3:>                                                          (0 + 4) / 6]

# Classification

In [None]:
# TODO: placeholder - build the networkx graph here
G = None

# Learn node embeddings with Node2Vec
node2vec = Node2Vec(
    G,
    dimensions=64,
    walk_length=30,
    num_walks=200,
    workers=4,
)
model = node2vec.fit(window=10, min_count=1, batch_words=4)

# Learned node embeddings
node_embeddings = model.wv

# TODO: placeholders to fill in before this cell can run
labeled_nodes = None      # list of the nodes that have labels
one_hot_labels = None     # labels converted to one-hot encoded vectors
total_label_count = None  # total number of labels (pfam)

# Features X (one embedding per labeled node) and targets y (one label vector
# per labeled node, all zeros when a node has no entry in one_hot_labels)
feature_rows = [
    node_embeddings.get_vector(str(node))
    for node in labeled_nodes
]
label_rows = [
    one_hot_labels.get(node, [0] * total_label_count)
    for node in labeled_nodes
]
X = np.array(feature_rows)
y = np.array(label_rows)

# Split the data into training and test sets (20% held out for testing)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Build and train the MLP classifier
classifier = MLPClassifier(
    hidden_layer_sizes=(100, 50),
    max_iter=100,
    activation='relu',
    solver='adam',
    random_state=42,
)
classifier.fit(X_train, y_train)

# Predict the labels of the test set
y_pred = classifier.predict(X_test)

# Compute the accuracy on the test set
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Grid search 

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter values explored by the search
param_grid = dict(
    hidden_layer_sizes=[(50,), (100,), (100, 50)],
    activation=['relu', 'tanh'],
    solver=['adam', 'sgd'],
    max_iter=[100, 200],
)

# Grid search with 5-fold cross-validation over an MLP classifier
grid_search = GridSearchCV(
    estimator=MLPClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
)

# Search for the best hyperparameters on the training set
grid_search.fit(X_train, y_train)

# Report the best hyperparameters found
print("Meilleurs hyperparamètres trouvés :", grid_search.best_params_, sep="\n")

# Best model found by the search
best_model = grid_search.best_estimator_

# Predict the test-set labels with the best model
y_pred = best_model.predict(X_test)

# Accuracy of the best model on the test set
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy avec le meilleur modèle:", accuracy)
