In [1]:
############################################
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

############################################
# #!pip install  pillow

############################################
import collections.abc
import csv
import json
import os
import random
import string
import time
from sys import getsizeof

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn import cluster , metrics
from sklearn import manifold, decomposition
from sklearn import preprocessing
from sklearn.cluster import AgglomerativeClustering

############################################
import seaborn as sns
#%matplotlib inline
#%matplotlib notebook
sns.set()

###########################################
import PIL
from PIL import ImageFilter
import keras

###########################################
#!pip install opencv-python
import cv2

##########################################
# Directory that contains the photo files (default `path` of `load_images`).
# The historical absolute path stays the default; set the PATH_IMAGE
# environment variable to run the notebook on another machine without
# editing the code.
PATH_IMAGE = os.environ.get(
    "PATH_IMAGE",
    "/media/brice_kengni_zanguim/Samsung_T5/Téléchargements/photos",
)

In [2]:
def get_nested_value(d, key):
    """Return the item of the nested dictionary `d` addressed by a flattened key.

    The key uses the dotted form produced by `get_column_names`.

    Example:
        d = {
            'a': {
                'b': 2,
                'c': 3,
                },
        }
        key = 'a.b'
        will return: 2

    Args:
        d: (possibly nested) mapping to read from.
        key: dotted path; every '.' descends one nesting level.

    Returns:
        The addressed value, or None when any path component is missing or
        when an intermediate value is not a mapping (e.g. `{'a': None}`
        queried with 'a.b').
    """
    node = d
    for part in key.split('.'):
        # A scalar / None in the middle of the path means the column does not
        # exist for this record; report it as missing instead of raising.
        if not isinstance(node, collections.abc.Mapping) or part not in node:
            return None
        node = node[part]
    return node

def get_row(line_contents, column_names):
    """Return a csv compatible row given column names and a dict.

    Args:
        line_contents: one decoded JSON record (nested dict).
        column_names: iterable of flattened (dotted) column names.

    Returns:
        A list of strings, one per column name:
          * '' when the column is missing (or its value is None),
          * the plain formatting of the value for int / float (bool included),
          * the `b'...'` representation of the UTF-8 bytes for everything else.

    The `b'...'` rendering of text is kept on purpose: `bytes_to_str` strips
    exactly that two-character prefix and the trailing quote afterwards.
    """
    row = []
    for column_name in column_names:
        line_value = get_nested_value(line_contents, column_name)

        if line_value is None:
            row.append('')
        elif isinstance(line_value, (float, int)):
            row.append('{0}'.format(line_value))
        elif isinstance(line_value, bytes):
            # bytes has no .encode(); it already formats as b'...'.
            row.append('{0}'.format(line_value))
        else:
            # Lists, dicts and other objects have no .encode(); go through
            # their string form so they are written the same way as text.
            text = line_value if isinstance(line_value, str) else str(line_value)
            # 'surrogateescape' mirrors the error handler used when the JSON
            # file is read in `read_and_write_file`, so bytes that were
            # undecodable there do not raise here.
            row.append('{0}'.format(text.encode('utf-8', 'surrogateescape')))

    return row

def read_and_write_file(json_file_path, csv_file_path, column_names):
    """Read in the json dataset file and write it out to a csv file, given the column names.

    Args:
        json_file_path: path of a JSON-lines file (one JSON document per line).
        csv_file_path: path of the CSV file to create (overwritten if present).
        column_names: flattened column names; written as the header row and
            used by `get_row` to extract each record's values.
    """
    # newline='' is required by the csv module so that it controls the line
    # endings itself (otherwise blank rows appear on Windows); the encoding is
    # fixed so the result does not depend on the platform locale.
    with open(csv_file_path, 'w+', newline='', encoding='utf-8') as fout:
        csv_file = csv.writer(fout)
        csv_file.writerow(column_names)
        with open(json_file_path, 'r', encoding="UTF-8", errors='surrogateescape') as fin:
            for line in fin:
                if not line.strip():
                    # Blank (e.g. trailing) lines are not JSON documents.
                    continue
                line_contents = json.loads(line)
                csv_file.writerow(get_row(line_contents, column_names))
        
def get_superset_of_column_names_from_file(json_file_path):
    """Read in the json dataset file and return the superset of column names.

    Args:
        json_file_path: path of a JSON-lines file (one JSON document per line).

    Returns:
        A set holding every flattened column name found in at least one record.
    """
    column_names = set()
    with open(json_file_path, encoding='utf-8') as fin:
        for line in fin:
            if not line.strip():
                # Blank (e.g. trailing) lines are not JSON documents.
                continue
            line_contents = json.loads(line)
            # Updating a set from a dict adds the dict's keys.
            column_names.update(get_column_names(line_contents))
    return column_names

def get_column_names(line_contents, parent_key=''):
    """Return a dict mapping flattened key names to their leaf values.

    Example:
        line_contents = {
            'a': {
                'b': 2,
                'c': 3,
                },
        }
        will return: {'a.b': 2, 'a.c': 3}
    The keys will be the column names for the eventual csv file.

    Args:
        line_contents: (possibly nested) dict to flatten.
        parent_key: dotted prefix of the enclosing levels; callers normally
            leave the default, it is filled in by the recursion.

    Returns:
        dict of `dotted.key -> value` for every non-mapping value. A nested
        mapping that is empty contributes no column.
    """
    column_names = {}
    for k, v in line_contents.items():
        column_name = "{0}.{1}".format(parent_key, k) if parent_key else k
        # The ABCs live in collections.abc; the `collections.MutableMapping`
        # alias was removed in Python 3.10.
        if isinstance(v, collections.abc.MutableMapping):
            column_names.update(get_column_names(v, column_name))
        else:
            column_names[column_name] = v
    return column_names

def sampling_data( file_name_path = None, datas =None , proportion= 1 ) :
    """Return a random sample holding `proportion` percent of the rows.

    Args:
        file_name_path: path of a CSV file with a header line. When given it
            takes precedence over `datas` and only the sampled rows are read.
        datas: DataFrame to sample from when no file path is given.
        proportion: percentage (0-100) of the rows to keep.

    Returns:
        A new DataFrame with int(n * proportion / 100) rows, in their original
        order. The selection uses the `random` module, so seed it for
        reproducible samples.

    Raises:
        ValueError: if `proportion` is outside [0, 100] or if neither
            `file_name_path` nor `datas` is provided.
    """
    if not 0 <= proportion <= 100 :
        raise ValueError(f"`proportion` must be a percentage between 0 and 100, got {proportion!r}")

    if file_name_path :
        # Count the data rows (header excluded); the handle is closed right away.
        with open(file_name_path) as fin :
            n = sum(1 for _ in fin) - 1
        s = int(n*proportion/100)
        # Row 0 is the header, so only rows 1..n are candidates for skipping.
        skip = sorted(random.sample(range(1, n+1), n-s))

        return pd.read_csv(file_name_path, skiprows=skip)

    if datas is None :
        raise ValueError("either `file_name_path` or `datas` must be provided")

    n = datas.shape[0]
    s = int(n*proportion/100)
    skip = random.sample(list(datas.index), n-s)

    # DataFrame.drop returns a new object, the input frame is left untouched.
    return datas.drop( index = skip )
        
def bytes_to_str( data , columns ) :
    """Strip the `b'` prefix and the closing quote from the given text columns.

    Meant for columns whose cells look like "b'some text'" (the form written
    by `get_row`): the first two characters and the last one are removed.

    Args:
        data: input DataFrame (not modified).
        columns: iterable of column names to clean.

    Returns:
        A copy of `data` with the listed columns cleaned. Cells that are not
        str/bytes (typically NaN coming from empty CSV fields) are kept as-is
        instead of raising.
    """
    data = data.copy()
    for col in columns :
        data[col] = data[col].map(
            lambda value : value[2:-1] if isinstance(value, (str, bytes)) else value
        )
    return data

def document_frequence ( texts ) :
    """Return the frequency of the words/tokens of a single text/document.

    `texts` is either a list of tokens, used as-is, or anything else, which is
    first tokenised with `nltk.tokenize.word_tokenize`.
    """
    tokens = texts if type(texts) == list else nltk.tokenize.word_tokenize(texts)
    return nltk.FreqDist(tokens)

def corpus_frequence ( data , col) :
    """Return the frequency of the words/tokens over the whole corpus.

    The per-document frequencies of every cell of `data[col]` (computed by
    `document_frequence`) are summed into one counter. When `col` is not a
    column of `data`, a message is printed and None is returned.
    """
    data = data.copy()

    # Input validation: the column has to exist.
    if col in data.columns :
        total = nltk.Counter()
        for document in data[col] :
            total += document_frequence(texts = document)
        return total

    print( f"La feature `{col}` que vous avez fournie n'est pas valide" )
    return

def traitement_de_texte(data, col, langage="english", to_minuscule=True, rm_ponctuation=True, rm_number =True, tokenize_sentence=True, \
                        rm_stop_words=True, lemmatize_sentence= True, stemming_sentence=True, remove_most_frequent = (False , 200) ,\
                        remove_less_frequent = (True , 10) ,token_min_len = 3, complete_stopworld =True ) :
    """Clean and normalise the text column `col` of `data`; return a processed copy.

    The steps run in this fixed order, each one controlled by its flag:
    lower-casing, punctuation removal, digit removal, tokenisation, stop-word
    removal, lemmatisation (or stemming), short-token removal, removal of the
    most / least frequent tokens, and finally re-assembly of each document
    into a space separated string that keeps only tokens found in the
    language dictionary or that are not purely alphabetic.

    Args:
        data: input DataFrame (not modified).
        col: name of the text column to process.
        langage: "english" or "french".
        to_minuscule: lower-case the text.
        rm_ponctuation: remove the characters of `string.punctuation`.
        rm_number: remove digit characters.
        tokenize_sentence: tokenise with `nltk.tokenize.word_tokenize`.
            NOTE(review): the later steps iterate over each cell's elements,
            so they presumably expect tokenised lists - confirm before
            disabling this flag.
        rm_stop_words: drop the stop words.
        lemmatize_sentence: lemmatise the tokens.
        stemming_sentence: stem the tokens; only applied when
            `lemmatize_sentence` is False.
        remove_most_frequent: (enabled, k) - drop the k most frequent tokens
            of the corpus.
        remove_less_frequent: (enabled, pct) - drop the tokens whose corpus
            count is <= pct percent of the number of rows.
        token_min_len: minimal length of a kept token.
        complete_stopworld: for English, extend the NLTK stop words with an
            extra hand-written list.

    Returns:
        The processed copy with two extra columns, "Nb_Tokens_uniques" and
        "Nb_Tokens" (both computed before the final dictionary filter), or
        None (after printing a message) when `col` or `langage` is invalid.
    """
    data = data.copy()

    # Input validation
    if col not in data.columns :
        print( f"La feature `{col}` que vous avez fournie n'est pas valide" )
        return
    if langage not in ["english", "french"] :
        print(f"la variable `langage` prend uniquement les valeurs `english` ou `french`")
        return

    # NLTK resources
    nltk.download('omw-1.4')
    nltk.download('stopwords')
    nltk.download('punkt')
    nltk.download('wordnet')
    if langage == "english" :
        stemer_obj = nltk.PorterStemmer()
        from nltk.stem import WordNetLemmatizer
        nltk.download('words')
        words = set(nltk.corpus.words.words())
        lemmatizer = WordNetLemmatizer()
    else :
        # Requires the FrenchLefffLemmatizer package (pip install FrenchLefffLemmatizer).
        from french_lefff_lemmatizer.french_lefff_lemmatizer import FrenchLefffLemmatizer
        # Read-only access is enough, and the handle is closed once loaded.
        with open('dictionnaire_Fr.txt', "r", encoding="UTF-8") as dictionary_file :
            words = set(line.strip() for line in dictionary_file)
        stemer_obj = nltk.stem.snowball.FrenchStemmer()
        lemmatizer = FrenchLefffLemmatizer()


    stopwords = set()
    stopwords.update(nltk.corpus.stopwords.words(langage))
    if langage == "english" and complete_stopworld : stopwords.update(["in","then","the","then","that","this","there","we","of","they","but","where","so","only","got",\
                                               "was","say","use","away","need","get","line","want","one","two","give","see","came","let","way",
                                                "come","also","going","done","put","got","went","could","dont","didnt","first","since","made",\
                                               "wasnt","said","would","make","take","every","else","told","even","eat","new","tell","know","thing",\
                                                "thats","used"])

    # Lower-case the text
    if to_minuscule : data[col] = data[col].apply(lambda sentence : sentence.lower())
    # Remove the punctuation
    if rm_ponctuation : data[col] = data[col].apply(lambda sentence : "".join( [i for i in sentence if i not in string.punctuation] ) )
    # Remove the digits
    if rm_number : data[col] = data[col].apply(lambda sentence : "".join(i for i in sentence if not i.isdigit() ) )
    # Tokenisation
    if tokenize_sentence : data[col] = data[col].apply(lambda sentence : nltk.tokenize.word_tokenize(sentence))
    # Remove the stop words
    if rm_stop_words : data[col] = data[col].apply(lambda sentence : [i for i in sentence if i not in stopwords] )
    # Lemmatisation
    if lemmatize_sentence :
        data[col] = data[col].apply(lambda sentence : [lemmatizer.lemmatize(w) for w in sentence ] )
    elif stemming_sentence :
        # Stemming
        data[col] = data[col].apply(lambda sentence : [stemer_obj.stem(w) for w in sentence ] )
    # Remove the tokens shorter than `token_min_len`
    data[col] = data[col].apply(lambda sentence : [ i for i in sentence if len(i) >= token_min_len])

    # Corpus-wide token counts, computed once and shared by the two frequency filters
    corpus_freq = corpus_frequence(data , col)
    if remove_most_frequent[0] :
        # A set gives constant-time membership tests and also copes with an
        # empty corpus (no most-common entry to unpack).
        mc = {token for token, _ in corpus_freq.most_common( remove_most_frequent[1] )}
        # Removal
        data[col] = data[col].apply(lambda sentence : [i for i in sentence if i not in mc] )
    # Remove the rarest tokens
    if remove_less_frequent[0] :
        serie_freq = pd.Series(corpus_freq)
        # The threshold is expressed as a percentage of the number of documents.
        lc = set( serie_freq.index[ serie_freq <= remove_less_frequent[1]*data.shape[0]/100 ] )
        # Removal
        data[col] = data[col].apply(lambda sentence : [i for i in sentence if i not in lc] )

    # Number of tokens and number of distinct tokens
    data["Nb_Tokens_uniques"] = data[col].apply( lambda sentence : np.unique(sentence).size )
    data["Nb_Tokens"] = data[col].apply( lambda sentence : len(sentence))
    # Rebuild the sentences, keeping dictionary words and non-alphabetic tokens
    data[col] = data[col].apply(lambda sentence : " ".join(w for w in sentence if w in words or not w.isalpha()) )

    return data

def hyerarchical_clustering ( data ,link_criterios = "ward", profondeur = 3 ) :
    """Plot the truncated dendrogram of a hierarchical clustering of `data`.

    Args:
        data: observations passed to `scipy.cluster.hierarchy.linkage`.
            Anything `linkage` accepts works (the previous unused
            `data.columns` lookup restricted the input to DataFrames).
        link_criterios: linkage method given to `linkage`.
        profondeur: number of levels shown (`p` of `dendrogram` with
            `truncate_mode='level'`).

    Returns:
        None; the figure is displayed with `plt.show()`.
    """
    linked = linkage( data , link_criterios )
    plt.figure(figsize=(12, 8))
    dendrogram( linked , truncate_mode = 'level' , p=profondeur , orientation='top', distance_sort='descending', show_leaf_counts = True  )
    plt.show()

def World_cloud_show( data , fig_size = (8,5), max_wd = 150, do_mask = True ) :
    """Display a word cloud built from the `text` column of `data`.

    Args:
        data: object exposing a `text` attribute that iterates over strings
            (e.g. a DataFrame with a "text" column); the strings are joined
            with spaces before being handed to `WordCloud.generate`.
        fig_size: (width, height) of the matplotlib figure in inches; the
            word cloud canvas is 100 times these values in pixels.
        max_wd: maximum number of words drawn.
        do_mask: when True, "cloud.jpg" (read from the working directory) is
            used as the mask of the word cloud.

    Returns:
        None; the figure is displayed with `plt.show()`.
    """
    # Local import: the notebook only imports `PIL` and `PIL.ImageFilter`.
    from PIL import Image

    # Options shared by the masked and unmasked variants.
    options = dict(width = fig_size[0]*100, height = fig_size[1]*100, background_color ='white', colormap="plasma",
                   min_font_size=3, max_words=max_wd)

    if do_mask :
        with Image.open("cloud.jpg") as mask_image :
            mask = np.array(mask_image)
        # Every pixel value above 1 is pushed to 255.
        mask[mask > 1] = 255
        options["mask"] = mask

    x = wordcloud.WordCloud(**options).generate(" ".join( data.text))

    # plot the WordCloud image
    plt.figure(figsize = fig_size, facecolor = None)
    plt.imshow(x)
    plt.axis("off")
    plt.tight_layout(pad = 0)
    plt.show()

def TSNE_plot (data , clustering_model , method='exact' , transp = 0.4, perplx = 10) :
    """Project `data` in 3-D with t-SNE and plot the XY, XZ and YZ planes.

    Each point is coloured according to its cluster label.

    Args:
        data: feature matrix given to `TSNE.fit`.
        clustering_model: fitted model exposing `labels_` (one label per row
            of `data`).
        method: `method` argument of `sklearn.manifold.TSNE`.
        transp: alpha (transparency) of the markers.
        perplx: t-SNE perplexity.

    Returns:
        None; three scatter plots are drawn on a 1x3 figure.
    """
    ########## COLOURS
    labl_hc = np.asarray(clustering_model.labels_)
    etiquettes = np.unique( labl_hc )
    palette = [ "red", "orange", "green", "blue", "blueviolet","black","brown" ,"navy" ,"purple", "magenta", "gold"]
    # One random colour per cluster; colours are only reused when there are
    # more clusters than colours in the palette.
    couleurs  = np.random.choice( palette , etiquettes.size , replace = etiquettes.size > len(palette) )
    # Map the label values (not positions) to colours so that negative or
    # non-contiguous labels work as well.
    couleur_par_label = dict(zip(etiquettes, couleurs))
    couleur_hc = [ str(couleur_par_label[label]) for label in labl_hc ]

    # NOTE(review): recent scikit-learn releases rename `n_iter` to `max_iter`;
    # adjust the keyword if the installed version rejects it.
    tsne_hc = sklearn.manifold.TSNE(n_components = 3, perplexity = perplx, n_iter = 1500, n_iter_without_progress = 200, init = 'pca', \
                                    n_jobs= os.cpu_count() , method= method)
    tsne_hc.fit( data )
    embedding = tsne_hc.embedding_

    fig , axes = plt.subplots( 1, 3 ,figsize = (22, 7) )
    plans = ( ("XY", 0, 1), ("XZ", 0, 2), ("YZ", 1, 2) )
    for ax, (titre, i, j) in zip(axes, plans) :
        # x/y are passed by keyword (positional vectors are rejected by recent
        # seaborn) and each title is set on its own axes.
        sns.scatterplot(x = embedding[:,i], y = embedding[:,j],  c = couleur_hc, alpha=transp, s=120, ax=ax )
        ax.set_title(titre , size = 15)

def clustering_model_choice ( data , n_clus = 2, modl = "km") :
    """Fit and return a clustering model on `data`.

    Args:
        data: feature matrix given to the model's `fit`.
        n_clus: number of clusters.
        modl: "km" for `KMeans`, "hc" for Ward `AgglomerativeClustering`.

    Returns:
        The fitted scikit-learn estimator.

    Raises:
        ValueError: if `modl` is neither "km" nor "hc".
    """
    if modl == "km" :
        model = sklearn.cluster.KMeans(n_clusters=n_clus,  max_iter=400 , n_init= 20)
    elif modl == "hc" :
        # Ward linkage only supports the euclidean distance, which is the
        # default; the `affinity` keyword is not passed because recent
        # scikit-learn releases no longer accept it.
        model = AgglomerativeClustering(n_clusters=n_clus, linkage='ward')
    else :
        raise ValueError(f"`modl` must be 'km' or 'hc', got {modl!r}")
    model.fit( data )
    return model

def show_feat_get_descrip ( image, n_feat = 300 , show_feat = True) :
    """Compute the SIFT descriptors of an image and optionally show its keypoints.

    Args:
        image: image convertible with `np.asarray` (e.g. PIL image or ndarray).
        n_feat: value passed to `cv2.SIFT_create`.
        show_feat: when True, display the image with its keypoints drawn.

    Returns:
        The descriptors returned by `detectAndCompute` (returned unchanged).
    """
    image = np.asarray( image )
    sift = cv2.SIFT_create(n_feat )
    kp, des = sift.detectAndCompute( image  , None)
    if show_feat :
        # Let OpenCV allocate the output image: drawing into `image` itself
        # would overwrite the caller's array and fails on read-only arrays.
        img=cv2.drawKeypoints( image ,kp,  None  )
        plt.figure(figsize=(9,6))
        plt.imshow(img)
        plt.grid(False)
        plt.axis('off')
        plt.show()

    return des

def histogram_d_image( descripteurs_toutes_les_images, clustering_model = "KMEANS" , prop_clust = 1.3):
    """Build the bag-of-visual-words histogram of every image.

    Args:
        descripteurs_toutes_les_images: sequence with, for each image, its
            array of descriptors (one descriptor per row). An entry may be
            None or empty when the image has no descriptor.
        clustering_model: "KMEANS" to fit a `MiniBatchKMeans` on all the
            descriptors, or an already fitted model exposing
            `cluster_centers_` and `predict`.
        prop_clust: with "KMEANS", the number of clusters is
            int(prop_clust * sqrt(total number of descriptors)).

    Returns:
        Array with one row per image that has descriptors and one column per
        cluster; each row holds the share of the image's descriptors assigned
        to each cluster. Images without descriptor are reported with a printed
        message and skipped, so the rows are NOT aligned with the input when
        that happens.
    """
    if isinstance(clustering_model, str) and clustering_model == "KMEANS" :
        # Concatenate once, ignoring the images that have no descriptor.
        tous_les_descripteurs = np.concatenate(
            [ d for d in descripteurs_toutes_les_images if d is not None and len(d) > 0 ]
        )
        # Number of descriptor clusters
        nombre_de_cluster_features = int(prop_clust*np.sqrt( tous_les_descripteurs.shape[0] ) )
        clustering_model = cluster.MiniBatchKMeans(n_clusters = nombre_de_cluster_features, init_size = 2*nombre_de_cluster_features,  max_iter = 200, n_init = 40)
        clustering_model.fit( tous_les_descripteurs )

    nombre_de_clusters = len(clustering_model.cluster_centers_)

    # Matrix of the image histograms: shape = ( number of images , number of clusters )
    hist_vectors=[]
    for num , img_descrip in enumerate( descripteurs_toutes_les_images ) :
        # Check for emptiness BEFORE predicting: `predict` rejects empty input.
        if img_descrip is None or len(img_descrip) == 0 :
            print(f"L'image numéro {num} n'a pas de descriptieur")
            continue

        # Cluster of each descriptor, then count the descriptors per cluster
        clusters_des_descripteurs = np.asarray( clustering_model.predict(img_descrip) ).astype(int)
        hist = np.bincount( clusters_des_descripteurs , minlength = nombre_de_clusters ).astype(float)
        hist_vectors.append( hist/len(img_descrip) )

    return np.asarray( hist_vectors )

def img_show( imag ):
    """Display `imag` on a new 8x8 inch figure, without grid nor axes."""
    taille_figure = (8, 8)
    plt.figure(figsize=taille_figure)
    plt.imshow(imag)
    # Hide every decoration so that only the picture remains.
    plt.grid(False)
    plt.axis('off')
    plt.show()
    
def img_hist( img , cumul = False ):
    """Plot the histogram of all the pixel values of `img`.

    `cumul` is forwarded as the `cumulative` option of `sns.histplot`.
    """
    # Flatten the image so that every pixel value counts once.
    valeurs_pixels = np.array( img ).ravel()
    plt.figure(figsize=(7,5))
    sns.histplot( valeurs_pixels, cumulative= cumul, color='blue')
    plt.xlabel("Niveau de gris", size=15)
    plt.show()

def _rescale_channel( channel , Min , Max ) :
    """Linearly map the values of one channel from [min, max] onto [Min, Max]."""
    low, high = channel.min(), channel.max()
    if high == low :
        # Constant channel: there is no range to stretch (and it would divide by zero).
        return np.full(channel.shape, float(Min))
    return Min + (channel - low)*( Max - Min )/( high - low )


def img_repixels ( img , Min = 0 , Max = 255 , auto = False):
    """Stretch the pixel values of an image onto the range [Min, Max].

    Args:
        img: PIL image (or anything `np.array` converts to a 2-D or 3-D array).
        Min: lowest pixel value after rescaling.
        Max: highest pixel value after rescaling.
        auto: when True, ignore Min/Max and return
            `PIL.ImageOps.autocontrast(img)`.

    Returns:
        A PIL image. Without `auto`, every channel is rescaled independently
        (a 2-D grayscale image is rescaled as a whole), truncated to integers
        and converted to uint8.
    """
    # Local import: `import PIL` alone does not load the ImageOps submodule.
    from PIL import Image, ImageOps

    if auto :
        return  ImageOps.autocontrast(img)

    mat = np.array(img).astype(float)

    if mat.ndim == 2 :
        # Grayscale image: a single channel and no third axis.
        mat = _rescale_channel( mat , Min , Max )
    else :
        for i in range( mat.shape[2] ) :
            mat[:,:,i] = _rescale_channel( mat[:,:,i] , Min , Max )

    # Truncate to integers first, then convert to the 8-bit type.
    return Image.fromarray(  mat.astype(int).astype('uint8')  )
    

def get_train_test_index( data , label = "label", nombre=(200,200) ) :
    """Draw random train / test index labels from `data`.

    Args:
        data: DataFrame to split (not modified).
        label: name of the category column (only used when `nombre` is a pair).
        nombre: either
            * a single number: the size of the train set, read as a fraction
              of the rows when it is < 1 and as a row count otherwise; the
              test set is made of all the remaining rows; or
            * a pair (n_train, n_test): rows drawn PER CATEGORY of `label`,
              the test rows being drawn among the rows left after the train
              draw.

    Returns:
        (train_index, test_index): two sorted lists of index labels. Both are
        empty when `nombre` has neither one nor two elements.

    Raises:
        ValueError: from `random.sample` when more rows are requested than
            available (e.g. a category is too small).
    """
    lignes_train, lignes_test = [], []
    taille = np.asarray(nombre).size

    if taille == 1 :
        valeur = np.asarray(nombre).reshape(-1)[0]
        if valeur < 1. :
            valeur = valeur*data.shape[0]
        # `random.sample` needs a plain Python int as sample size.
        n_train = int(valeur)

        lignes_train =  random.sample( sorted(data.index.values), n_train )
        lignes_test = data.drop(index = lignes_train ).index.values

    elif taille == 2 :
        n_train, n_test = int(nombre[0]), int(nombre[1])
        for cat in data[label].unique() :
            sub_data = data[ data[label] == cat]
            lignes_train.extend( random.sample( sorted(sub_data.index.values), n_train ) )

        # Test rows are drawn among the rows that were not selected for training.
        restant = data.drop(index = lignes_train)
        for cat in restant[label].unique() :
            sub_data = restant[ restant[label] == cat]
            lignes_test.extend( random.sample( sorted(sub_data.index.values), n_test ) )

    return sorted( lignes_train ) , sorted( lignes_test )


def load_images ( data, label="photo_id", path = PATH_IMAGE ) :
    """Open the image file of every row and return the rows that could be loaded.

    Args:
        data: DataFrame with one row per photo (not modified).
        label: column holding the file name of each photo, relative to `path`.
        path: directory containing the photo files.

    Returns:
        A copy of `data` with an extra "images" column holding the objects
        returned by `PIL.Image.open`. Rows whose image could not be opened are
        dropped.
        NOTE: as before, `dropna()` is applied to the whole frame, so rows
        with a missing value in ANY column are dropped as well.
    """
    data = data.copy()

    # Object array filled element by element, so that pandas / numpy never try
    # to convert the image objects themselves into arrays.
    images = np.empty(len(data), dtype=object)
    for position, nom_fichier in enumerate(data[label]) :
        try :
            # Open the photo
            images[position] = PIL.Image.open( f"{path}/{nom_fichier}" )
        except Exception :
            # Best effort: an unreadable or missing file leaves None here and
            # the row is removed by the dropna below.
            images[position] = None

    data["images"] = images
    data = data.dropna()
    return data

def process_images ( data , label="images" , process_type = "RGEC", resolution =( 300,300),gauss_size=2):
    """Apply the selected pre-processing steps to the images of `data[label]`.

    The steps are selected by the letters contained in `process_type` and are
    always applied in this order, whatever the order of the letters:
        "E": histogram equalisation (applied twice, as in the original code),
        "G": Gaussian blur of radius `gauss_size`,
        "C": auto-contrast,
        "R": conversion to a NumPy array and resize to `resolution` with
             `cv2.resize` (INTER_AREA).
    Any subset of the letters is accepted; previously every step other than
    "E" failed when "E" was not selected.

    Args:
        data: DataFrame holding PIL images in column `label` (not modified).
        label: name of the column holding the input images.
        process_type: string of step letters.
        resolution: size passed to `cv2.resize`.
        gauss_size: radius of the Gaussian blur.

    Returns:
        A copy of `data` with an extra "images_process" column holding the
        processed images.
    """
    # Local import: `import PIL` alone does not load the ImageOps submodule.
    from PIL import ImageFilter, ImageOps

    data = data.copy()
    images = data[label]

    if "E" in process_type :
        # Flatten the distribution of the pixel values
        images = images.apply(lambda image : ImageOps.equalize(ImageOps.equalize(image ) ) )

    if "G" in process_type :
        images = images.apply(lambda image : image.filter( ImageFilter.GaussianBlur(gauss_size) )  )

    if "C" in process_type :
        # Increase the contrast of the image
        images = images.apply(lambda image : ImageOps.autocontrast( image )  )

    if "R" in process_type :
        images = images.apply( lambda img : cv2.resize( np.asarray(img) , resolution, interpolation=cv2.INTER_AREA ) )

    data['images_process'] = images
    return data

