# Preliminaries

In [1]:
import copy
import itertools
from collections import defaultdict
from operator import itemgetter
import matplotlib.pyplot as plt
import pandas as pd

# PrefixSpan

### Project a sequence

In [None]:
# Implementation taken from https://github.com/sequenceanalysis/sequenceanalysis.github.io/blob/master/notebooks/part2.ipynb
"""
Projects a sequence according to a given prefix, as done in PrefixSpan

Args:
    sequence: the sequence the projection is built from
    prefix: the prefix that is searched for in the sequence
    newEvent: if set to True, the first itemset is ignored
Returns:
    If the sequence does not contain the prefix, then None.
    Otherwise, a new sequence starting from the position of the prefix, including the itemset that includes the prefix
"""
def projectSequence(sequence, prefix, newEvent):
    """
    Project one sequence onto a prefix, as done in PrefixSpan.

    Args:
        sequence: iterable of itemsets the projection is built from
        prefix: items that must all occur inside one single itemset
        newEvent: if True, the first itemset of the sequence is never matched
    Returns:
        None when no eligible itemset contains every item of the prefix.
        Otherwise a new list whose first element is a list copy of the first
        matching itemset, followed by shallow copies of all later itemsets.
    """
    remaining = iter(sequence)
    for position, itemset in enumerate(remaining):
        if newEvent and position == 0:
            continue
        if all(item in itemset for item in prefix):
            # The shared iterator now yields exactly the itemsets after the match.
            return [list(itemset)] + [copy.copy(later) for later in remaining]
    return None

### Project a dataset

In [None]:
"""
Projects a dataset according to a given prefix, as done in PrefixSpan

Args:
    dataset: the dataset the projection is built from
    prefix: the prefix that is searched for in the sequence
    newEvent: if set to True, the first itemset is ignored
Returns:
    A (potentially empty) list of sequences
"""
def projectDatabase(dataset, prefix, newEvent):
    """
    Project every sequence of a dataset onto a prefix, as done in PrefixSpan.

    Args:
        dataset: iterable of sequences the projection is built from
        prefix: items passed to projectSequence for each sequence
        newEvent: forwarded to projectSequence (True ignores the first itemset)
    Returns:
        A (potentially empty) list with the projection of each sequence for
        which projectSequence did not return None, in dataset order.
    """
    candidates = (projectSequence(sequence, prefix, newEvent) for sequence in dataset)
    return [projected for projected in candidates if projected is not None]

### The main algorithm

#### Some more utility functions:

In [None]:
"""
Generates a list of all items that are contained in a dataset
"""
def generateItems(dataset):
    """Return the sorted list of distinct items found in any itemset of any sequence."""
    distinctItems = set()
    for sequence in dataset:
        for itemset in sequence:
            distinctItems.update(itemset)
    return sorted(distinctItems)

"""
Computes the support of each item in the dataset, returned as a sorted list of (item, support) pairs
"""
def generateItemSupports(dataset, ignoreFirstEvent=False, prefix=()):
    """
    Count, for each item, the number of sequences in which it occurs.

    Only itemsets that contain every item of ``prefix`` are inspected, and the
    items of ``prefix`` themselves are not counted.

    Args:
        dataset: iterable of sequences (each a list of itemsets)
        ignoreFirstEvent: if True, the first itemset of every sequence is skipped
        prefix: items that must all be present in an itemset for it to be
            inspected; the empty default inspects every itemset
    Returns:
        A list of (item, support) pairs sorted by item.
    """
    supports = defaultdict(int)
    for sequence in dataset:
        events = sequence[1:] if ignoreFirstEvent else sequence
        # A set ensures an item is counted at most once per sequence.
        cooccurringItems = {
            item
            for itemset in events
            if all(required in itemset for required in prefix)
            for item in itemset
            if item not in prefix
        }
        for item in cooccurringItems:
            supports[item] += 1
    return sorted(supports.items())

#### Finally, the algorithm:

In [None]:
"""
The PrefixSpan algorithm. Computes the frequent sequences in a sequence dataset for a given minSupport

Args:
    dataset: A list of sequences, for which the frequent (sub-)sequences are computed
    minSupport: The minimum support that makes a sequence frequent
Returns:
    A list of tuples (s, c), where s is a frequent sequence, and c is the count for that sequence
"""
def prefixSpan(dataset, minSupport):
    """
    Entry point of PrefixSpan: mine the frequent sequences of a dataset.

    Args:
        dataset: list of sequences for which frequent (sub-)sequences are computed
        minSupport: minimum support an item must reach to be kept
    Returns:
        A list of tuples (s, c), where s is a frequent sequence and c its count.
    """
    frequentSequences = []
    for item, support in generateItemSupports(dataset):
        if support >= minSupport:
            seed = [[item]]
            frequentSequences.append((seed, support))
            # Recurse on the sequences that contain the item, projected onto it.
            projected = projectDatabase(dataset, [item], False)
            frequentSequences.extend(prefixSpanInternal(projected, minSupport, seed))
    return frequentSequences

def prefixSpanInternal(dataset, minSupport, prevPrefixes=[]):
    """
    Recursive step of PrefixSpan: extend an existing prefix inside a projected dataset.

    Args:
        dataset: projected dataset for the current prefix
        minSupport: minimum support an extension must reach to be kept
        prevPrefixes: the current prefix as a list of itemsets; its last
            itemset is read, so it must be non-empty (the empty default
            raises IndexError)
    Returns:
        A list of tuples (s, c) for every frequent extension of the prefix.
    """
    grown = []
    lastItemset = prevPrefixes[-1]

    # Same-event extension: add an item to the last itemset of the prefix.
    # Only items greater than its last item are considered.
    for item, support in generateItemSupports(dataset, False, prefix=lastItemset):
        if support >= minSupport and item > lastItemset[-1]:
            extended = copy.deepcopy(prevPrefixes)
            extended[-1].append(item)
            grown.append((extended, support))
            projected = projectDatabase(dataset, extended[-1], False)
            grown.extend(prefixSpanInternal(projected, minSupport, extended))

    # New-event extension: append a fresh single-item itemset to the prefix,
    # counting only items that occur after the first itemset of each sequence.
    for item, support in generateItemSupports(dataset, True):
        if support >= minSupport:
            extended = copy.deepcopy(prevPrefixes)
            extended.append([item])
            grown.append((extended, support))
            projected = projectDatabase(dataset, [item], True)
            grown.extend(prefixSpanInternal(projected, minSupport, extended))
    return grown

# Filter for closed and maximal patterns
### Closed patterns

In [None]:
"""
Given a list of all frequent sequences and their counts, compute the set of closed frequent sequence (as a list)
This is only a very simplistic (naive) implementation for demonstration purposes!
"""
def filterClosed(result):
    """
    Remove from ``result``, in place, the entries dominated by an entry with the same count.

    An entry (sub, c) is removed when some entry (sup, c) with the same count
    and a different sequence satisfies isSubsequence(sup, sub).
    NOTE(review): isSubsequence is defined outside this section; presumably it
    tests whether its second argument is a subsequence of the first - confirm.

    Args:
        result: list of (sequence, count) tuples; modified in place
    Returns:
        None
    """
    for candidate, candidateCount in copy.deepcopy(result):
        # Take a fresh snapshot on every pass so entries removed earlier are skipped.
        for other, otherCount in copy.deepcopy(result):
            if not isSubsequence(candidate, other):
                continue
            if candidateCount == otherCount and other != candidate:
                result.remove((other, otherCount))

### Maximal sequences

In [None]:
"""
Given a list of all frequent sequences and their counts, compute the set of maximal frequent sequence (as a list)
This is only a very naive implementation for demonstration purposes!
"""
def filterMaximal(result):
    """
    Remove from ``result``, in place, the entries dominated by another entry.

    An entry (sub, c) is removed when some entry with a different sequence
    ``sup`` satisfies isSubsequence(sup, sub), whatever the counts.
    NOTE(review): isSubsequence is defined outside this section; presumably it
    tests whether its second argument is a subsequence of the first - confirm.

    Args:
        result: list of (sequence, count) tuples; modified in place
    Returns:
        None
    """
    for candidate, _ in copy.deepcopy(result):
        # Take a fresh snapshot on every pass so entries removed earlier are skipped.
        for other, otherCount in copy.deepcopy(result):
            if not isSubsequence(candidate, other):
                continue
            if other != candidate:
                result.remove((other, otherCount))

# Application 

Convert the data to the list of lists of lists that we use as a data format

In [2]:
# Load the tab-separated file ./moyen_data.csv into a DataFrame.
df = pd.read_csv(r'./moyen_data.csv', sep="\t")

  exec(code_obj, self.user_global_ns, self.user_ns)


In [3]:
# Keep the rows whose 'class' is 'Aves', restricted to the columns used below.
new_df = df.loc[
    df['class'] == 'Aves',
    ['family', 'species', 'year', 'month', 'decimalLatitude', 'decimalLongitude', 'individualCount'],
]

### Regarder pour les espèces

In [None]:
def ps_for_year(df, min_support_percent=10):
    """
    Run PrefixSpan separately on the rows of each year.

    For every year, one sequence is built per (month, longitude, latitude)
    combination. It holds the distinct non-null species of those rows, each
    as its own single-item itemset, in sorted order so that the mined
    patterns do not depend on set iteration order.

    Rows with a missing year, month, longitude or latitude never matched the
    equality filters of the previous implementation; they are skipped here
    explicitly and no longer add empty sequences that inflate the threshold.

    Args:
        df: DataFrame with the columns 'year', 'month', 'decimalLongitude',
            'decimalLatitude' and 'species'
        min_support_percent: minimum support, expressed as a percentage of
            the number of sequences built for the year
    Returns:
        A list of tuples (s, c, y): each (s, c) pair returned by prefixSpan
        for a year, extended with that year y. Years are visited in the
        iteration order of a set, which is not guaranteed to be chronological.
    """
    resultat_final = []
    for ye in set(df['year'].tolist()):
        if pd.isnull(ye):
            continue
        data1 = df[df.year == ye]
        print('année :', ye)
        sequences = []
        for mo in sorted(data1['month'].dropna().unique().tolist()):
            data2 = data1[data1.month == mo]
            print('mois :', mo)
            # groupby leaves out rows whose longitude or latitude is missing.
            for _, site in data2.groupby(['decimalLongitude', 'decimalLatitude']):
                species = sorted(site['species'].dropna().unique().tolist())
                sequences.append([[name] for name in species])
        # Multiply before dividing: n * 10 / 100 is exact where n * 0.1 is not.
        min_sup = len(sequences) * min_support_percent / 100
        for pattern, count in prefixSpan(sequences, min_sup):
            resultat_final.append((pattern, count, ye))
    return resultat_final

In [None]:
# Mine the frequent patterns of every year of the filtered data.
res = ps_for_year(new_df)

## Traitement des résultats

In [None]:
# Reduce each mined pattern to the frozenset of the first item of each of its
# itemsets, and keep it together with its count and its year.
final = [
    (frozenset(itemset[0] for itemset in pattern), count, year)
    for pattern, count, year in res
]

In [None]:
# One row per (pattern, count, year) tuple of `final`.
df_prefix = pd.DataFrame(final, columns =['espece', 'compte', 'annee'])

In [None]:
# For every distinct pattern, gather its counts and years across all rows:
# [pattern, sum of counts, list of counts, list of years, number of distinct years].
ar = []
espece = set(df_prefix['espece'].tolist())
for element in espece:
    df_1 = df_prefix[df_prefix['espece'] == element]
    list_annee = df_1['annee'].tolist()
    list_count = df_1['compte'].tolist()
    ar.append([element, df_1['compte'].sum(), list_count, list_annee, len(set(list_annee))])
    

In [None]:
# One row per pattern; 'compte' and 'année' each hold a list per row.
df_prefixSpan = pd.DataFrame(ar, columns =['espèce', 'compte_total','compte', 'année', 'nombre_annee'])

In [None]:
# Order the patterns by decreasing 'nombre_annee', then by decreasing 'compte_total'.
final_prefix = df_prefixSpan.sort_values(['nombre_annee', 'compte_total'], ascending = [False,False])

In [None]:
#final_prefix.to_csv('res_prefix.csv')

In [None]:
# Display the 'compte_total' values ordered by decreasing 'nombre_annee'.
df_prefixSpan.sort_values(by = 'nombre_annee', ascending = [False])['compte_total'].tolist()

In [None]:
# Select the row(s) whose pattern is exactly the single species 'Alopochen aegyptiaca'.
df_outte = df_prefixSpan[df_prefixSpan['espèce']== frozenset(['Alopochen aegyptiaca'])]

In [None]:
# Each cell of these columns is a list, so both results are lists of lists
# (one inner list per selected row).
count_outte = df_outte['compte'].tolist()
annee_ouette = df_outte['année'].tolist()

In [None]:
# Plot the values of the first selected row against their years. The pairs are
# sorted by year because the stored lists are not guaranteed to be chronological.
points = sorted(zip(annee_ouette[0], count_outte[0]))
plt.plot([annee for annee, _ in points], [nombre for _, nombre in points])
#plt.title("Nombre d'observation en fonction des années")
plt.xlabel('Années')
plt.ylabel("Nombre d'observation")
plt.savefig("obs_annee.png")
# plt.show must be called; the bare attribute reference did nothing.
plt.show()

## Voir le ratio du nombre d'observations

In [None]:
# Map each distinct value of df['year'] to the number of rows having that year.
year = set(df['year'].tolist())
dico_year = {ye: len(df[df['year'] == ye]) for ye in year}

In [None]:
# Plot the number of rows per year in chronological order; missing years are
# left out because they cannot be placed on the axis.
annees = sorted(annee for annee in dico_year if not pd.isnull(annee))
plt.plot(annees, [dico_year[annee] for annee in annees])
plt.title("Nombre d'observation en fonction des années")
plt.xlabel('Années')
# The closing quote was missing here, which made the whole cell a SyntaxError.
plt.ylabel("Nombre d'observation")
# plt.show must be called; the bare attribute reference did nothing.
plt.show()

In [None]:
# Express each yearly value of the selected pattern as a percentage of the
# number of rows of that same year. Values are paired with their own year
# (not by position), and stored in chronological order.
new_dic = {}
for annee, nombre in sorted(zip(annee_ouette[0], count_outte[0])):
    new_dic[annee] = (nombre / dico_year[annee]) * 100

### Pourcentage d'observations des ouettes

In [None]:
# Plot the percentages in chronological order; missing years are left out.
annees = sorted(annee for annee in new_dic if not pd.isnull(annee))
plt.plot(annees, [new_dic[annee] for annee in annees])
plt.xlabel('Années')
plt.ylabel("Observation(%)")
plt.savefig("rat_annee.png")
# plt.show must be called; the bare attribute reference did nothing.
plt.show()

### Nombre d'ouettes observées chaque année

In [None]:
# Map each distinct value of df['year'] to the number of rows having that year.
year = set(df['year'].tolist())
dico_count = {ye: len(df[df['year'] == ye]) for ye in year}
    

In [None]:
# Restrict the data to 'Alopochen aegyptiaca', print the sum of
# 'individualCount' divided by the number of rows, then sum 'individualCount'
# for every year present in new_df.
year = set(new_df['year'].tolist())
new_new_df = new_df[new_df['species'] == 'Alopochen aegyptiaca']
moyen = new_new_df['individualCount'].sum() / len(new_new_df)
print('moyen : ', moyen)
dico_count_ou = {
    ye: new_new_df.loc[new_new_df['year'] == ye, 'individualCount'].sum()
    for ye in year
}

In [None]:
# Plot the yearly sums in chronological order; missing years are left out.
annees = sorted(annee for annee in dico_count_ou if not pd.isnull(annee))
plt.plot(annees, [dico_count_ou[annee] for annee in annees])
#plt.title("Ratio d'obseervation pour chaque année")
plt.xlabel('Années')
plt.ylabel("Nombre d'espèce")
#plt.savefig("rat_annee.png")
# plt.show must be called; the bare attribute reference did nothing.
plt.show()

## Évaluation générale

In [None]:
# Keep the patterns whose 'nombre_annee' is at least 3, and list them.
tri_df = df_prefixSpan[df_prefixSpan['nombre_annee']>=3]
espece = tri_df['espèce'].tolist()

In [None]:
# Average relative change between consecutive years, for every kept pattern.
dico_croissance = {}
for elem in tri_df.itertuples():
    # Order the values by year first: the stored lists are not guaranteed to be
    # chronological, and the change is only meaningful between successive years.
    compte = [nombre for _, nombre in sorted(zip(elem.année, elem.compte))]
    variations = [
        (suivant - precedent) / precedent
        for precedent, suivant in zip(compte, compte[1:])
    ]
    if not variations:
        # Fewer than two values: no change can be computed.
        continue
    dico_croissance[elem.espèce] = sum(variations) / len(variations)

In [None]:
# One row per pattern with its average growth, displayed by decreasing growth.
df_croissance = pd.DataFrame(dico_croissance.items(), columns=['espèce', 'croissance'])
#print(df_croissance[df_croissance['croissance']>0.1])
df_croissance.sort_values(by = 'croissance', ascending = [False])

In [None]:
# Count the rows of df_outte for each distinct value of its 'year' column.
# NOTE(review): df_outte is selected from df_prefixSpan, whose columns are
# 'espèce', 'compte_total', 'compte', 'année' and 'nombre_annee'; with that
# frame the 'year' lookup raises KeyError - confirm which DataFrame was meant.
year = set(df_outte['year'].tolist())
dico_year = {}
for ye in year:
    data1 = df_outte[df_outte['year']==ye]
    dico_year[ye]=len(data1)

In [None]:
def evaluation(df, df_prefixSpan, espece):
    """
    Plot, for one species, its yearly value and that value as a percentage of the rows of df.

    Values are paired with their own year rather than by position, and both
    curves are drawn in chronological order.

    Args:
        df: DataFrame with a 'year' column; its number of rows per year is
            the denominator of the percentage
        df_prefixSpan: DataFrame with the columns 'espèce' (frozenset),
            'compte' and 'année' (parallel lists: one value per year)
        espece: species name; the first row whose 'espèce' equals
            frozenset([espece]) is used
    Returns:
        None. Saves "obs_annee.png" and "rat_annee.png" and shows both figures.
    Raises:
        IndexError: if no row of df_prefixSpan matches the species
        ZeroDivisionError: if df has no row for one of the pattern's years
    """
    selection = df_prefixSpan[df_prefixSpan['espèce'] == frozenset([espece])]
    points = sorted(zip(selection['année'].tolist()[0], selection['compte'].tolist()[0]))
    annees = [annee for annee, _ in points]

    plt.figure(1)
    plt.plot(annees, [nombre for _, nombre in points])
    plt.xlabel('Temps (Années)')
    plt.ylabel("Observation")
    plt.savefig("obs_annee.png")
    plt.show()

    print('ratio')
    ratios = [
        (nombre / len(df[df['year'] == annee])) * 100
        for annee, nombre in points
    ]
    plt.figure(2)
    plt.plot(annees, ratios)
    #plt.title("Ratio d'observation pour chaque année")
    plt.xlabel('Temps (Années)')
    plt.ylabel("Observation(%)")
    plt.savefig("rat_annee.png")
    plt.show()

In [None]:
# Draw both figures for one species.
evaluation (new_df ,df_prefixSpan, 'Alopochen aegyptiaca')# species names noted here: Alopochen aegyptiaca, 'Branta canadensis'