# LCM Miner tool

In [2]:
import pandas as pd 
import numpy as np
from datetime import date,datetime
import csv
import re
from multiprocessing import Pool
from functools import partial
from sklearn import preprocessing
from google.colab import files

!ls 

ModuleNotFoundError: No module named 'google.colab'

### 1. Une fonction qui prend les arguments suivants:
- un jeu de données (contenant un timestamp)
- la fréquence de temps choisie (année, mois, jour)

Elle retourne le jeu de données partitionné selon la granularité temporelle sous la forme de plusieurs fichiers, un par couple (partition, période de temps).

In [7]:
def split_dataset(input_file,frequency):
    """Split the ratings dataset into one transaction file per time period.

    The CSV at ``input_file`` must provide ``userId``, ``movieId`` and
    ``timestamp`` (seconds since the epoch) columns.  For every period that
    contains at least one rating, two files are written to the current
    working directory:

    * ``str(period)`` -- one line per user: the user id followed by the ids
      of the movies that user rated during the period, space separated;
    * ``"permut" + str(period)`` -- one user id per line, in the same order
      as the lines of the transaction file, so a line number can be mapped
      back to a user id.

    Parameters
    ----------
    input_file : str or file-like
        Path of the ratings CSV.
    frequency : str
        pandas period alias used to bucket timestamps (e.g. "Y", "M", "D").

    Yields
    ------
    pandas.Period
        The period just written; ``str(period)`` is the name of its
        transaction file.
    """
    df = pd.read_csv(input_file)

    # Bucket every rating into its period (e.g. "Y" -> 2000, "M" -> 2000-01).
    df["timestamp"] = pd.to_datetime(df["timestamp"],unit="s").dt.to_period(frequency)
    df["movieId"] = df["movieId"].astype(str)

    # Group on the period values themselves rather than resampling a
    # PeriodIndex: only periods that actually contain ratings are produced,
    # so no empty transaction files are written for gaps in the data.
    for period,period_df in df.groupby("timestamp"):
        baskets = period_df.groupby("userId")["movieId"].apply(lambda ids: " ".join(ids)).reset_index()
        user_ids = baskets["userId"].astype(str)

        transactions = (user_ids + " " + baskets["movieId"]).to_frame()
        transactions.to_csv(str(period),header=False,index=False)

        # Line number -> user id mapping for this period.
        user_ids.to_frame().to_csv("permut"+str(period),header=False,index=False)
        yield period
        

### 2. Une fonction qui prend les arguments suivants:
- un couple (partition, période de temps)
- la valeur du support
<br>Cette fonction fait appel à pmr.pylcm et retourne des groupes fréquents selon leur support
Group description:[set of items] (support) Group content:[set of users]

In [8]:
def format_output(results,period):
    """Parse raw LCM output lines into parallel lists.

    The first three entries of ``results`` are skipped; from there on the
    entries are read in pairs: a description line holding the item ids and a
    parenthesised support, followed by a line holding the group content.
    Parsing stops at the first description line without a ``(``.

    Parameters
    ----------
    results : sequence of str
        Output lines of an LCM run.
    period : object
        Period label repeated once for every parsed group.

    Returns
    -------
    tuple of four lists
        ``(item_sets, supports, groups, periods)`` where each item set is a
        comma-joined string of item ids, each support is the support as a
        string, and each group is the content line without its first
        character.
    """
    item_sets = []
    supports = []
    groups = []
    periods = []

    description_lines = results[3::2]
    content_lines = results[4::2]
    pair_count = min(len(description_lines), len(content_lines))

    for position in range(pair_count):
        description = description_lines[position]
        if '(' not in description:
            break
        # Every number on the line is an item id except the last one,
        # which is the support.
        *item_ids,support = re.findall("([0-9]+)+",description)
        item_sets.append(','.join(item_ids))
        supports.append(support)
        periods.append(period)
        # Drop the first character of the content line.
        groups.append(content_lines[position][1:])

    return item_sets,supports,groups,periods

def run_lcm(input_file=None,support=None,output=""):
    """Run the ``./lcm`` executable on one partition file.

    The command executed is ``./lcm CfI <input_file> <support> <result>``
    where ``<result>`` is ``str(input_file) + output``.  With the default
    ``output=""`` the result path is therefore the input path itself.

    Parameters
    ----------
    input_file : object
        Partition file; anything whose ``str()`` is the file name
        (e.g. a ``pandas.Period`` yielded by ``split_dataset``).
    support : object
        Support value passed to LCM (converted with ``str()``).
    output : str
        Suffix appended to the input file name to build the result path.

    Returns
    -------
    str
        The result path, or ``''`` when LCM reports
        "there is no frequent item".

    Raises
    ------
    OSError
        If the ``./lcm`` executable cannot be started.
    """
    # Local import so the function stays self-contained in the notebook.
    import subprocess

    result_path = str(input_file)+output
    # An explicit argument list (no shell) keeps file names containing
    # spaces or shell metacharacters intact and works outside IPython too.
    completed = subprocess.run(
        ["./lcm","CfI",str(input_file),str(support),result_path],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,  # LCM's message may go to either stream
        universal_newlines=True,
    )
    if "there is no frequent item" in completed.stdout:
        return ''
    return result_path

### 3. Une fonction qui prend en argument:
Group description:[set of items] (support) Group content:[set of users]
et qui retourne pour chaque user, l’ensemble des groupes auxquels il appartient sous la forme:
(user_id, group_id, période de temps)


In [9]:
def users_groups(df,output="result.csv"):
    """Expand every group into one ``(user_id, group_id, period)`` row per user.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``users_ids`` (comma-separated user ids),
        ``groups`` (group identifier) and ``period``.
    output : str
        Path of the CSV file the expanded table is written to.

    Returns
    -------
    pandas.DataFrame
        Columns ``user_id``, ``group_id``, ``period``; one row per user of
        each group, in the order of ``df`` and of the ids inside each group.
    """
    rows = []
    for members,group_id,period in zip(df.users_ids.str.split(","),df.groups,df.period):
        if not isinstance(members,list):
            # Missing users_ids value: nothing to expand for this group.
            continue
        for user_id in members:
            # A trailing separator (as produced by form_group) yields an
            # empty token; it is not a user and must not become a row.
            if user_id != "":
                rows.append((user_id,group_id,period))

    res = pd.DataFrame(rows,columns=['user_id','group_id','period'])
    res.to_csv(output,index=None)
    return res

### Mining function 

In [10]:
# def multithread_lcm(input_file='data/ml-latest-small/ratings.csv',frequency="Y"):

def singlethread_lcm(input_file='data/ml-latest-small/ratings.csv',frequency="Y",support=20 ):
    """Run LCM sequentially on every period partition of the dataset.

    Sequential counterpart of ``multithread_lcm``: the dataset is split with
    ``split_dataset`` and ``run_lcm`` is called on each partition in turn.

    Parameters
    ----------
    input_file : str
        Path of the ratings CSV.
    frequency : str
        Period alias handed to ``split_dataset``.
    support : int
        Support value handed to ``run_lcm``.

    Returns
    -------
    list of str
        One entry per partition, in partition order: the value returned by
        ``run_lcm`` (the result file name, or ``''`` when LCM found no
        frequent itemset).
    """
    results = []
    for partition in split_dataset(input_file,frequency):
        # support must be passed by keyword: run_lcm's second positional
        # parameter is the support, not the frequency.
        results.append(run_lcm(partition,support=support))
    return results

def multithread_lcm(input_file,frequency,support,processes=8):
    """Run LCM on every period partition using a pool of worker processes.

    Parameters
    ----------
    input_file : str
        Path of the ratings CSV.
    frequency : str
        Period alias handed to ``split_dataset``.
    support : int
        Support value handed to ``run_lcm``.
    processes : int
        Number of worker processes in the pool (defaults to 8).

    Returns
    -------
    list
        One ``run_lcm`` result per partition, in partition order.
    """
    f = partial(run_lcm,support=support)
    # The context manager releases the workers even when a task raises;
    # map() has collected every result before the block is left.
    with Pool(processes) as p:
        return p.map(f,split_dataset(input_file,frequency))

def permut(x,data):
    """Translate space-separated positions into the values stored in ``data[0]``.

    Parameters
    ----------
    x : str
        Space-separated integer positions; empty tokens (e.g. from a
        trailing space) are ignored.
    data : mapping or DataFrame
        Object whose ``data[0]`` can be indexed with each integer position.

    Returns
    -------
    str
        The looked-up values, each followed by a single space.
    """
    # Compare by value: `is ""` tests object identity, which only works by
    # accident of string interning and raises a SyntaxWarning on Python 3.8+.
    return "".join(
        str(data[0][int(token)])+' '
        for token in x.split(' ')
        if token != ""
    )


# Run the code 

### LCM mining in multithread
#### Output: file name of each period having results

In [11]:
# Parameters of the mining run.
input_file = "ratings.csv"  # ratings CSV read by split_dataset
frequency = "Y"             # period alias used to partition the ratings
support = 30                # support value passed to LCM

In [12]:
output = multithread_lcm(input_file,frequency,support)
# run_lcm returns '' for periods without any frequent itemset; keep only the
# file names of periods that produced results.  Compare by value (`!=`), not
# by identity (`is not`), which is unreliable for string literals.
output = [i for i in output if i != ""]
output 

['1996', '2000', '2015', '2017']

### combine all files in one dataframe

In [15]:
outputs = []
result_frames = []
permutation_frames = []

for i in output:
    # LCM result file of the period, tagged with the period it belongs to.
    df = pd.read_csv(i,header=None)
    df["period"] = i 
    result_frames.append(df)

    # Line-number -> user-id mapping of the same period.
    df = pd.read_csv("permut"+i,header=None)
    df["period"] = i 
    permutation_frames.append(df)

# Concatenate once instead of growing a DataFrame inside the loop.  Each
# frame keeps its own 0..n-1 index on purpose: form_group looks user ids up
# by line number within a period.
combined_csv = pd.concat(result_frames) if result_frames else pd.DataFrame()
permutations = pd.concat(permutation_frames) if permutation_frames else pd.DataFrame()
    
def form_group(x,permutations):
    """Replace space-separated line numbers by the matching user ids.

    Parameters
    ----------
    x : str
        Space-separated integer positions; empty tokens are ignored.
    permutations : mapping or DataFrame
        Object whose ``permutations[0]`` maps each position to a user id.

    Returns
    -------
    str
        The user ids, each followed by a comma (so a non-empty result ends
        with a trailing comma).
    """
    # Compare by value: `is not ""` tests object identity, which only works
    # by accident of string interning and warns on Python 3.8+.
    return "".join(
        str(permutations[0][int(token)])+","
        for token in x.split(" ")
        if token != ""
    )

### replace transactions ids with users_ids

In [16]:
# Result rows alternate: even rows hold "itemset (support)", odd rows hold
# the matching list of transaction line numbers.
df1,df2= combined_csv[0::2].reset_index(drop=True),combined_csv[1::2].reset_index(drop=True)
# Splitting on the parenthesised part (kept thanks to the capture group)
# gives: text before it, the support, and the text after it.
df1 = df1[0].str.split(r'\((.*)\)',expand=True)
df1.drop([2],axis=1,inplace = True)
df1.columns= ["itemset","support"]
df2.columns = ["users_ids","period"]
df = pd.concat([df1,df2],axis=1)
df["groups"] = df["users_ids"]
for (period,i) in df.groupby("period"):
    # Translate line numbers with the mapping of the same period.  Assign
    # through .loc: the chained form df["groups"][idx] = ... may write to a
    # temporary copy and leave df unchanged.
    df.loc[i.index,"groups"] = i["groups"].apply(lambda x:form_group(x,permutations[permutations.period==period]))
df["users_ids"] = df["groups"]
df.head()

Unnamed: 0,itemset,support,users_ids,period,groups
0,,97,"5,6,8,14,26,31,35,37,38,40,43,46,54,56,58,81,9...",1996,"5,6,8,14,26,31,35,37,38,40,43,46,54,56,58,81,9..."
1,592,83,"5,6,8,14,26,35,37,38,40,43,46,54,56,58,81,94,9...",1996,"5,6,8,14,26,35,37,38,40,43,46,54,56,58,81,94,9..."
2,380,79,"5,6,8,26,37,38,40,43,46,56,58,81,94,99,102,107...",1996,"5,6,8,26,37,38,40,43,46,56,58,81,94,99,102,107..."
3,380 592,72,"5,6,8,26,37,38,40,43,46,56,58,81,94,99,102,109...",1996,"5,6,8,26,37,38,40,43,46,56,58,81,94,99,102,109..."
4,296,77,"5,6,8,14,26,37,38,40,43,54,56,58,81,94,99,102,...",1996,"5,6,8,14,26,37,38,40,43,54,56,58,81,94,99,102,..."


### Label encoder to encode groups 

In [17]:
# Replace each group's user-id string by an integer label.
le = preprocessing.LabelEncoder()
df["groups"]= le.fit_transform(df["groups"])
df.head()

Unnamed: 0,itemset,support,users_ids,period,groups
0,,97,"5,6,8,14,26,31,35,37,38,40,43,46,54,56,58,81,9...",1996,5079
1,592,83,"5,6,8,14,26,35,37,38,40,43,46,54,56,58,81,94,9...",1996,5080
2,380,79,"5,6,8,26,37,38,40,43,46,56,58,81,94,99,102,107...",1996,5314
3,380 592,72,"5,6,8,26,37,38,40,43,46,56,58,81,94,99,102,109...",1996,5316
4,296,77,"5,6,8,14,26,37,38,40,43,54,56,58,81,94,99,102,...",1996,5085


In [18]:
# Number of distinct group labels vs. number of rows.
df.groups.nunique(),df.groups.size


(40112, 40112)

In [19]:
# Save the table of groups; the file name encodes the frequency and support.
output_file ='{}-{}-all.dat'.format(frequency,support)
df.to_csv(output_file)
!ls

1996  2004  2012  LCM_Minner.ipynb  permut2003	permut2011  ratings.csv
1997  2005  2013  permut1996	    permut2004	permut2012  Y-30-all.dat
1998  2006  2014  permut1997	    permut2005	permut2013
1999  2007  2015  permut1998	    permut2006	permut2014
2000  2008  2016  permut1999	    permut2007	permut2015
2001  2009  2017  permut2000	    permut2008	permut2016
2002  2010  2018  permut2001	    permut2009	permut2017
2003  2011  lcm   permut2002	    permut2010	permut2018


### Split users_ids 

In [21]:
# File that receives the per-user table written by users_groups.
output_file ='{}-{}-users.dat'.format(frequency,support)

In [22]:
# Expand the groups to one row per user, write them to output_file and preview.
users_groups(df,output=output_file).head()

Unnamed: 0,user_id,group_id,period
0,5,5079,1996
1,6,5079,1996
2,8,5079,1996
3,14,5079,1996
4,26,5079,1996


In [23]:
# Remove the intermediate files: the permut* mapping files and the
# per-period files.
# NOTE(review): `1*` and `2*` match EVERY file in the working directory whose
# name starts with 1 or 2, not only the period files -- confirm nothing else
# lives here before running.
!rm permut*
!rm 1*
!rm 2*
!ls


lcm  LCM_Minner.ipynb  ratings.csv  Y-30-all.dat  Y-30-users.dat
