# LCM Miner tool

In [1]:
import pandas as pd 
import numpy as np
from datetime import date,datetime
import csv
import re
from multiprocessing import Pool
from functools import partial
from sklearn import preprocessing
from IPython.core import display as ICD
import matplotlib.pyplot as plt
import itertools

### 1. Une fonction qui prend les arguments suivants:
jeu de données (contenant timestamp)
fréquence de temps choisie (année, mois, jour) et qui retourne le jeu de données partitionné selon la granularité temporelle sous la forme de plusieurs fichiers, un par couple (partition, période de temps)

In [2]:
# TODO dateset_filter parameter not used
def split_dataset(input_file,frequency,dataset_filter):
    """Split initial dataset to multiple files according to frequency
    
    Yields
    ------
    str
        names of partitions files
    """
    df = pd.read_csv(input_file)
#     df.query(dataset_filter,inplace=True)
    df["timestamp"] = pd.to_datetime(df["timestamp"],unit="s").dt.to_period(frequency)
    df.set_index(["timestamp"],inplace=True)
    df.movieId = df.movieId.astype(str)
    
    for period,period_df in df.groupby(pd.Grouper(freq = frequency)):
        output = period_df.groupby("userId").movieId.apply(lambda x: " ".join(x)).to_frame().reset_index()
        output.movieId = output.userId.astype(str)+" "+ output.movieId
        output.drop(["userId"],inplace=True,axis=1)
        output.to_csv(str(period),header=None,index=None)
        yield period
        

### 2. Une fonction qui prend les arguments suivants:
- un couple (partition, période de temps)
- la valeur du support
<br>Cette fonction fait appel à pmr.pylcm et retourne des groupes fréquents selon leur support
Group description:[set of items] (support) Group content:[set of users]

In [3]:
def format_output(results,period):
    """Format lcm output to:
    Group description:[set of items] (support) Group content:[Sorted set of users] 
    """
    item_sets,supports,groups,periods = [],[],[],[]
    for i,j in zip(results[3::2],results[4::2]):
        if '(' not in i:
            break
        *items,support = re.findall("([0-9]+)+",i)
        items = ','.join(e for e in items)
        group = j[1:]
        item_sets.append(items)
        supports.append(support)
        periods.append(period)
        groups.append(group)
    return item_sets,supports,groups,periods



### 3. Une fonction qui prend en argument:
Group description:[set of items] (support) Group content:[set of users]
et qui retourne pour chaque user, l’ensemble des groupes auxquels il appartient sous la forme:
(user_id, group_id, période de temps)


In [4]:
def users_groups(df,output="result.csv"):
    # split each user from the group to a new line : [[user_id,group,period],]
    res = pd.DataFrame(df.users_ids.str.split(",").tolist(),index=[df.period,df.group]).stack()
    res = res.reset_index([0,"group","period"])
    res.columns = ["period","group_id","user_id"]
    res = res[['user_id','group_id','period']]
    return res

### Mining function 

In [266]:
def run_lcm(input_file=None,support=None,itemsets_size=[5,100]):
    """Runs LCM (Single Thread)  and return the  result formated with format_output
    
    Example for parameters : input_file='1999',support=6, itemsets_size=[5,100]
    Executed command :  $ ./lcm C_QI -l 5 -u 100 1999 6 -
    
    Preconfigured parameters:  
     C: enumerate closed frequent itemsets
     Q: output the frequency on the head of each itemset found,
     I: output ID's of transactions including each itemset; ID of a transaction is given by the number of line in which the transaction is written. The ID starts from 0.
     _: no output to standard output (including messages w.r.t. input data)
     -l,-u [num]: enumerate itemsets with size at least/most [num]  
   
    Output:
        Replace file having name input_file with the result : support,itemset,users
        if no itemset found the input_file is deleted and output is empty string ""
    """
    input_file= str(input_file)
    
    result = !./lcm C_QI -l {itemsets_size[0]} -u {itemsets_size[1]} {input_file} {support} -
    if "there is no frequent item" in str(result) or result == []:
        print("No itemset",input_file)
        try:
            !rm {str(input_file)}
        except:
            pass
        return ''
    
    # #transaction to user_id and replace input_file with result
    reformat_output(input_file,result)
    
    return input_file



def multithread_lcm(input_file,frequency,support,itemsets_size,dataset_filter):
    f = partial(run_lcm,support=support,itemsets_size=itemsets_size)
    p = Pool(8)
    res  = p.map(f,split_dataset(input_file,frequency,dataset_filter))
    p.close()
    p.join()
    return res

def permut(x,data):
    output= ""
    for i in x.split(' '):
        if i is "":
            continue
        output+= str(data[0][int(i)])+' '
    return output



def form_group(x,permutations):
    output = ''
    for i in x.split(" "):
        if i is not "":
            output+=str(permutations[0][int(i)])+","     
    return output[:-1]

def combine_outputs(files_names):
    """
    Output : Dataframe,Dataframe
        content of files in files_names in one dataframe
        content of permutations files associated in one dataframe 
    """
    combined_csv = pd.DataFrame()
    permutations = pd.DataFrame()

    for i in files_names:
        permut_file = "permut"+str(i)
        try:# when l is too small even if lcm find results, file is created by empty
            df = pd.read_csv(i,header=None)
        except:
            continue
        df["period"] = i 
        combined_csv= pd.concat([combined_csv,df])
        df = pd.read_csv(permut_file ,header=None)
        df["period"] = i 
        permutations= pd.concat([permutations,df])
    return combined_csv,permutations

def reformat_output(input_file,unformated_result):
    """
    Reformat default output of lcm : (support),itemset,#transations
    to a file having name:  input_file and structure : support|itemsets|users
    """
    result = []
    permutation = pd.read_csv(str(input_file),header=None)[0].apply(lambda x: x.split(" ")[0])
    for i,j in zip(unformated_result[0::2],unformated_result[1::2]):
        support,*itemset = i.split()
        itemset = j
        support = support[1:-1]
        users = " ".join([str(permutation[int(z)]) for z in j.split()])
        result.append(f'{support}|{itemset}|{users}')
    
    # Permut #transaction to user_id
    pd.DataFrame(result).to_csv(input_file,header=None,index=None)
    return result 

def encode_groups(df):
    """Label encoding from column users_ids in dataframe input"""
    le = preprocessing.LabelEncoder()
    df["group"]= le.fit_transform(df["users_ids"])

# Run the code 

In [6]:
def linear_closed_itemset_miner(input_file,frequency,support,itemsets_size,dataset_filter):
    output = multithread_lcm(input_file,frequency,support,itemsets_size,dataset_filter)
    res = [i for i in  output if i is not ""]
    df = pd.DataFrame()
    for i in res:
        if i is "":
            continue
        df = pd.concat([df,pd.read_csv(i,sep="|",header=None)],axis=0)
    df.columns = ["support","itemsets","users"]
    
    output_file = f'{frequency}-{support}-[{itemsets_size[0]}-{itemsets_size[1]}]-{dataset_filter}-groups.dat'
    df.to_csv(output_file)
    return df


In [19]:
#linear_closed_itemset_miner("ratings.csv","Y",50,[2,15],"")

In [None]:
dataset_filter = ""
for frequency in ["Y","6M","3M","2M","M"]:
    for support in [20,30,50,100,200,1000 ]:
        for itemsets_size in [[2,5],[2,10],[2,20],[20,50],[2,100]]:
            output_file = f'{frequency}-{support}-[{itemsets_size[0]}-{itemsets_size[1]}]-groups.dat'
            try:
                df = linear_closed_itemset_miner(input_file,frequency,support,itemsets_size,output_file,dataset_filter)
                print(output_file)
                ICD.display(df.group.value_counts().to_frame().head())
            except Exception as e:
                print("Exception For : ",output_file)
                print(e)

# 2nd Version Groups of users having same demographic properties

In [38]:
# TODO Groupby using ranges ex: age within ranges of size 10 
def dataset_property_split(df,frequency,properties):
    for period,i in df.groupby(pd.Grouper(freq = frequency)):
        for values,ii in i.groupby(properties):
            if len(properties)>1:
                values = '_'.join(str(z) for z in values)
            split_name = f"{period}_{values}"
            ii = ii.groupby('user_id')["movie_id"].apply(
                    lambda x: " ".join(str(z) for z in x)
            )
            ii.reindex(np.arange(ii.index.max())).fillna(',').to_csv(
                split_name,index=False,header=False,sep = '\t', quoting = csv.QUOTE_NONE, escapechar = ' '
            )        
            yield str(split_name)
        
def reformat_output(unformated_result):
    """
    Reformat default output of lcm  to a dataframe with structure : support,itemsets,users
    """
    output = pd.DataFrame([unformated_result[0::2],unformated_result[1::2]]).T
    output = pd.concat([output.drop(0,axis=1),output[0].str.split('\(([0-9]+)\)',expand=True).drop(0,axis=1)],axis=1)
    return output


def run_lcm(split_name,support,itemsets_size,output_file):
    """Runs LCM (Single Thread)  and return the  result formated with format_output
    
    Example for parameters : input_file='1999',support=6, itemsets_size=[5,100]
    Executed command :  $ ./lcm C_QI -l 5 -u 100 1999 6 -
    
    Preconfigured parameters:  
     C: enumerate closed frequent itemsets
     Q: output the frequency on the head of each itemset found,
     I: output ID's of transactions including each itemset; ID of a transaction is given by the number of line in which the transaction is written. The ID starts from 0.
     _: no output to standard output (including messages w.r.t. input data)
     -l,-u [num]: enumerate itemsets with size at least/most [num]  
   
    Output:
        Replace file having name input_file with the result : support,itemset,users
        if no itemset found the input_file is deleted and output is empty string ""
    """
    result = !./lcm C_QI -l {itemsets_size[0]} -u {itemsets_size[1]} {split_name} {support} -
    if "there is no frequent item" in str(result) or result == []:
        print("No itemset",split_name)
        return 
    print("Found ",len(result)/2," in",split_name )
    reformat_output(result).to_csv(output_file,header=False,index=None,mode="a")
    return split_name


def multithread_lcm(input_file,frequency,support,itemsets_size,properties,output_file):
    f = partial(run_lcm,support=support,itemsets_size=itemsets_size,output_file=output_file)
    p = Pool(8)
    res  = p.imap_unordered(f,dataset_property_split(input_file,frequency,properties))
    p.close()
    p.join()
    return res
                                              
def linear_closed_itemset_miner(df,frequency,support,itemsets_size,properties):
    output_file = f'{frequency}-{support}-[{itemsets_size[0]}-{itemsets_size[1]}]-[{"_".join(str(i) for i in properties)}]-groups.dat'
    try:
        !rm {output_file}
    except:
        pass
    a = multithread_lcm(df,frequency,support,itemsets_size,properties,output_file)
    a = [i for i in a if i is not None]

def age_class(age):
    age = np.int(age)
    if age<=12:
        return 0
    if age<=17:
        return 1
    if age<=24:
        return 2
    if age<=34:
        return 3
    if age<=44:
        return 4
    if age<44:
        return 5
    if age<=54:
        return 6
    if age<=74:
        return 7
    return 8

### Getting dataset ready
1. Merge users and ratigns dataset 
2. split ages into small ranges

In [40]:
# ratings dataframe 
df = pd.read_csv("datasets/ml-1m/ml-1m/ratings.dat",sep='::',header=None,engine='python')
df.columns = ["user_id","movie_id","rating","timestamp"]
df["timestamp"] = pd.to_datetime(df["timestamp"],unit="s").dt.to_period("T")

# Users dataframe 
users = pd.read_csv("datasets/ml-1m/ml-1m/users.dat",header=None,sep="::",engine="python")
users.columns = ["user_id","user_gender","user_age","user_occupation","user_zip_code"]
users.set_index("user_id").head()
# User age to ranges 
users.user_age = users.user_age.apply(lambda x : age_class(x))
# merge both
df = df.merge(users,on="user_id")
df.set_index(["timestamp"],inplace=True)
df.head()

Unnamed: 0_level_0,user_id,movie_id,rating,user_gender,user_age,user_occupation,user_zip_code
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2000-12-31 22:12,1,1193,5,F,0,10,48067
2000-12-31 22:35,1,661,3,F,0,10,48067
2000-12-31 22:32,1,914,3,F,0,10,48067
2000-12-31 22:04,1,3408,4,F,0,10,48067
2001-01-06 23:38,1,2355,5,F,0,10,48067


# Run Code 

In [None]:
input_file="ratings.csv"
frequencies =["7D","21D","M","2M","3M","Y"]
supports = [20,25,30,35,50,75,100,200,1000]
itemsets_sizes =  [[2,5],[2,10],[2,20],[10,20],[15,20],[2,100]]
properties = [["user_gender"],["user_age"],["user_zip_code"],["user_age","user_gender"],["user_gender","user_occupation"],["user_zip_code","user_age"],["user_zip_code","user_age","user_gender"]]
for i in itertools.product(frequencies,supports,itemsets_sizes,properties):
    linear_closed_itemset_miner(df,*i)

rm: impossible de supprimer '7D-20-[2-5]-[user_gender]-groups.dat': Aucun fichier ou dossier de ce type
No itemset 2000-04-25_F
Found  68.0  in 2000-04-25_M
No itemset 2000-05-02_F
Found  35.0  in 2000-05-02_M
No itemset 2000-05-09_F
Found  411.0  in 2000-05-09_M
Found  271.0  in 2000-05-16_M
No itemset 2000-05-23_F
Found  304.0  in 2000-05-23_M
No itemset 2000-05-30_F
No itemset 2000-05-16_F
Found  758.0  in 2000-05-30_M
No itemset 2000-06-06_F
Found  5.0  in 2000-06-06_M
Found  4.0  in 2000-06-13_M
Found  21.0  in 2000-06-20_M
No itemset 2000-06-27_F
No itemset 2000-06-13_F
No itemset 2000-06-20_F
Found  1.0  in 2000-07-04_F
Found  15.0  in 2000-07-11_M
Found  3183.0  in 2000-07-04_M
Found  1.0  in 2000-07-18_M
No itemset 2000-07-11_F
Found  345.0  in 2000-07-25_M
No itemset 2000-07-18_F
Found  94.0  in 2000-08-01_F
No itemset 2000-07-25_F
Found  2.0  in 2000-08-08_F
No itemset 2000-08-22_F
No itemset 2000-08-15_F
Found  19007.0  in 2000-06-27_M
Found  105.0  in 2000-08-29_M
No items

TypeError: 'int' object is not iterable

In [37]:
values

0