### Introduction : 

This notebook creates gzip-compressed JSON Lines (`.jsonl.gz`) event files for rankings and interactions from the input data. For each month, one file is written for ranking events and a separate file is written for interaction events.  

### Import libraries:

In [65]:
import pandas as pd 
import numpy as np 
import os as os
from datetime import datetime
import json
import gzip
import jsonlines
import pickle

### File locations :  

In [76]:
## Location to pick up data from 
folder_loc = "D:/Samagra/KO/Ama_krushi_data/"

## location to save all created files 
content_folder =  "D:/Samagra/KO/Recommendations engine/IVRS transformations/"

save_ranking_interactions =  "D:/Samagra/KO/Recommendations engine/IVRS transformations/interactions_rankings/"

## file names for the gzip outputs; each one is prefixed with the month when written
gzip_filename_interactions = 'interaction.jsonl.gz'

gzip_filename_ranking = 'ranking.jsonl.gz'

# gzip_filename = 'ranking_interaction_Jan_Feb.jsonl.gz'

## file name for content metadata:  
content_md_filename=  'content_metadata.csv'

## A call counts as "liked" when duration / pushcall_length is strictly above this value.
## NOTE(review): how this exact number was derived is not recorded in the notebook - confirm and document.
eng_ratio_cutoff = 0.858034321372855

## finding the list of files
## os.listdir returns names in arbitrary order, so sort explicitly (newest file-name prefix first)
## instead of reversing whatever order the OS happened to give. Only CSV files are kept so that
## stray entries in the folder (checkpoints, desktop.ini, ...) never reach read_csv.
file_list = sorted(
    (name for name in os.listdir(folder_loc) if name.lower().endswith('.csv')),
    reverse=True,
)

### User defined functions:

In [66]:
def writeall_jsonl_gz(filename, payload):
    """Write every record of ``payload`` to ``filename`` as gzip-compressed JSON Lines.

    Parameters
    ----------
    filename : str
        Path of the ``.jsonl.gz`` file to create; an existing file is overwritten.
    payload : iterable
        Records handed to ``jsonlines.Writer.write_all``.
    """
    with gzip.open(filename, 'wb') as gz_stream:
        jsonlines.Writer(gz_stream).write_all(payload)

In [69]:
def intial_cleaning_df(df1):
    """Prepare raw call records: keep known content only and compute the engagement ratio.

    Steps:
      1. lowercase ``pushcall_id`` (this is written back onto the frame passed in),
      2. drop rows whose ``pushcall_length`` is missing,
      3. inner-join on ``pushcall_id`` with the notebook-level ``content_md`` frame,
      4. add ``eng_ratio = duration / pushcall_length`` as a float column.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Needs the columns ``pushcall_id``, ``duration`` and ``pushcall_length``.

    Returns
    -------
    pandas.DataFrame
        The joined rows with the extra ``eng_ratio`` column.
    """
    # Normalise ids on the caller's frame so they can match the metadata ids.
    df1['pushcall_id'] = df1['pushcall_id'].str.lower()

    length_known = df1.pushcall_length.notna()
    in_catalogue = pd.merge(df1.loc[length_known, :], content_md[['pushcall_id']], how='inner')

    ratio = in_catalogue['duration'] / in_catalogue['pushcall_length']
    in_catalogue['eng_ratio'] = ratio.astype('float')
    return in_catalogue

In [70]:
def create_interactions_df(df1, cutoff=None):
    """Return the liked calls of engaged users, each with that user's full list of content ids.

    A call is "liked" when its ``eng_ratio`` is strictly greater than the cutoff.
    Only users with at least one liked call are kept. For each of them, every
    content id they were called with (liked or not) is collected into a
    comma-separated string, de-duplicated in first-seen (chronological) order.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Needs the columns ``number``, ``call_datetime``, ``pushcall_id`` and
        ``eng_ratio``. ``call_datetime`` must hold strings, because the month is
        taken from characters 5-6 of the text. The frame is not modified.
    cutoff : float, optional
        Engagement-ratio threshold. Defaults to the notebook-level
        ``eng_ratio_cutoff``.

    Returns
    -------
    pandas.DataFrame
        One row per liked call, ordered by ``call_datetime``, with the input
        columns plus ``liked_call``, ``month`` and ``Pushcall_ids_rankings``.
    """
    if cutoff is None:
        cutoff = eng_ratio_cutoff

    # Work on a copy so the caller's frame does not silently gain a column.
    calls = df1.copy()
    calls['liked_call'] = (calls['eng_ratio'] > cutoff).astype(int)

    # Keep the complete call history of every user who liked at least one call.
    engaged_numbers = calls.loc[calls['liked_call'] == 1, 'number'].unique()
    engaged_calls = (
        calls[calls['number'].isin(engaged_numbers)]
        .sort_values('call_datetime')
        .reset_index(drop=True)
    )
    engaged_calls = engaged_calls.assign(
        month=engaged_calls['call_datetime'].str[5:7].astype('int'),
        pushcall_id=engaged_calls['pushcall_id'].str.lower(),
    )

    # dict.fromkeys de-duplicates while preserving order, so the output is
    # reproducible (a set would give an arbitrary order on every run).
    ids_per_user = (
        engaged_calls.groupby('number')['pushcall_id']
        .agg(lambda ids: ','.join(dict.fromkeys(map(str, ids))))
        .rename('Pushcall_ids_rankings')
        .reset_index()
    )

    liked_calls = engaged_calls.loc[engaged_calls['liked_call'] == 1, :]
    return pd.merge(liked_calls, ids_per_user, on='number')

In [71]:
def create_interactions_dict(rankings_interactions_df, id_prefix=None):
    """Turn liked calls into a list of ``interaction`` event records.

    Parameters
    ----------
    rankings_interactions_df : pandas.DataFrame
        Needs the columns ``number``, ``call_datetime`` and ``pushcall_id``.
        The row index is used to build the event ids.
    id_prefix : str, optional
        Text put in front of the row index to form ``id`` and ``ranking``.
        Defaults to the notebook-level ``file_month`` loop variable.

    Returns
    -------
    list of dict
        One record per row with the keys ``event``, ``fields``, ``id``,
        ``item``, ``ranking``, ``session``, ``tenant``, ``timestamp``,
        ``type`` and ``user``. ``timestamp`` is whole milliseconds since
        1970-01-01.
    """
    if id_prefix is None:
        id_prefix = file_month

    interactions_df = rankings_interactions_df[['number', 'call_datetime', 'pushcall_id']].rename(
        columns={'number': 'user', 'call_datetime': 'timestamp', 'pushcall_id': 'item'}
    )
    # Only the first 10 characters of the number are used as the user id.
    interactions_df['user'] = interactions_df['user'].astype(str).str[0:10]
    interactions_df['fields'] = [[] for _ in range(len(interactions_df))]

    # Integer floor-division gives exact milliseconds; slicing the string form
    # of a float only works while the value happens to have 13 digits.
    since_epoch = pd.to_datetime(interactions_df['timestamp']) - pd.Timestamp("1970-01-01")
    # object dtype keeps plain Python ints, which json can serialise.
    interactions_df['timestamp'] = (since_epoch // pd.Timedelta('1ms')).astype(object)

    interactions_df['id'] = (id_prefix + interactions_df.index.astype('str')).astype('str')
    # An interaction points at the ranking event that carries the same id.
    interactions_df['ranking'] = interactions_df['id']
    interactions_df['tenant'] = 'default'
    interactions_df['type'] = 'click'
    interactions_df['session'] = interactions_df['user']
    interactions_df['event'] = 'interaction'

    ordered_columns = ['event', 'fields', 'id', 'item', 'ranking', 'session',
                       'tenant', 'timestamp', 'type', 'user']
    return interactions_df[ordered_columns].to_dict('records')

In [72]:
def create_rankings_dict(rankings_interactions_df, id_prefix=None):
    """Turn liked calls into a list of ``ranking`` event records.

    Each record lists every content id stored in ``Pushcall_ids_rankings`` with
    relevancy 0 and is timestamped 10 seconds before the call.

    Parameters
    ----------
    rankings_interactions_df : pandas.DataFrame
        Needs the columns ``number``, ``call_datetime`` and
        ``Pushcall_ids_rankings`` (comma-separated content ids). The row index
        is used to build the event ids.
    id_prefix : str, optional
        Text put in front of the row index to form ``id``. Defaults to the
        notebook-level ``file_month`` loop variable.

    Returns
    -------
    list of dict
        One record per row with the keys ``user``, ``timestamp``, ``items``,
        ``id``, ``event``, ``session``, ``tenant`` and ``fields``.
        ``timestamp`` is whole milliseconds since 1970-01-01.
    """
    if id_prefix is None:
        id_prefix = file_month

    rankings_df = rankings_interactions_df[['number', 'call_datetime', 'Pushcall_ids_rankings']].rename(
        columns={'number': 'user', 'call_datetime': 'timestamp', 'Pushcall_ids_rankings': 'items'}
    )
    # Only the first 10 characters of the number are used as the user id.
    rankings_df['user'] = rankings_df['user'].astype(str).str[0:10]
    rankings_df['id'] = (id_prefix + rankings_df.index.astype('str')).astype('str')
    rankings_df['event'] = 'ranking'
    rankings_df['session'] = rankings_df['user']
    rankings_df['tenant'] = 'default'
    rankings_df['fields'] = [[] for _ in range(len(rankings_df))]

    # Exact integer milliseconds, shifted back 10 s so the ranking precedes
    # the interaction recorded for the same call.
    since_epoch = pd.to_datetime(rankings_df['timestamp']) - pd.Timestamp("1970-01-01")
    # object dtype keeps plain Python ints, which json can serialise.
    rankings_df['timestamp'] = (since_epoch // pd.Timedelta('1ms') - 10 * 1000).astype(object)

    # Build the item list directly; assembling JSON text by string replacement
    # breaks as soon as an id contains a quote character.
    rankings_df['items'] = rankings_df['items'].apply(
        lambda joined_ids: [{'id': item_id, 'relevancy': 0} for item_id in joined_ids.split(',')]
    )
    return rankings_df.to_dict('records')

### Reading data : 

In [68]:
## load the content metadata CSV; intial_cleaning_df joins the call records against it
content_md_path = content_folder + content_md_filename
content_md = pd.read_csv(content_md_path)

### Creating the event files: 

Read the call files from the input folder month by month, build the ranking and interaction events, and write them to gzip-compressed JSON Lines files.  

In [73]:
# One row per input file; the first five characters of the file name (e.g. "22-04") are the month key.
files_df = pd.DataFrame(file_list, columns=['file_name']).assign(
    month=lambda names: names['file_name'].str[:5]
)

In [77]:
# Columns needed downstream; everything else in the raw files is dropped.
call_columns = ['call_datetime', 'pushcall_id', 'number', 'duration', 'ratings', 'pushcall_length']

# NOTE: `file_month` must keep this name - create_interactions_dict and
# create_rankings_dict read it as a global to prefix the event ids.
for file_month in files_df.month.unique():
    month_file_names = files_df.loc[files_df.month == file_month, 'file_name'].values

    # Collect every file of the month and concatenate once. Previously the
    # accumulator was re-created inside the per-file loop, so only the last
    # file read for each month ended up in the output.
    month_frames = []
    for file_name in month_file_names:
        print(file_name)
        df = pd.read_csv(folder_loc + file_name)
        # Rows without a content length cannot get an engagement ratio.
        df = df.loc[~df.pushcall_length.isna(), call_columns]
        month_frames.append(df)
    all_files_df = pd.concat(month_frames, axis=0, ignore_index=True)

    df1 = intial_cleaning_df(all_files_df)
    rankings_interactions_df = create_interactions_df(df1)
    interactions_dict = create_interactions_dict(rankings_interactions_df)
    rankings_dict = create_rankings_dict(rankings_interactions_df)

    # Two files per month: <month>_ranking.jsonl.gz and <month>_interaction.jsonl.gz
    writeall_jsonl_gz(save_ranking_interactions + file_month + '_' + gzip_filename_ranking, rankings_dict)
    writeall_jsonl_gz(save_ranking_interactions + file_month + '_' + gzip_filename_interactions, interactions_dict)
    print('written ' + file_month)

22-04-30_April_22.csv


  df = pd.read_csv(folder_loc+file_name)


written 22-04
22-03-31_March_20-31.csv


  df = pd.read_csv(folder_loc+file_name)


22-03-20_March_10-20.csv
22-03-10_March_1-10.csv
written 22-03
22-02-30_Feb_22_3.csv
22-02-20_Feb_22_2.csv
22-02-10_Feb_22_1.csv
written 22-02
22-01-30_Jan_20-30.csv
22-01-20_Jan_11-20.csv


  df = pd.read_csv(folder_loc+file_name)


22-01-11_Jan_1-10.csv


  df = pd.read_csv(folder_loc+file_name)


written 22-01
21-12-31_Dec_21_21-31.csv
21-12-31_Dec_21_11-20.csv
21-12-10_Dec_21_1.csv


  df = pd.read_csv(folder_loc+file_name)


written 21-12
21-11-30_Nov_21_21-30.csv
21-11-20_Nov_21_11-20.csv
21-11-10_Nov_21_01-10.csv
written 21-11
21-10-30_Oct_21_3.csv
21-10-20_Oct_21_2.csv
21-10-10_Oct_21_1.csv
written 21-10
21-09-30_Sept_21_3.csv
21-09-20_Sept_21_2.csv
21-09-10_Sept_21_1.csv
written 21-09
21-08-30_August_21_3.csv


  df = pd.read_csv(folder_loc+file_name)


21-08-20_August_21_2.csv
21-08-10_August_21_1.csv
written 21-08
