In [None]:
# Tested on Python 3.8.10
!pip install datasketch
!pip install PyYAML==5.3.1
!pip install pandas==2.0.2

In [3]:
# Importing libraries
import pickle
import random
import numpy as np
import pandas as pd

from yaml import CLoader as Loader, load
from datasketch import MinHash, MinHashLSHForest

from ml_pipeline.model import LSHGraph
from ml_pipeline.score import score_fn, get_extn
from ml_pipeline.utils import feat_imp, count_fn
from ml_pipeline.utils import conv_values, flatten_list

---

## Data Cleaning

In [4]:
# Parse the YAML configuration file that supplies every path and parameter used below.
# NOTE(review): `Loader` is yaml.CLoader, which is not the safe loader. That is fine for a
# trusted local config; consider CSafeLoader if the file can come from elsewhere.
with open("../input/config.yaml") as config_file:
    config = load(config_file, Loader=Loader)


In [5]:
# Unpack the settings used throughout the notebook from the `config` dictionary.

# Column names
label = config["label"]          # column dropped before training and averaged for click rates
id_col = config["id_col"]        # identifier column handed to LSHGraph
features = config["features"]    # columns used as model features
list_cols = config["list_cols"]  # feature columns whose cells stay as lists

# Tunable parameters
n_perm = config["n_perm"]        # passed as num_perm to MinHashLSHForest
thresh = config["thresh"]        # maximum list length allowed in a list column

# File locations
data_path = config["data_path"]
clean_data_path = config["clean_data_path"]
count_path = config["count_path"]
model_path = config["model_path"]
seed_path = config["seed_path"]
extn_path = config["extn_path"]


In [6]:
# Read the raw input records from the JSON file at `data_path` into a DataFrame
data = pd.read_json(data_path)

In [7]:
# Discard whatever index the JSON file produced and renumber rows from 0
data = data.reset_index(drop=True)

In [8]:
# Collapse non-list feature columns to scalars: a cell holding a list is replaced
# by that list's first element; anything else (including an empty list) becomes None.
for c in features:
    # Columns named in list_cols keep their full lists and are handled later
    if c not in list_cols:
        # The truthiness check on `x` guards against empty lists, where x[0]
        # would raise IndexError
        data[c] = data[c].apply(lambda x: x[0] if isinstance(x, list) and x else None)


In [9]:
# (rows, columns) before the list-length filter is applied
data.shape

(200000, 11)

In [10]:
# Remove rows where any list column holds more than `thresh` elements
# (cells that are not lists count as zero elements).
# A boolean mask is accumulated instead of writing a temporary "count" column,
# so no existing column can be clobbered and an empty `list_cols` is a no-op.
keep = pd.Series(True, index=data.index)
for c in list_cols:
    n_items = data[c].apply(lambda x: len(x) if isinstance(x, list) else 0)
    keep &= n_items <= thresh

# .copy() gives an independent frame so later column assignments do not act on a slice
data = data[keep].copy()


In [11]:
# (rows, columns) after rows with over-long lists were removed
data.shape

(106546, 11)

In [12]:
# Normalise every list column: list cells are sorted, any non-list cell becomes []
for list_col in list_cols:
    data[list_col] = data[list_col].apply(
        lambda cell: [] if type(cell) is not list else sorted(cell)
    )


In [13]:
# Move the current index (each row's position before filtering) into a column,
# renumber the rows from 0, and expose that former index under the name "id"
data = data.reset_index().rename(columns={"index": "id"})


In [14]:
# Persist the cleaned DataFrame as JSON at `clean_data_path`;
# the training and seed-extension sections read it back from there
data.to_json(clean_data_path)


In [15]:
# Calculate feature counts for scoring
# NOTE(review): count_fn comes from ml_pipeline.utils; the exact shape of its result
# is not visible here, only that it supports .to_csv()
count_df = count_fn(data, features, list_cols)

# Save the calculated feature counts to a CSV file (without the index column);
# the same `count_path` is later passed to score_fn
count_df.to_csv(count_path, index=False)


---

## Model training

In [16]:
# Reload the cleaned records from disk so training starts from the persisted data
data = pd.read_json(clean_data_path)

# Training frame: every column except the label
df = data.drop(columns=label)


In [17]:
# Create a MinHash LSH Forest index; num_perm is taken from the config value n_perm
lsh = MinHashLSHForest(num_perm=n_perm)

# Create an LSH Graph object using the specified DataFrame (df) and parameters
# NOTE(review): LSHGraph is project code (ml_pipeline.model); presumably it hashes the
# `features` columns of each row keyed by `id_col` — confirm against its implementation
lsh_graph = LSHGraph(df, lsh, features, id_col=id_col, n_perm=n_perm)


In [18]:
# Train the model; per the captured output this logs progress every 5,000 records
lsh_graph.update_graph()

Processing 0 of 106546
Processing 5000 of 106546
Processing 10000 of 106546
Processing 15000 of 106546
Processing 20000 of 106546
Processing 25000 of 106546
Processing 30000 of 106546
Processing 35000 of 106546
Processing 40000 of 106546
Processing 45000 of 106546
Processing 50000 of 106546
Processing 55000 of 106546
Processing 60000 of 106546
Processing 65000 of 106546
Processing 70000 of 106546
Processing 75000 of 106546
Processing 80000 of 106546
Processing 85000 of 106546
Processing 90000 of 106546
Processing 95000 of 106546
Processing 100000 of 106546
Processing 105000 of 106546


In [19]:
# Serialise the trained lsh_graph object to `model_path` with pickle
with open(model_path, "wb") as model_file:
    pickle.dump(lsh_graph, model_file)


---

## Seed set extension

In [20]:
# Read the cleaned data (including the label column) back from `clean_data_path`
data = pd.read_json(clean_data_path)

In [21]:
# Load the seed set from the CSV file at `seed_path`
seed = pd.read_csv(seed_path)

# Collect the values of its "id" column as a plain Python list
seed_ids = seed["id"].tolist()


In [22]:
# Load the trained model from the file.
# A context manager is used so the file handle is closed as soon as loading finishes.
# NOTE: pickle.load can execute arbitrary code from the file; only load model files
# produced by this pipeline.
with open(model_path, "rb") as f:
    lsh_graph = pickle.load(f)


In [23]:
# Retrieve the neighbors of the seed set from the LSH graph
# NOTE(review): the structure of `neighbors` is defined by LSHGraph.extract_neighbors
# (project code); it is only passed on to score_fn below
neighbors = lsh_graph.extract_neighbors(seed_ids)


In [24]:
# Keep only the records whose "id" is not part of the seed set
is_seed = data["id"].isin(seed_ids)
df = data[~is_seed]


In [25]:
# Baseline click rate: mean of the label column over the rows in `df`,
# expressed as a percentage rounded to two decimals
def_click_rate = round(df[label].mean() * 100, 2)


In [26]:
# Score the neighbors using a scoring function
# Note: this rebinds `df`, replacing the non-seed subset used for the baseline click rate
# NOTE(review): score_fn is project code (ml_pipeline.score); it receives the full `data`
# frame and the path to the saved feature counts — confirm what columns it returns
df = score_fn(data, count_path, features, list_cols, seed_ids, neighbors, label)


In [27]:
# Create and store an extension file with a calculated click rate
# NOTE(review): the meaning of x=2 is defined inside ml_pipeline.score.get_extn and is not
# visible here — confirm it and consider moving it into the config file
extn_click_rate = get_extn(df, seed_ids, label, extn_path, x=2)


In [28]:
# Report the baseline click rate next to the click rate returned by get_extn
print(f"Click rate increased from {def_click_rate}% to {extn_click_rate}%")


Click rate increased from 9.24% to 13.82%


---