In [None]:
# Environment sanity checks: print the working directory, the available conda
# environments and the current GPU status.
!pwd
!conda env list
!nvidia-smi
import os
# 8589934592 bytes = 8 * 1024**3 (8 GiB). The variable is set here, before the
# cell that imports cudf/cuml.
# NOTE(review): confirm that the installed RAPIDS/RMM version actually reads
# this variable; if it does not, the pool has to be configured through RMM.
os.environ['RAPIDS_GPU_MEMORY_POOL_SIZE'] = '8589934592'
print(os.environ['RAPIDS_GPU_MEMORY_POOL_SIZE'])


In [None]:
# conda install -c rapidsai -c nvidia -c conda-forge cudf=23.04


In [None]:
##### FUNCTIONS
import pandas as pd
import cudf
import cuml
import numpy as np

# clean cudf dataframe
def cleanDF(df):
    """Normalise the dtypes of a subscriber / app-activity frame.

    Only the known columns that are actually present in ``df`` are touched,
    so the same function serves the app export, the subscriber export and
    their merge. It is safe to call again on an already-cleaned frame.

    Conversions:
      * ID and e-mail counters: missing values become 0, then ``int64``.
      * Date columns: ``datetime64[ns]``.
      * Flag columns: real booleans. Textual flags are mapped explicitly
        ('Yes', 'On', 'True', 'Y', 'T', '1' -> True, case-insensitive;
        anything else, including missing values -> False). A plain
        ``astype('bool')`` on text does not do this: in pandas every
        non-empty string, 'No' included, becomes True.

    Text columns such as 'Language' or 'Country' are deliberately left as
    strings, because downstream code expects objects rather than categories.

    Parameters
    ----------
    df : pandas.DataFrame or cudf.DataFrame
        Frame to clean. It is modified in place.

    Returns
    -------
    The same ``df`` object, so the call can be used in an assignment.
    """
    int_cols = ['ID', 'Send Count', 'Open Count', 'Click Count',
                'Unique Open Count', 'Unique Click Count']

    date_cols = ['Subscription Start Date', 'Subscription Expiration',
                 'App Session Date', 'Free Trial Start Date', 'Free Trial Expiration']

    bool_cols = ['Demo User', 'Free Trial User', 'Auto Renew',
                 'Email Subscriber', 'Push Notifications']

    # Lower-cased spellings that count as "true" in the raw exports.
    # 'on' covers the On/Off encoding used by 'Auto Renew'.
    truthy = ['yes', 'y', 'on', 'true', 't', '1']

    # Keep only columns that exist in df
    int_cols = [col for col in int_cols if col in df.columns]
    date_cols = [col for col in date_cols if col in df.columns]
    bool_cols = [col for col in bool_cols if col in df.columns]

    # Guard against empty selections: the app export, for instance, has no
    # flag columns at all.
    if int_cols:
        df[int_cols] = df[int_cols].fillna(0).astype('int64')
    if date_cols:
        df[date_cols] = df[date_cols].astype('datetime64[ns]')

    for col in bool_cols:
        flags = df[col]
        kind = flags.dtype.kind
        if kind == 'b':
            # Already boolean (e.g. second pass over a merged frame).
            continue
        if kind in 'iuf':
            # Numeric 0/1 flags; a missing flag is treated as False.
            df[col] = flags.fillna(0).astype('bool')
        else:
            text = flags.astype('str').str.strip().str.lower()
            df[col] = text.isin(truthy).fillna(False).astype('bool')

    return df



In [None]:
# from functions import cleanDF
# cudf.set_option('display.max_columns', None)
# cudf.set_option('display.width', None)

# Read in the data
# mergedDF = pd.read_csv('../data/mergedClean.csv', index_col=0)
# Load each CSV export straight onto the GPU and normalise its dtypes.
appDF = cleanDF(cudf.read_csv('/root/data/app_nonull.csv'))
subscriberDF = cleanDF(cudf.read_csv('/root/data/subscriberClean.csv'))

# Left join: every subscriber is kept, app-session rows are attached by ID.
# The merged frame is cleaned again after the join.
mergedDF = cleanDF(subscriberDF.merge(appDF, how='left', on='ID'))

# Snapshots of the cleaned frames, taken before any later cell modifies them.
appDF_start, subscriberDF_start, fullDF_start = (
    appDF.copy(), subscriberDF.copy(), mergedDF.copy())

# Shape first, then per-column dtypes, then a preview as the cell output.
print(mergedDF.shape, mergedDF.dtypes, sep='\n')
mergedDF.head()

In [None]:
# import matplotlib.pyplot as plt
# import gower
# from scipy.cluster.hierarchy import dendrogram, linkage
# from sklearn.preprocessing import StandardScaler

# # Preprocessing
# # Replace boolean columns with 0 and 1
# subscriberDF_bool = subscriberDF.select_dtypes(include=bool).astype(int)
# subscriberDF_non_bool = subscriberDF.select_dtypes(exclude=bool)

# # Combine the preprocessed boolean columns and other columns back
# subscriberDF_processed = pd.concat(
#     [subscriberDF_non_bool, subscriberDF_bool], axis=1)

# # Standardize numerical columns
# num_features = subscriberDF_processed.select_dtypes(
#     include=['int64', 'float64']).columns
# scaler = StandardScaler()
# subscriberDF_processed[num_features] = scaler.fit_transform(
#     subscriberDF_processed[num_features])

# # Calculate Gower distance matrix
# gower_distances = gower.gower_matrix(subscriberDF_processed)

# # Perform agglomerative clustering
# # You can also try other linkage methods like 'single', 'complete', or 'average'
# Z = linkage(gower_distances, method='ward')

# # Plot the dendrogram
# plt.figure(figsize=(10, 6))
# dendrogram(Z)
# plt.title("Dendrogram for Agglomerative Hierarchical Clustering")
# plt.xlabel("Subscriber Index")
# plt.ylabel("Distance")
# plt.show()


In [None]:
### HIERARCHICAL CLUSTERING ON GOWER DISTANCES ###
# This step runs on the host. cuML's AgglomerativeClustering only offers
# single linkage on raw feature vectors: it takes no precomputed distance
# matrix, no distance_threshold and exposes no merge distances, so it cannot
# produce a dendrogram from a Gower matrix. gower.gower_matrix also returns a
# NumPy array, not a GPU object.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import cudf
import cupy as cp
import cuml
import gower
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial.distance import squareform
from sklearn.preprocessing import StandardScaler

# gower, scikit-learn and SciPy work on host memory: pull the frame off the GPU
# (or copy it if it already is a pandas frame).
subscriber_host = (subscriberDF.to_pandas() if hasattr(subscriberDF, 'to_pandas')
                   else subscriberDF.copy())
# The identifier carries no similarity information.
subscriber_host = subscriber_host.drop(columns=['ID'], errors='ignore')

# Preprocessing
# Replace boolean columns with 0 and 1
subscriberDF_bool = subscriber_host.select_dtypes(include=bool).astype(int)
subscriberDF_non_bool = subscriber_host.select_dtypes(exclude=bool)

# Combine the preprocessed boolean columns and other columns back
subscriberDF_processed = pd.concat(
    [subscriberDF_non_bool, subscriberDF_bool], axis=1)

# Convert categorical columns to strings
cat_features = subscriberDF_processed.select_dtypes(include='category').columns
if len(cat_features):
    subscriberDF_processed[cat_features] = subscriberDF_processed[cat_features].astype(
        str)

# Standardize numerical columns
num_features = subscriberDF_processed.select_dtypes(
    include=['int64', 'float64']).columns
scaler = StandardScaler()
subscriberDF_processed[num_features] = scaler.fit_transform(
    subscriberDF_processed[num_features])

# Calculate Gower distance matrix (n x n, so memory grows quadratically with
# the number of subscribers).
gower_distances = gower.gower_matrix(subscriberDF_processed)

# Missing inputs can yield NaN distances; SciPy's linkage needs finite values.
gower_distances_clean = np.nan_to_num(gower_distances, nan=0.0)

# Perform agglomerative clustering.
# linkage() expects the condensed (upper-triangle) form; a square matrix would
# be read as a table of observations. 'average' is used because Ward linkage
# assumes Euclidean distances, which Gower distances are not. 'single' and
# 'complete' are also valid here.
Z = linkage(squareform(gower_distances_clean, checks=False), method='average')

# Plot the dendrogram
plt.figure(figsize=(10, 6))
dendrogram(Z)
plt.title("Dendrogram for Agglomerative Hierarchical Clustering")
plt.xlabel("Subscriber Index")
plt.ylabel("Distance")
plt.show()


In [None]:
import cudf
import cupy as cp
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from cuml.preprocessing import StandardScaler
from cuml.cluster import KMeans

# Select numerical columns
num_features = ['Purchase Amount', 'Send Count', 'Open Count', 'Click Count',
                'Unique Open Count', 'Unique Click Count']

# Build the feature matrix on a copy: scaling subscriberDF in place would make
# the cell non-repeatable and leak standardised values into later cells.
kmeans_features = (cudf.from_pandas(subscriberDF) if isinstance(subscriberDF, pd.DataFrame)
                   else subscriberDF.copy())

feature_cols = []
for column in kmeans_features.columns:
    kind = kmeans_features[column].dtype.kind
    # The identifier is not a feature, and this cell does not encode dates.
    if column == 'ID' or kind == 'M':
        continue
    values = kmeans_features[column]
    if kind == 'O':
        # Text columns are plain strings after cleanDF, so they must be encoded
        # here. Missing values get their own code (-1).
        values = values.astype('category').cat.codes.fillna(-1)
    # KMeans needs a dense numeric matrix without nulls.
    kmeans_features[column] = values.astype('float32').fillna(0)
    feature_cols.append(column)
kmeans_features = kmeans_features[feature_cols]

# Standardize numerical columns
scaled_cols = [col for col in num_features if col in feature_cols]
scaler = StandardScaler()
kmeans_features[scaled_cols] = scaler.fit_transform(kmeans_features[scaled_cols])

# Perform KMeans clustering
n_clusters = 5
kmeans = KMeans(n_clusters=n_clusters)
kmeans.fit(kmeans_features)

# Retrieve cluster labels as a host array for matplotlib
cluster_labels = cp.asnumpy(cp.asarray(kmeans.labels_))

# Visualize cluster distribution (one bin centred on each cluster id)
plt.hist(cluster_labels, bins=np.arange(n_clusters + 1) - 0.5)
plt.xlabel('Cluster')
plt.ylabel('Number of Subscribers')
plt.title('Distribution of Subscribers by Cluster')
plt.show()


In [None]:
# Show the current dtype of every subscriberDF column.
subscriberDF.dtypes

In [None]:
import cudf
import cupy as cp
import cuml
import pandas as pd
from cuml.preprocessing import StandardScaler
from cuml.cluster import KMeans

# Start from the cleaned snapshot rather than subscriberDF, so the cell can be
# re-run and the exported file keeps the real (unscaled) values.
raw_subscribers = subscriberDF_start
if isinstance(raw_subscribers, pd.DataFrame):
    raw_subscribers = cudf.from_pandas(raw_subscribers)

# Preprocessing: turn every column into a float feature.
features = raw_subscribers.copy()
if 'ID' in features.columns:
    # The identifier is not a meaningful clustering feature.
    features = features.drop(columns=['ID'])

for col in features.columns:
    values = features[col]
    kind = values.dtype.kind
    if kind == 'O':
        # Text columns -> integer category codes (missing -> -1).
        values = values.astype('category').cat.codes.fillna(-1)
    elif kind in 'bM':
        # Booleans -> 0/1; datetimes -> integer timestamps.
        values = values.astype('int64')
    values = values.astype('float64')
    # Impute with the column mean so missing dates do not become extreme
    # outliers at timestamp 0.
    fill_value = float(values.mean()) if values.notnull().any() else 0.0
    features[col] = values.fillna(fill_value)

# Normalize numerical features
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

# Clustering
# Choose the number of clusters you want to create
n_clusters = 5

# Instantiate the KMeans model
kmeans = KMeans(n_clusters=n_clusters)

# Fit the model to your data
kmeans.fit(features_scaled)

# Assign the cluster labels to each data point. The label array is attached
# positionally to avoid any index alignment.
clusteredDF = raw_subscribers.copy()
clusteredDF['cluster'] = cp.asarray(kmeans.labels_)

# Save the clustered data if necessary
clusteredDF.to_csv("/root/data/clustered_data.csv", index=False)


In [None]:
# Read the exported clustering result back with pandas and preview the first rows.
clusterPD = pd.read_csv("/root/data/clustered_data.csv")
clusterPD.head()

In [None]:
import cupy as cp
# Print the id of CUDA device 0 as seen by CuPy.
print(cp.cuda.Device(0).id)


In [None]:
# Check GPU utilisation and memory after the clustering cells above.
!nvidia-smi

In [None]:
import numpy as np
import pandas as pd
import gower
from sklearn.preprocessing import StandardScaler

# gower and scikit-learn work on host (pandas/NumPy) data, and pd.concat does
# not accept cuDF frames. Use the cleaned snapshot, because subscriberDF may
# have been modified by earlier cells, and move it to pandas.
subscriber_host = (subscriberDF_start.to_pandas() if hasattr(subscriberDF_start, 'to_pandas')
                   else subscriberDF_start.copy())
# The identifier carries no similarity information.
subscriber_host = subscriber_host.drop(columns=['ID'], errors='ignore')

# Preprocessing
# Replace boolean columns with 0 and 1
subscriberDF_bool = subscriber_host.select_dtypes(include=bool).astype(int)
subscriberDF_non_bool = subscriber_host.select_dtypes(exclude=bool)

# Combine the preprocessed boolean columns and other columns back
subscriberDF_processed = pd.concat(
    [subscriberDF_non_bool, subscriberDF_bool], axis=1)

# Convert categorical columns to strings
cat_features = subscriberDF_processed.select_dtypes(include='category').columns
if len(cat_features):
    subscriberDF_processed[cat_features] = subscriberDF_processed[cat_features].astype(
        str)

# Standardize numerical columns
num_features = subscriberDF_processed.select_dtypes(
    include=['int64', 'float64']).columns
scaler = StandardScaler()
subscriberDF_processed[num_features] = scaler.fit_transform(
    subscriberDF_processed[num_features])

gower_distances = gower.gower_matrix(subscriberDF_processed)

# Check for missing or infinite values in gower_distances
if np.isnan(gower_distances).any():
    print("gower_distances contains missing values.")
if np.isinf(gower_distances).any():
    print("gower_distances contains infinite values.")


In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import gower
from scipy.cluster.hierarchy import dendrogram, fcluster, linkage
from scipy.spatial.distance import squareform
from sklearn.preprocessing import StandardScaler

# cuML's AgglomerativeClustering accepts neither a precomputed distance matrix
# nor a distance_threshold, and exposes no dendrogram coordinates, so the
# hierarchy is built with SciPy on the host.

# Use the cleaned snapshot (subscriberDF may have been modified by earlier
# cells) and move it to pandas, which gower and pd.concat require.
subscriber_host = (subscriberDF_start.to_pandas() if hasattr(subscriberDF_start, 'to_pandas')
                   else subscriberDF_start.copy())
# The identifier carries no similarity information.
subscriber_host = subscriber_host.drop(columns=['ID'], errors='ignore')

# Preprocessing
# Replace boolean columns with 0 and 1
subscriberDF_bool = subscriber_host.select_dtypes(include=bool).astype(int)
subscriberDF_non_bool = subscriber_host.select_dtypes(exclude=bool)

# Combine the preprocessed boolean columns and other columns back
subscriberDF_processed = pd.concat(
    [subscriberDF_non_bool, subscriberDF_bool], axis=1)

# Convert categorical columns to strings
cat_features = subscriberDF_processed.select_dtypes(include='category').columns
if len(cat_features):
    subscriberDF_processed[cat_features] = subscriberDF_processed[cat_features].astype(
        str)

# Standardize numerical columns
num_features = subscriberDF_processed.select_dtypes(
    include=['int64', 'float64']).columns
scaler = StandardScaler()
subscriberDF_processed[num_features] = scaler.fit_transform(
    subscriberDF_processed[num_features])

# Calculate Gower distance matrix; NaN distances are replaced because linkage()
# needs finite values.
gower_distances = np.nan_to_num(
    gower.gower_matrix(subscriberDF_processed), nan=0.0)

# Perform agglomerative clustering.
# linkage() needs the condensed form of the distance matrix. 'average' is used
# because Ward linkage assumes Euclidean distances. 'single' and 'complete'
# are also valid for Gower distances.
Z = linkage(squareform(gower_distances, checks=False), method='average')

# Cut the tree at the same threshold the original cell asked for.
labels = fcluster(Z, t=0.7, criterion='distance')

# Plot the dendrogram
plt.figure(figsize=(10, 6))
dendrogram(Z)
plt.title("Dendrogram for Agglomerative Hierarchical Clustering")
plt.xlabel("Subscriber Index")
plt.ylabel("Distance")
plt.show()


In [None]:
import numpy as np
from scipy.cluster.hierarchy import fcluster, linkage
from scipy.spatial.distance import squareform

# Build the linkage matrix here from the Gower distance matrix computed in an
# earlier cell, so this cell does not rely on a leftover `Z` in the kernel.
# linkage() needs the condensed form and finite values.
Z = linkage(
    squareform(np.nan_to_num(np.asarray(gower_distances), nan=0.0), checks=False),
    method='average')

# Choose a distance threshold based on the dendrogram.
# Gower distances lie in [0, 1], so the cut must be below 1 to split anything;
# adjust this value based on the dendrogram.
distance_threshold = 0.5

# Assign cluster labels to each data point
cluster_labels = fcluster(Z, t=distance_threshold, criterion='distance')

# Add the cluster labels to the original DataFrame
subscriberDF['Cluster'] = cluster_labels

# Analyze the clusters: a mean is only defined for numeric and boolean columns.
summary_cols = [col for col in subscriberDF.columns
                if subscriberDF[col].dtype.kind in 'biuf']
cluster_summary = subscriberDF[summary_cols].groupby('Cluster').mean()
print(cluster_summary)


In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import cudf
import cupy
import matplotlib.pyplot as plt
from cuml.preprocessing import StandardScaler
from cuml.cluster import KMeans

# Load the data from the cleaned snapshot. The snapshot is a copy of a cuDF
# frame, which cudf.from_pandas rejects, so only real pandas input is converted.
if isinstance(subscriberDF_start, pd.DataFrame):
    subscriberDF = cudf.from_pandas(subscriberDF_start)
else:
    subscriberDF = subscriberDF_start.copy()

# Remove the ID column since it's not useful for clustering
subscriberDF = subscriberDF.drop("ID", axis=1)

# Fill missing values with the mode value, column by column
for col in subscriberDF.columns:
    modes = subscriberDF[col].mode()
    if len(modes) > 0:
        subscriberDF[col] = subscriberDF[col].fillna(modes.iloc[0])

# Convert the categorical columns to integer type. After cleanDF these are
# plain string columns, not 'category' dtype, so select them by kind.
categorical_cols = [col for col in subscriberDF.columns
                    if subscriberDF[col].dtype.kind == 'O']
for col in categorical_cols:
    subscriberDF[col] = subscriberDF[col].astype('category').cat.codes.astype('int64')

# Booleans -> 0/1 and datetimes -> integer timestamps, so every column is numeric
for col in subscriberDF.columns:
    if subscriberDF[col].dtype.kind in 'bM':
        subscriberDF[col] = subscriberDF[col].astype('int64')

# Any value still missing (a column with no mode at all) becomes 0
subscriberDF = subscriberDF.astype('float64').fillna(0)

# Standardize the data (float64: raw timestamps are too large for a stable
# float32 variance)
scaler = StandardScaler()
subscriberDF_scaled = scaler.fit_transform(subscriberDF.values)

# Convert the data to a cuDF dataframe
subscriberDF_cudf = cudf.DataFrame(cupy.asarray(subscriberDF_scaled))

# Choose the number of clusters
costs = []
for num_clusters in range(1, 10):
    kmeans = KMeans(n_clusters=num_clusters,
                    init='k-means||', n_init=5, verbose=2)
    kmeans.fit(subscriberDF_cudf)
    costs.append(float(kmeans.inertia_))
plt.plot(range(1, 10), costs)
plt.xlabel("Number of Clusters")
plt.ylabel("Cost")
plt.show()

# Fit the model with the chosen number of clusters
kmeans = KMeans(n_clusters=3, init='k-means||', n_init=5, verbose=2)
clusters = kmeans.fit_predict(subscriberDF_cudf)

# matplotlib needs host arrays
cluster_colors = cupy.asnumpy(cupy.asarray(clusters))

# Visualize the results using scatter plots. The 2x2 grid pairs up the first
# four categorical columns, so it is skipped when fewer are available.
if len(categorical_cols) >= 4:
    fig, axs = plt.subplots(2, 2, figsize=(10, 10))
    for i in range(2):
        for j in range(2):
            x_col = categorical_cols[i*2+j]
            y_col = categorical_cols[(i+1)*2-1-j]
            axs[i][j].scatter(subscriberDF[x_col].to_numpy(),
                              subscriberDF[y_col].to_numpy(), c=cluster_colors)
            axs[i][j].set_xlabel(x_col)
            axs[i][j].set_ylabel(y_col)
    plt.show()
else:
    print("Fewer than four categorical columns; skipping the scatter grid.")
