In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Import kmeans and dbscan
from sklearn.cluster import KMeans, DBSCAN

In [None]:
# Load the pre-split clustering features/labels (train first, then test),
# followed by the original smoking dataset that a later cell clusters as
# "untouched" data.
train_x, train_y = (
    pd.read_csv(csv_path)
    for csv_path in ('train/X_train_cluster.csv', 'train/y_train_cluster.csv')
)
test_x, test_y = (
    pd.read_csv(csv_path)
    for csv_path in ('test/X_test_cluster.csv', 'test/y_test_cluster.csv')
)

OG_data = pd.read_csv("smoking.csv")


In [None]:
# Split the original dataset into its label column and its features.
OG_data_y = OG_data['SMK_stat_type_cd']

# Drop the label first so it cannot survive the numeric-only filter, then
# keep just the numeric columns.
OG_data_x = (
    OG_data
    .drop(columns=['SMK_stat_type_cd'])
    .select_dtypes(include=['number'])
)

# Display the resulting feature frame.
OG_data_x

In [None]:
# Summary statistics of the training features exactly as loaded from CSV
# (before any outlier filtering or scaling).
train_x.describe()

In [None]:
# Per-column count of raw training values strictly greater than 0.0000022964.
# NOTE(review): presumably used to pick the PC2 outlier cutoff — confirm.
(train_x > 0.0000022964).sum()

In [None]:
# Outlier filter on the first three components. Features and labels are
# joined first so that both are filtered by the same row mask.
train_x_and_y = pd.concat([train_x, train_y], axis=1)

# A row is kept only if PC1 <= 0.1, PC2 <= 0.0000022965 and PC3 <= 0.1.
# The cutoffs are empirical; this combination was marked "best for now"
# among the alternatives that were tried.
_inlier_mask = (
    train_x_and_y['PC1'].le(0.1)
    & train_x_and_y['PC2'].le(0.0000022965)
    & train_x_and_y['PC3'].le(0.1)
)
_inlier_rows = train_x_and_y.loc[_inlier_mask]

# Split the surviving rows back into labels and features.
train_y_remove_outlier = _inlier_rows['SMK_stat_type_cd']
train_x_remove_outlier = _inlier_rows.drop(columns=['SMK_stat_type_cd'])

In [None]:
# (rows, columns) of the training features before outlier removal.
train_x.shape

In [None]:
# (rows, columns) after outlier removal; compare with train_x.shape to see
# how many rows the filter discarded.
train_x_remove_outlier.shape

In [None]:
# Scale the data between 0 and 1
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

# Fit the scaler on the outlier-free training features only and wrap the
# result back into a DataFrame (same columns and index as the input).
train_x_full = pd.DataFrame(scaler.fit_transform(train_x_remove_outlier),
                            columns=train_x_remove_outlier.columns,
                            index=train_x_remove_outlier.index)

# The test set is transformed with the ranges learned from the training data.
test_x_scaled = pd.DataFrame(scaler.transform(test_x),
                             columns=test_x.columns,
                             index=test_x.index)

# Labels for every row of train_x_full, looked up by index in the joined
# frame. They are deliberately NOT read from train_y_remove_outlier: that
# name is overwritten with a 100000-row tail below, so reading it here would
# make a second run of this cell leave train_y_full shorter than
# train_x_full.
# Assumes the row index is unique (default index from read_csv) — TODO confirm.
train_y_full = train_x_and_y.loc[train_x_full.index, 'SMK_stat_type_cd']

# Working subset: the last 100000 scaled rows and their labels.
train_x_scaled = train_x_full[-100000:]
train_y_remove_outlier = train_y_full[-100000:]

In [None]:
# Summary statistics of the raw training features again, for comparison with
# the scaled frames described in the following cells.
train_x.describe()

In [None]:
# Summary statistics of the complete outlier-filtered, min-max scaled
# training features.
train_x_full.describe()

In [None]:
# Summary statistics of the scaled working subset (last 100000 rows).
train_x_scaled.describe()

In [None]:
# Histogram of each column of the scaled working subset.
# bins=5000 gives very narrow bins over the scaled value range.
train_x_scaled.hist(figsize=(10, 10), bins=5000)
plt.tight_layout()
plt.show()

In [None]:
# (rows, columns) of the unscaled test features.
test_x.shape

In [None]:
# For every column of the scaled working subset, count the values that are
# strictly greater than 0.5.
(train_x_scaled > 0.5).sum()

In [None]:
# K-means with three clusters and a fixed seed.
# One estimator object is refitted three times below, so once this cell has
# run `kmeans` holds the model fitted on OG_data_x, not on the training data.
kmeans = KMeans(n_clusters=3, random_state=42)

# 1) Unscaled training features; the same fitted model also labels the
#    unscaled test features.
y_pred_train = kmeans.fit(train_x).predict(train_x)
y_pred_test = kmeans.predict(test_x)

# 2) Outlier-filtered, min-max scaled training features.
y_pred_train_scaled_kmeans = kmeans.fit(train_x_full).predict(train_x_full)

# 3) Numeric columns of the original, untouched dataset.
y_pred_train_og_kmean = kmeans.fit_predict(OG_data_x)

In [None]:
# Gaussian Mixture Model
from sklearn.mixture import GaussianMixture

# Three components, fitted on the full scaled training set.
# The seed is fixed so that the component assignments (and the Adjusted Rand
# Score computed from them later) are the same on every run; 42 matches the
# seed already used for KMeans in this notebook.
gmm = GaussianMixture(n_components=3, random_state=42)
y_pred_train_gmm = gmm.fit_predict(train_x_full)

In [None]:
# Fuzzy C-means
from fcmeans import FCM

# FCM is handed the underlying array of the full scaled training set rather
# than the DataFrame itself.
# NOTE(review): no seed is passed here — confirm whether FCM's default
# initialisation is deterministic.
_fcm_features = train_x_full.values

fcm = FCM(n_clusters=3)
fcm.fit(_fcm_features)
y_pred_train_fcm = fcm.predict(_fcm_features)


In [None]:
# Evaluate the K-means model that was fitted on the unscaled training
# features, against the true labels of the train and the test split.
from sklearn.metrics import adjusted_rand_score

# Each line carries its own label so the two scores can be told apart in the
# cell output (both were previously printed as "Adjusted Rand Score:").
rand_score = adjusted_rand_score(train_y.values.ravel(), y_pred_train)
print("K-means (unscaled, train) Adjusted Rand Score:", rand_score)

rand_score = adjusted_rand_score(test_y.values.ravel(), y_pred_test)
print("K-means (unscaled, test) Adjusted Rand Score:", rand_score)

In [None]:
# K-means evaluation on the full scaled training set: agreement between the
# true labels and the cluster assignments.
rand_score = adjusted_rand_score(
    labels_true=train_y_full.values.ravel(),
    labels_pred=y_pred_train_scaled_kmeans,
)
print("K-means Adjusted Rand Score:", rand_score)

In [None]:
# Gaussian Mixture Model evaluation on the full scaled training set:
# agreement between the true labels and the component assignments.
rand_score = adjusted_rand_score(
    labels_true=train_y_full.values.ravel(),
    labels_pred=y_pred_train_gmm,
)
print("Gaussian Mixture Model Adjusted Rand Score:", rand_score)


In [None]:
# Fuzzy C-means evaluation on the full scaled training set: agreement
# between the true labels and the predicted cluster of each row.
rand_score = adjusted_rand_score(
    labels_true=train_y_full.values.ravel(),
    labels_pred=y_pred_train_fcm,
)
print("Fuzzy C-means Adjusted Rand Score:", rand_score)

In [None]:
# K-means on the original (untouched) dataset, scored against its own
# SMK_stat_type_cd labels.
rand_score_og = adjusted_rand_score(
    labels_true=OG_data_y.values.ravel(),
    labels_pred=y_pred_train_og_kmean,
)
print("Adjusted Rand Score:", rand_score_og)

In [None]:
# Cluster sizes for K-means on the scaled training set:
# {cluster label: number of rows assigned to it}.
unique_values, counts = np.unique(
    y_pred_train_scaled_kmeans,
    return_counts=True,
)
value_counts = {label: size for label, size in zip(unique_values, counts)}
print(value_counts)