# User Clustering

In [6]:
import pandas as pd
from datetime import datetime
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import numpy as np

In [2]:
# Reading the CSV file into a DataFrame
user_profile_df = pd.read_csv('./data/user_profile_df.csv')
user_profile_df

Unnamed: 0,user_id,nickname,user_real_nickname,start_date,sum_reply,sum_son_reply,sum_emotion,recent_n_reply,recent_n_deleted,recent_n_emotion,recent_rate_emotion,recent_rate_self_delete,total_reply,heavy
0,NXIM,myls****,mylse1028,2006-08-25,1462,311,11356,9,0,23,56%,0%,1773,0.0
1,2Mh8r,chun****,뭉치,2006-04-21,1686,118,15530,76,22,93,49%,29%,1804,0.0
2,81WDL,hoeu****,쿠쿠쿠리,2016-04-10,3,0,525,0,0,0,0%,0%,3,0.0
3,vHmK,livi****,unseen,2017-08-27,2669,75,25747,35,17,104,85%,49%,2744,0.0
4,2lvoK,inyj****,연짱버디,2006-11-03,1756,110,8982,62,0,1087,91%,0%,1866,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44437,1HfFX,kata****,행복여행,2006-06-27,13723,69,98204,18,0,146,88%,0%,13792,1.0
44438,2vgvO,kswb****,ksw,2007-04-01,7048,417,43742,187,2,563,78%,1%,7465,0.0
44439,EbqI,pota****,pot,2020-01-23,1908,81,5410,3,0,1,100%,0%,1989,0.0
44440,PAXV,kkhb****,소언다문,2019-02-19,1265,63,2941,5,0,20,95%,0%,1328,0.0


In [4]:
# Convert the 'start_date' column to datetime format
user_profile_df['start_date'] = pd.to_datetime(user_profile_df['start_date'])

# Calculate the number of days since the start date
current_date = datetime.now()
user_profile_df['days_since_start'] = (current_date - user_profile_df['start_date']).dt.days

# Display the first few rows of the DataFrame with the new column
user_profile_df.head()

Unnamed: 0,user_id,nickname,user_real_nickname,start_date,sum_reply,sum_son_reply,sum_emotion,recent_n_reply,recent_n_deleted,recent_n_emotion,recent_rate_emotion,recent_rate_self_delete,total_reply,heavy,days_since_start
0,NXIM,myls****,mylse1028,2006-08-25,1462,311,11356,9,0,23,56%,0%,1773,0.0,6241
1,2Mh8r,chun****,뭉치,2006-04-21,1686,118,15530,76,22,93,49%,29%,1804,0.0,6367
2,81WDL,hoeu****,쿠쿠쿠리,2016-04-10,3,0,525,0,0,0,0%,0%,3,0.0,2725
3,vHmK,livi****,unseen,2017-08-27,2669,75,25747,35,17,104,85%,49%,2744,0.0,2221
4,2lvoK,inyj****,연짱버디,2006-11-03,1756,110,8982,62,0,1087,91%,0%,1866,0.0,6171


In [5]:
# Remove commas from the columns and convert them to integers
columns_to_convert = ['sum_reply', 'sum_son_reply', 'recent_n_reply']

for col in columns_to_convert:
    user_profile_df[col] = user_profile_df[col].str.replace(',', '').astype(int)

# Check the data types and first few rows to ensure the columns are converted correctly
user_profile_df.dtypes, user_profile_df.head()

(user_id                            object
 nickname                           object
 user_real_nickname                 object
 start_date                 datetime64[ns]
 sum_reply                           int64
 sum_son_reply                       int64
 sum_emotion                         int64
 recent_n_reply                      int64
 recent_n_deleted                    int64
 recent_n_emotion                   object
 recent_rate_emotion                object
 recent_rate_self_delete            object
 total_reply                         int64
 heavy                             float64
 days_since_start                    int64
 dtype: object,
   user_id  nickname user_real_nickname start_date  sum_reply  sum_son_reply  \
 0    NXIM  myls****          mylse1028 2006-08-25       1462            311   
 1   2Mh8r  chun****                 뭉치 2006-04-21       1686            118   
 2   81WDL  hoeu****               쿠쿠쿠리 2016-04-10          3              0   
 3    vHmK  livi***

In [7]:
# Select the features for clustering
selected_features = ['sum_reply', 'sum_son_reply', 'recent_n_reply', 'days_since_start']
cluster_data = user_profile_df[selected_features]

# Scale the data
scaler = StandardScaler()
scaled_data = scaler.fit_transform(cluster_data)

# Apply K-means clustering (k=2 for "heavy" and "normal")
kmeans = KMeans(n_clusters=2, random_state=0)
kmeans.fit(scaled_data)

# Add the cluster labels to the original DataFrame
user_profile_df['user_type'] = np.where(kmeans.labels_ == 0, 'normal', 'heavy')

# Display first few rows with the new 'user_type' column
user_profile_df.head()

  super()._check_params_vs_input(X, default_n_init=10)


Unnamed: 0,user_id,nickname,user_real_nickname,start_date,sum_reply,sum_son_reply,sum_emotion,recent_n_reply,recent_n_deleted,recent_n_emotion,recent_rate_emotion,recent_rate_self_delete,total_reply,heavy,days_since_start,user_type
0,NXIM,myls****,mylse1028,2006-08-25,1462,311,11356,9,0,23,56%,0%,1773,0.0,6241,heavy
1,2Mh8r,chun****,뭉치,2006-04-21,1686,118,15530,76,22,93,49%,29%,1804,0.0,6367,heavy
2,81WDL,hoeu****,쿠쿠쿠리,2016-04-10,3,0,525,0,0,0,0%,0%,3,0.0,2725,heavy
3,vHmK,livi****,unseen,2017-08-27,2669,75,25747,35,17,104,85%,49%,2744,0.0,2221,heavy
4,2lvoK,inyj****,연짱버디,2006-11-03,1756,110,8982,62,0,1087,91%,0%,1866,0.0,6171,heavy


In [11]:
user_profile_df['user_type'].value_counts(normalize=True)

user_type
heavy     0.838711
normal    0.161289
Name: proportion, dtype: float64

In [None]:
user_profile_df.to_csv('./data/user_profile_with_clusters_v2.csv')