In [38]:
import pandas as pd
from datetime import timedelta
#Data processing
from sklearn import preprocessing

In [39]:
df = pd.read_parquet('../../Sonar/seatunnel_all_information.parquet')

In [40]:
df['total_time']

0       3 days 01:27:48
1       3 days 04:15:10
2       0 days 05:24:03
3       4 days 00:22:26
4      12 days 00:39:27
             ...       
1063    0 days 00:43:10
1064    0 days 19:01:39
1065    0 days 00:16:22
1066    0 days 00:33:35
1067    0 days 00:09:14
Name: total_time, Length: 1068, dtype: timedelta64[ns]

In [41]:
df['time'] = df['total_time'].astype('int64')

In [42]:
df['time']

0        264468000000000
1        274510000000000
2         19443000000000
3        346946000000000
4       1039167000000000
              ...       
1063       2590000000000
1064      68499000000000
1065        982000000000
1066       2015000000000
1067        554000000000
Name: time, Length: 1068, dtype: int64

In [43]:
X = df.loc[:, ~df.columns.isin(['begin_sha', 'end_sha', 'begin_time', 'end_time', 'total_time'])]

In [44]:
cl = list(df['total_time'])


In [45]:
scaler = preprocessing.StandardScaler()
df_scaler = scaler.fit_transform(X)
df_scaler_col = pd.DataFrame(df_scaler)
df_scaler = df_scaler.T

In [46]:
df_scaler.shape

(15, 1068)

In [47]:
#Silhouette analysis
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

In [48]:
number_of_cluster = []

for i in range(2, len(df_scaler)):
    km = KMeans(n_clusters = i)
    km.fit(df_scaler)
    sil_avg = silhouette_score(df_scaler, km.labels_).round(4)
    number_of_cluster.append([sil_avg , i])

  km.fit(df_scaler)
  km.fit(df_scaler)


In [49]:
print(f'Optimum number of cluster: {max(number_of_cluster)[1]}')

Optimum number of cluster: 3


In [50]:
# Perform KMeans clustering
num_clusters = max(number_of_cluster)[1]
kmeans = KMeans(n_clusters=num_clusters)

In [51]:
cluster_labels = kmeans.fit_predict(X)



In [52]:
df['cluster'] = cluster_labels

In [53]:
df.head(3)

Unnamed: 0,begin_sha,end_sha,commits,additions,deletions,changed_files,total_time,begin_time,end_time,begin_Dispensables,...,begin_Change Preventers,begin_Couplers,begin_Object-Orientation Abusers,end_Dispensables,end_Bloaters,end_Change Preventers,end_Couplers,end_Object-Orientation Abusers,time,cluster
0,84be0f9fd057b1680d001de38802ce1c28d79f04,9a2efa51c7180b369ecfea17efaae813d9d0bfc5,1,10,4,3,3 days 01:27:48,2023-06-02T07:22:03Z,2023-06-05T08:49:51Z,500.0,...,262.0,304.0,304.0,500.0,261.0,261.0,304.0,693.0,264468000000000,0
1,84be0f9fd057b1680d001de38802ce1c28d79f04,5e03d22d6cb5bfdaf941b7749e0a313bd6ab2f86,1,2,1,1,3 days 04:15:10,2023-06-02T05:07:40Z,2023-06-05T09:22:50Z,500.0,...,262.0,304.0,304.0,501.0,262.0,262.0,304.0,693.0,274510000000000,0
2,84be0f9fd057b1680d001de38802ce1c28d79f04,e0d8519a9d7b6dfac29c1daf4199b6fe87781d0a,1,36,67,9,0 days 05:24:03,2023-06-02T03:16:04Z,2023-06-02T08:40:07Z,500.0,...,262.0,304.0,304.0,499.0,261.0,261.0,304.0,693.0,19443000000000,0


In [54]:
df.shape

(1068, 21)

In [55]:
df['timedelta'] = pd.to_timedelta(df['time'])

In [56]:
df['formatted_duration'] = df['timedelta'].apply(lambda x: f"{x.days} days {x.seconds // 3600:02d}:{(x.seconds // 60) % 60:02d}:{x.seconds % 60:02d}")

In [63]:
df['timedelta'][:int(0.7 * len(df))].median()

Timedelta('1 days 00:17:13')

In [65]:
df['timedelta'].min()

Timedelta('0 days 00:00:05')

In [64]:
df['timedelta'].max()

Timedelta('110 days 16:44:58')

In [57]:
cluster_0 = df[df['cluster'] == 0]
print(cluster_0['timedelta'].min())
print(cluster_0['timedelta'].max())
cluster_0.shape

0 days 00:00:05
7 days 09:20:19


(931, 23)

In [69]:
cluster_0[['total_time','begin_Bloaters', 'end_Bloaters']]

Unnamed: 0,total_time,begin_Bloaters,end_Bloaters
0,3 days 01:27:48,262.0,261.0
1,3 days 04:15:10,262.0,262.0
2,0 days 05:24:03,262.0,261.0
3,4 days 00:22:26,262.0,261.0
5,4 days 01:57:26,262.0,261.0
...,...,...,...
1063,0 days 00:43:10,14.0,15.0
1064,0 days 19:01:39,12.0,14.0
1065,0 days 00:16:22,12.0,14.0
1066,0 days 00:33:35,3.0,14.0


In [71]:
cluster_0[['total_time', 'begin_Couplers', 'end_Couplers']]

Unnamed: 0,total_time,begin_Couplers,end_Couplers
0,3 days 01:27:48,304.0,304.0
1,3 days 04:15:10,304.0,304.0
2,0 days 05:24:03,304.0,304.0
3,4 days 00:22:26,304.0,304.0
5,4 days 01:57:26,304.0,304.0
...,...,...,...
1063,0 days 00:43:10,7.0,7.0
1064,0 days 19:01:39,8.0,7.0
1065,0 days 00:16:22,8.0,7.0
1066,0 days 00:33:35,3.0,7.0


In [58]:
cluster_1 = df[df['cluster'] == 1]
print(cluster_1['timedelta'].min())
print(cluster_1['timedelta'].max())
cluster_1.shape

30 days 19:11:30
110 days 16:44:58


(24, 23)

In [59]:
cluster_2 = df[df['cluster'] == 2]
print(cluster_2['timedelta'].min())
print(cluster_2['timedelta'].max())
cluster_2.shape

7 days 12:04:52
29 days 12:33:22


(113, 23)