# Clustering

#### Preparing data for clustering

In [None]:
import csv
import pandas as pd
import numpy as np
import datetime
from datetime import timedelta
import os
import itertools
from IPython.display import display, HTML

In [None]:
def save_data_to_file(folder_name, file_name, data):
    """Write ``data`` to ``folder_name/file_name`` as CSV without the index.

    The target folder is created on demand, including any missing parent
    folders. A message is printed only when the folder did not exist before
    the call; an already existing folder is reused silently.

    Parameters
    ----------
    folder_name : str
        Folder that receives the file.
    file_name : str
        Name of the CSV file inside ``folder_name``.
    data : pandas.DataFrame
        Object exposing ``to_csv``; it is written with ``index=False``.
    """
    folder_is_new = not os.path.isdir(folder_name)
    # makedirs (unlike mkdir) also creates missing parents, and exist_ok=True
    # removes the need for a dummy except-branch when the folder is present.
    os.makedirs(folder_name, exist_ok=True)
    if folder_is_new:
        print("Directory " , folder_name ,  " Created ")
    data.to_csv(os.path.join(folder_name, file_name), index=False)

In [None]:
def save_data_to_file_with_index(folder_name, file_name, data):
    """Write ``data`` to ``folder_name/file_name`` as CSV, keeping the index.

    The target folder is created on demand, including any missing parent
    folders. A message is printed only when the folder did not exist before
    the call; an already existing folder is reused silently.

    Parameters
    ----------
    folder_name : str
        Folder that receives the file.
    file_name : str
        Name of the CSV file inside ``folder_name``.
    data : pandas.DataFrame
        Object exposing ``to_csv``; it is written with pandas' default
        settings, so the index becomes the leading column(s).
    """
    folder_is_new = not os.path.isdir(folder_name)
    # makedirs (unlike mkdir) also creates missing parents, and exist_ok=True
    # removes the need for a dummy except-branch when the folder is present.
    os.makedirs(folder_name, exist_ok=True)
    if folder_is_new:
        print("Directory " , folder_name ,  " Created ")
    data.to_csv(os.path.join(folder_name, file_name))

In [None]:
# (folder, column) pairs: for every time slot the column is read from
# "<folder>/feature_<slot date>.csv". List order determines the column order
# of the merged per-slot table.
# NOTE(review): the "nnumber_of_words_..." spelling below is kept verbatim; it
# presumably mirrors the header of the source CSV -- confirm before correcting.
features = [
    ("FrequencyOfUserPostsWithoutZeros",
     "std_post_frequency"),
    ("FrequencyOfUserPostsWithoutZeros",
     "q3_post_frequency"),
    ("NumberOfReceivedResponsesToUsersPostsWithoutZeros",
     "number_of_received_responses_to_users_posts_std"),
    ("NumberOfReceivedResponsesToUsersPostsWithoutZeros",
     "number_of_received_responses_to_users_posts_max"),
    ("NumberOfReceivedResponsesUnderUsersCommentsWithoutZeros",
     "number_of_received_responses_under_users_comments_q3"),
    ("NumberOfReceivedResponsesUnderUsersCommentsWithoutZeros",
     "number_of_received_responses_under_users_comments_max"),
    ("NumberOfWordsInOwnResponsesOfUsersPostsWithoutZeros",
     "number_of_words_in_own_responses_of_users_posts_q3"),
    ("NumberOfWordsInResponsesOfUsersPostsWithoutZeros",
     "nnumber_of_words_in_responses_of_users_posts_median"),
    ("SentimentOfUsersPostsWithoutZeros",
     "posts_sentiment_min"),
    ("FrequencyOfUserCommentsWithoutZeros",
     "mean_comments_frequency"),
    ("NumberOfWordsInUsersCommentsWithoutZeros",
     "number_of_words_in_users_comments_avg"),
    ("NumberOfWordsInUsersPostsWithoutZeros",
     "number_of_words_in_users_posts_q3"),
    ("NumberOfReceivedResponsesToUsersPostsWithoutZeros",
     "number_of_received_responses_to_users_posts_q3"),
    ("NumberOfCommentsWrittenByUserUnderHisOwnPostsWithoutZeros",
     "number_of_comments_written_by_user_under_his_own_posts_q3"),
]

In [None]:
# Date range of the per-slot feature files: first slot date and exclusive upper bound.
start_date_comments = datetime.date(year=2008, month=12, day=9)
end_date_comments = datetime.date(year=2013, month=11, day=16)

In [None]:
# Preview a single feature file (comment frequency, first slot) to check its layout.
df1 = pd.read_csv(
    "FrequencyOfUserCommentsWithoutZeros" + "/feature_" + str(start_date_comments) + ".csv"
)
df1 = df1[["user_id", "mean_comments_frequency"]]
df1

In [None]:
def merge_stats(start_date):
    """Combine all selected feature columns for one slot into a single table.

    For every ``(folder, column)`` pair in the module-level ``features`` list
    the file ``<folder>/feature_<start_date>.csv`` is read and reduced to
    ``user_id`` plus that column. The resulting frames are outer-merged on
    ``user_id`` in list order, and values missing after the merge are
    replaced with 0.

    Parameters
    ----------
    start_date : datetime.date
        Slot date; ``str(start_date)`` is embedded in the file names.

    Returns
    -------
    pandas.DataFrame
        One row per ``user_id`` found in any of the files.
    """
    per_feature_frames = [
        pd.read_csv(folder + "/feature_" + str(start_date) + ".csv")[['user_id', column]]
        for folder, column in features
    ]

    combined = per_feature_frames[0]
    for frame in per_feature_frames[1:]:
        combined = pd.merge(combined, frame, how='outer', on=['user_id'])
    return combined.fillna(0)

In [None]:
def create_cluster_data(start_date, end_date):
    """Build and store the merged feature table for every 14-day slot.

    Starting at ``start_date`` and stepping by 14 days while the slot date is
    strictly before ``end_date``, the merged table from ``merge_stats`` is
    written to ``Cluster_Data/cluster<slot date>.csv``.
    """
    step = timedelta(days=14)
    slot = start_date
    while slot < end_date:
        target_name = "cluster" + str(slot) + ".csv"
        save_data_to_file("Cluster_Data", target_name, merge_stats(slot))
        slot = slot + step

In [None]:
#create_cluster_data(start_date_comments, end_date_comments)

#### Clustering date range

In [None]:
# Date range used for clustering: first slot date and exclusive upper bound.
# Author's notes, presumably "<row count> <slot date>" for the two boundary
# slots -- TODO confirm:
#   10170 2009-12-08
#   2733  2013-10-22
start_date_clustering = datetime.date(year=2009, month=12, day=8)
end_date_clustering = datetime.date(year=2013, month=10, day=22)

## Choose best k

#### Generate scores for different k

In [None]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn import metrics

In [None]:
def benchmark_custering(X):
    """Score k-means clusterings of ``X`` for k = 3..10.

    For each k, ``KMeans`` is fitted five times and every fit is scored with
    the Calinski-Harabasz index; the mean and the standard deviation
    (``np.std``) of the five scores are recorded.

    Parameters
    ----------
    X : array-like
        Feature matrix passed unchanged to ``KMeans.fit`` and to the scorer.

    Returns
    -------
    pandas.DataFrame
        Columns ``k``, ``score_mean`` and ``scored_std``, one row per k,
        ordered from the largest k down to the smallest.
    """
    # 'scored_std' (sic) is the column name other cells read; keep it as is.
    columns = ['k', 'score_mean', 'scored_std']
    records = []
    for k in range(3, 11):
        scores = [
            metrics.calinski_harabasz_score(X, KMeans(n_clusters=k).fit(X).labels_)
            for _ in range(5)
        ]
        records.append([k, np.mean(scores), np.std(scores)])
    # Build the frame once instead of concatenating onto an empty frame on
    # every iteration. The earlier implementation prepended each new row, so
    # rows ended up in descending-k order; reverse to keep that order.
    return pd.DataFrame(records[::-1], columns=columns)

In [None]:
def generate_benchmark_clustering_data(start_date, end_date):
    """Benchmark k-means for every 14-day slot and store the scores.

    For each slot date from ``start_date`` (inclusive) up to ``end_date``
    (exclusive), ``Cluster_Data/cluster<slot date>.csv`` is loaded, the
    ``user_id`` column is dropped, the remaining columns are min-max scaled
    and passed to ``benchmark_custering``. The scores are written to
    ``Cluster_Benchmarks/cluster_benchmark<slot date>.csv``.
    """
    slot = start_date
    while slot < end_date:
        slot_table = pd.read_csv("Cluster_Data" + "/cluster" + str(slot) + ".csv")
        scaled = MinMaxScaler().fit_transform(slot_table.drop(columns=["user_id"]))
        save_data_to_file(
            "Cluster_Benchmarks",
            "cluster_benchmark" + str(slot) + ".csv",
            benchmark_custering(scaled),
        )
        print("Results for {} saved".format(slot))
        slot += timedelta(days=14)

In [None]:
# Score every slot in the clustering date range (refits KMeans many times per slot).
generate_benchmark_clustering_data(
    start_date=start_date_clustering,
    end_date=end_date_clustering,
)

#### Generate best k for given slot

In [None]:
def better(best, row):
    """Return whichever of two benchmark rows wins a two-out-of-three vote.

    ``row`` is compared with ``best`` on three quantities: the mean score,
    mean plus standard deviation, and mean minus standard deviation. ``row``
    is returned when it is strictly greater on at least two of them;
    otherwise ``best`` is returned.

    Both arguments are indexed with the keys ``"score_mean"`` and
    ``"scored_std"``.
    """
    best_mean, best_std = best["score_mean"], best['scored_std']
    row_mean, row_std = row["score_mean"], row['scored_std']

    comparisons = (
        best_mean < row_mean,
        best_mean + best_std < row_mean + row_std,
        best_mean - best_std < row_mean - row_std,
    )
    votes_for_row = sum(1 for outcome in comparisons if outcome)
    return row if votes_for_row >= 2 else best

In [None]:
def choose_best_k(df):
    """Pick the ``k`` of the benchmark row that survives pairwise comparison.

    The first row is the initial champion; every row of ``df`` (in order) is
    then compared against the current champion with ``better``.

    Parameters
    ----------
    df : pandas.DataFrame
        Benchmark table; must contain a ``k`` column plus the columns read
        by ``better``.

    Returns
    -------
    The ``k`` value of the final champion row.
    """
    champion = df.iloc[0]
    for _, challenger in df.iterrows():
        champion = better(champion, challenger)
    return champion["k"]

In [None]:
def generate_best_k_for_slots(start_date, end_date):
    """Select the best k for every 14-day slot and store the result.

    For each slot date from ``start_date`` (inclusive) up to ``end_date``
    (exclusive), ``Cluster_Benchmarks/cluster_benchmark<slot date>.csv`` is
    read and ``choose_best_k`` picks the k for that slot. All
    ``(date, k)`` pairs are written to ``Cluster_Best_k/cluster_best_k.csv``
    with the most recent slot first.
    """
    columns = ['date', 'k']
    records = []
    while start_date < end_date:
        benchmark = pd.read_csv("Cluster_Benchmarks" + "/cluster_benchmark" + str(start_date) + ".csv")
        records.append([start_date, choose_best_k(benchmark)])
        start_date += timedelta(days=14)
    # Build the frame once instead of concatenating onto an empty frame on
    # every iteration. The earlier implementation prepended each new row, so
    # the file listed the newest slot first; reverse to keep that order.
    best_k_dataframe = pd.DataFrame(records[::-1], columns=columns)
    save_data_to_file("Cluster_Best_k", "cluster_best_k.csv", best_k_dataframe)
    print("Results for best_k saved")


In [None]:
# Pick the best k for every slot in the clustering date range.
generate_best_k_for_slots(
    start_date=start_date_clustering,
    end_date=end_date_clustering,
)

#### Generate labeled data with best K number of clusters for given slot

In [None]:
# Load the per-slot best k explicitly: no cell defines a global `df`, so the
# loop previously depended on leftover kernel state. The `date` and `k`
# columns read below are the ones generate_best_k_for_slots writes to this file.
best_k_per_slot = pd.read_csv("Cluster_Best_k" + "/cluster_best_k.csv")

for index, row in best_k_per_slot.iterrows():
    # `date` comes back from the CSV as text, so it can be spliced into file names.
    data_to_cluster = pd.read_csv("Cluster_Data" + "/cluster" + row["date"] + ".csv")
    data_without_id = data_to_cluster.drop(columns =["user_id"])
    X = MinMaxScaler().fit_transform(data_without_id)
    # `k` is cast back to int before being handed to KMeans.
    kmeans = KMeans(n_clusters=int(row['k'])).fit(X)

    # One label per input row, appended next to the unscaled features.
    labels_df = pd.DataFrame(kmeans.labels_, columns=["label"])
    labeled_users = pd.concat([data_to_cluster, labels_df], axis=1)
    # Centres are expressed in the min-max scaled feature space.
    cluster_df = pd.DataFrame(kmeans.cluster_centers_, columns=data_without_id.columns.values)

    save_data_to_file("Labeled_users", "labeled_users"+ row["date"] +".csv", labeled_users)
    save_data_to_file("Cluster_centers", "cluster_centers"+ row["date"] +".csv", cluster_df)

#### Generate statistics for clusters

In [None]:
# df = pd.read_csv("Labeled_users" + "/labeled_users" + str(start_date_clustering) + ".csv").drop(columns =["user_id"])
# features = df.drop(columns =["label"]).columns.values
# aggreagates = { feat : stats for feat in features }
# features = df.drop(columns =["label"]).columns.values
# aggreagates = { feat : stats for feat in features }


In [None]:
def generate_custer_statistics(start_date, end_date):
    """Compute per-cluster summary statistics for every 14-day slot.

    For each slot date from ``start_date`` (inclusive) up to ``end_date``
    (exclusive), ``Labeled_users/labeled_users<slot date>.csv`` is read, the
    ``user_id`` column is dropped and the remaining columns are grouped by
    ``label``. Mean, standard deviation, minimum and maximum are computed for
    every column of that slot's file. The transposed table (rows:
    feature/statistic, columns: cluster label) is written with its index to
    ``ClustersStatistics/clusters_stats<slot date>.csv``.

    Statistic rows are labelled ``mean``, ``stddev``, ``min`` and ``max``.
    """
    # String names select pandas' own group-wise reducers and give stable
    # labels ('min'/'max' rather than 'amin'/'amax') across pandas versions.
    aggregations = ["mean", "std", "min", "max"]
    # Only 'std' still needs renaming to reach the intended labels.
    statistic_labels = {"std": "stddev"}

    while start_date < end_date:
        labeled = pd.read_csv("Labeled_users" + "/labeled_users" + str(start_date) + ".csv").drop(columns =["user_id"])

        # The statistic names live on the second level of the column
        # MultiIndex; a bare .rename(mapping) would target the row index
        # (the cluster labels) and leave the statistic names untouched.
        cluster_stats = (
            labeled.groupby(['label'])
            .agg(aggregations)
            .rename(columns=statistic_labels, level=1)
        )

        save_data_to_file_with_index("ClustersStatistics", "clusters_stats"+ str(start_date) +".csv", cluster_stats.T)
        start_date += timedelta(days=14)

In [None]:
# Write the per-cluster statistics for every slot in the clustering date range.
generate_custer_statistics(
    start_date=start_date_clustering,
    end_date=end_date_clustering,
)

In [None]:
def display_stats(start_date, end_date):
    """Show the stored cluster statistics of every 14-day slot.

    For each slot date from ``start_date`` (inclusive) up to ``end_date``
    (exclusive), a heading line is printed and
    ``ClustersStatistics/clusters_stats<slot date>.csv`` is rendered with
    IPython's ``display``.
    """
    slot = start_date
    while slot < end_date:
        print("Cluster for {}".format(slot))
        slot_stats = pd.read_csv("ClustersStatistics" + "/clusters_stats" + str(slot) + ".csv")
        display(slot_stats)
        slot += timedelta(days=14)

In [None]:
# Render the statistics table of every slot in the clustering date range.
display_stats(
    start_date=start_date_clustering,
    end_date=end_date_clustering,
)

In [None]:
Wnioski:
    
    Cluster for 2009-12-08:
        
    W pierwszym miesiącu mamy 3 klastry; wyraźnie widać różnice w zachowaniu.
    Klaster 0 to typ osoby, która postuje dość często; jej posty są raczej pozytywne,
    ponieważ minimalna wartość sentymentu średnio przyjmuje wartości neutralne.
    Dostaje dużo odpowiedzi na swoje posty.
    
    
    Dwa pozostałe klastry to komentatorzy. Różnią się tym, że:
        1: pisze komentarze często, jednak nie są one bardzo długie,
        2: pisze komentarze rzadziej, jednak są one dłuższe.
            
                                                0           1          2
            q3_post_frequency	amax	72.000000	0.000000	0.000000
            number_of_received_responses_to_users_posts_max	mean	14.815977	0.000000	0.000000
            number_of_received_responses_under_users_comme...	mean	0.000000	1.011677	0.416393
            number_of_words_in_own_responses_of_users_post...	mean	0.000000	0.000000	0.000000
            number_of_words_in_users_comments_avg	mean	29.459760	32.454975	139.903049
            mean_comments_frequency	mean	1.656048	14.616607	2.638158
            
        
    Cluster for 2009-12-22:
        
        W tym miesiącu mamy dwie role postujące (0 i 1) oraz jedną komentującą (2).
        Rola 1 częściej otrzymuje odpowiedzi na swoje posty w porównaniu z rolą 0.
        Rola 1 nie udziela się w dyskusji pod swoimi postami; rola 0 bierze w niej udział, jednak niewielki.
        Rola 2 pisze jedynie komentarze; są one długie, ale nie pisze ich bardzo często.
        Rola 0 pisze również komentarze, dość często.
        
                                    0           1           2
            
        q3_post_frequency	mean	0.018265	0.171450	0.000000
        number_of_received_responses_to_users_posts_max	mean	0.130266	10.817837	0.000000
        number_of_received_responses_under_users_comments	mean	1.011674	0.000000	0.308072
        number_of_words_in_own_responses_of_users_post	mean	0.020763	0.000000	0.000000
        number_of_words_in_users_comments_avg	mean	36.079365	31.883340	150.996503
        mean_comments_frequency	mean	7.327403	1.375606	1.576655
        
        
Potencjalnie rola 2 z 2009-12-22 i rola 2 z 2009-12-08 to ta sama rola.
        


In [None]:
# Repeats the statistics display from the earlier cell, after the written conclusions.
display_stats(
    start_date=start_date_clustering,
    end_date=end_date_clustering,
)