In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math

In [None]:
import datetime
from datetime import timedelta 

# Wnioski:

## 1. Porównanie kwantyli (0.1, 0.25, 0.50, 0.75, 0.90)

#### Z porównania kwantyli wynika, że największe odchylenie standardowe dla większości cech ma kwantyl rzędu 0.90
#### {'_10.0': 2, '_25.0': 1, '_50.0': 0, '_75.0': 1, '_90.0': 36} - w przypadku 36 cech miał największe odchylenie standardowe
#### Z wykresów poniżej również widać, że jest on najbardziej zróżnicowany
#### Nanosząc wartości stddev cech poszczególnych kwantyli, widać również, że 0.90 ma najwyższe wyniki

## 2. Korelacja cech

#### W przypadku sprawdzenia korelacji między cechami z wykorzystaniem kwantyla rzędu 0.90 otrzymałem 14 cech



DLA 0.90:


| Moje                                                         | Klaudii                              |
|--------------------------------------------------------------|--------------------------------------|
|                'std_post_frequency'                          | posts freq stddev
|                'q3_post_frequency',                          | posts freq q3
|          'number_of_received_responses_to_users_posts_std',  | received posts responses stddev
|'number_of_received_responses_to_users_posts_max',            | received posts responses max
| 'number_of_received_responses_under_users_comments_q3',      | received comment responses q3
| 'number_of_received_responses_under_users_comments_max',     | received comment responses max
| 'number_of_words_in_own_responses_of_users_posts_q3',        | own posts responses word count q3
| 'nnumber_of_words_in_responses_of_users_posts_median',       | written comments word count median
|--------------------------------------------------------------|---------------------------------------
|                   Różnica                                    |              Różnica
|--------------------------------------------------------------|---------------------------------------
| 'posts_sentiment_min'                                        | own posts responses q3
| 'mean_comments_frequency',                                   | written comments number
|'number_of_words_in_users_comments_avg',                      | posts number
| 'number_of_words_in_users_posts_q3',                         | received posts responses avg
|'number_of_received_responses_to_users_posts_q3',             | comments freq q3
| 'number_of_comments_written_by_user_under_his_own_posts_q3', | posts activity time
|                                                              | posts word count avg
|                                                              | received posts responses word count q3


DLA 0.1:

 
 
 | Moje                                                        | Klaudii                                      |
|--------------------------------------------------------------|----------------------------------------------|
|                'std_post_frequency'                          | posts freq stddev
|                'q3_post_frequency',                          | posts freq q3
|          'number_of_received_responses_to_users_posts_std',  | received posts responses stddev
|'number_of_received_responses_to_users_posts_max',            | received posts responses max
| 'number_of_words_in_own_responses_of_users_posts_q3',        | own posts responses word count q3
|               'post_activity'                                |      posts activity time
|               'number_of_posts'                              |      posts number
|  'number_of_comments_written_by_user_under_his_own_posts_q3' |      own posts responses q3
|   'number_of_words_in_responses_of_users_posts_q3'           |      received posts responses word count q3
| 'number_of_received_responses_to_users_posts_q3'             |    received comment responses q3
|--------------------------------------------------------------|---------------------------------------
|                   Różnica                                    |              Różnica
|--------------------------------------------------------------|---------------------------------------
|   received_comments_sentiment_q3                             | written comments word count median
| posts_sentiment_min                                          | written comments number
| 'mean_comments_frequency',                                   | received posts responses avg
|'number_of_words_in_users_posts_q3',                          | comments freq q3
| 'number_of_words_in_responses_of_users_posts_std',           | written comments word count median
|      posts_sentiment_median                                  | posts word count avg
|               comments_sentiment_median              | 
|                 received_comments_sentiment_stddev     | 
|                                | 


# CZĘŚĆ Z KODEM (Macierz korelacji oraz wykresy)

### Sprawdzenie korelacji między cechami i wybranie najlepszych cech przy użyciu algorytmu Klaudii

In [None]:
# Half-open date range [start, end) that the aggregation helpers walk through.
start_date_comments = datetime.date(year=2008, month=12, day=9)
end_date_comments = datetime.date(year=2013, month=11, day=16)

In [None]:
%%capture capt

def count_quantile(file_names, columns, quantile, start_date, end_date):
    """Compute one quantile per feature for every 14-day step in a date range.

    For each date ``d`` in ``start_date, start_date + 14 days, ...`` that is
    strictly before ``end_date``, the file ``<directory>/feature_<d>.csv`` is
    read and the requested quantile of the given column is taken.

    Args:
        file_names: Sequence of ``(directory, column_name)`` pairs, one per
            feature.
        columns: Output column base names, in the same order as
            ``file_names``.
        quantile: Quantile passed to ``Series.quantile`` (a single number).
        start_date: First date to process (inclusive).
        end_date: Date at which processing stops (exclusive).

    Returns:
        DataFrame with one row per processed date and one column per feature,
        named ``<column>_<quantile * 100>`` (e.g. ``post_activity_90.0``).
        A NaN quantile is stored as ``0.0``.
    """
    all_data = []
    while start_date < end_date:
        # Several features live in the same directory; read each CSV only
        # once per date instead of once per feature.
        frames = {}
        quantiles_results = []
        for file, feature_name in file_names:
            if file not in frames:
                frames[file] = pd.read_csv(
                    file + "/feature_" + str(start_date) + ".csv"
                )
            quant = frames[file][feature_name].quantile(quantile)
            quantiles_results.append(0.0 if math.isnan(quant) else quant)

        all_data.append(quantiles_results)
        start_date += timedelta(days=14)

    # Round away float noise so that e.g. 0.07 yields "_7.0" rather than
    # "_7.000000000000001"; 0.1/0.25/0.5/0.75/0.9 keep their old suffixes.
    suffix = "_" + str(round(quantile * 100, 10))
    return pd.DataFrame(all_data, columns=[col + suffix for col in columns])

In [None]:
def count_mean(file_names, columns, start_date, end_date):
    """Compute the mean of every feature for each 14-day step in a date range.

    For each date ``d`` in ``start_date, start_date + 14 days, ...`` that is
    strictly before ``end_date``, the file ``<directory>/feature_<d>.csv`` is
    read and the mean of the given column is taken.

    Args:
        file_names: Sequence of ``(directory, column_name)`` pairs, one per
            feature.
        columns: Output column base names, in the same order as
            ``file_names``.
        start_date: First date to process (inclusive).
        end_date: Date at which processing stops (exclusive).

    Returns:
        DataFrame with one row per processed date and one ``<column>_mean``
        column per feature. A NaN mean is stored as ``0.0``.
    """
    all_data = []
    while start_date < end_date:
        # Several features live in the same directory; read each CSV only
        # once per date instead of once per feature.
        frames = {}
        mean_results = []
        for file, feature_name in file_names:
            if file not in frames:
                frames[file] = pd.read_csv(
                    file + "/feature_" + str(start_date) + ".csv"
                )
            mean = frames[file][feature_name].mean()
            mean_results.append(0.0 if math.isnan(mean) else mean)

        all_data.append(mean_results)
        start_date += timedelta(days=14)

    return pd.DataFrame(all_data, columns=[col + "_mean" for col in columns])

In [None]:
def plot_correlation_matrix(file_names, columns, quantile, start_date, end_date):
    """Build the per-date quantile table and show its correlation matrix.

    Delegates the aggregation to ``count_quantile``, draws the feature
    correlation matrix with ``plt.matshow`` and returns the aggregated
    DataFrame (not the correlation matrix).
    """
    quantile_frame = count_quantile(
        file_names, columns, quantile, start_date, end_date
    )
    print("Counting correlations...")
    correlations = quantile_frame.corr()
    plt.matshow(correlations)
    plt.show()
    return quantile_frame

In [None]:
def plot_mean_correlation_matrix(file_names, columns, start_date, end_date):
    """Build the per-date mean table and show its correlation matrix.

    Delegates the aggregation to ``count_mean``, draws the feature
    correlation matrix with ``plt.matshow`` and returns the aggregated
    DataFrame (not the correlation matrix).
    """
    mean_frame = count_mean(file_names, columns, start_date, end_date)
    print("Counting correlations...")
    correlations = mean_frame.corr()
    plt.matshow(correlations)
    plt.show()
    return mean_frame

In [None]:
# Single source of truth for the feature tables below: each entry is
# (directory base name, CSV columns read from that directory), in output order.
_feature_groups = [
    ("PostActivity", ["post_activity"]),
    ("NumberOfUsersPosts", ["number_of_posts"]),
    ("NumberOfWrittenCommentsToOtherUsersPosts", [
        "number_of_written_comments_to_other_users_posts",
    ]),
    ("FrequencyOfUserPosts", [
        "mean_post_frequency",
        "std_post_frequency",
        "median_post_frequency",
        "q3_post_frequency",
    ]),
    ("NumberOfReceivedResponsesToUsersPosts", [
        "number_of_received_responses_to_users_posts_avg",
        "number_of_received_responses_to_users_posts_std",
        "number_of_received_responses_to_users_posts_q3",
        "number_of_received_responses_to_users_posts_max",
    ]),
    ("NumberOfCommentsWrittenByUserUnderHisOwnPosts", [
        "number_of_comments_written_by_user_under_his_own_posts_avg",
        "number_of_comments_written_by_user_under_his_own_posts_q3",
    ]),
    ("NumberOfReceivedResponsesUnderUsersComments", [
        "number_of_received_responses_under_users_comments_avg",
        "number_of_received_responses_under_users_comments_median",
        "number_of_received_responses_under_users_comments_q3",
        "number_of_received_responses_under_users_comments_max",
    ]),
    ("FrequencyOfUserComments", [
        "mean_comments_frequency",
        "median_comments_frequency",
        "q3_comments_frequency",
    ]),
    ("NumberOfWordsInUsersComments", [
        "number_of_words_in_users_comments_avg",
        "number_of_words_in_users_comments_median",
        "number_of_words_in_users_comments_q3",
    ]),
    ("NumberOfWordsInUsersPosts", [
        "number_of_words_in_users_posts_avg",
        "number_of_words_in_users_posts_median",
        "number_of_words_in_users_posts_q3",
    ]),
    ("NumberOfWordsInResponsesOfUsersPosts", [
        "number_of_words_in_responses_of_users_posts_avg",
        "number_of_words_in_responses_of_users_posts_std",
        # The doubled "n" is kept exactly as in the original tables.
        "nnumber_of_words_in_responses_of_users_posts_median",
        "number_of_words_in_responses_of_users_posts_q3",
    ]),
    ("NumberOfWordsInOwnResponsesOfUsersPosts", [
        "number_of_words_in_own_responses_of_users_posts_avg",
        "number_of_words_in_own_responses_of_users_posts_q3",
    ]),
    ("SentimentOfUsersPosts", [
        "posts_sentiment_avg",
        "posts_sentiment_median",
        "posts_sentiment_min",
    ]),
    ("SentimentOfUsersComments", [
        "comments_sentiment_avg",
        "comments_sentiment_median",
        "comments_sentiment_q3",
    ]),
    ("SentimentOfUsersResponsesToUserPosts", [
        "received_comments_sentiment_stddev",
        "received_comments_sentiment_q3",
    ]),
]

# Output column base names: all 40 features, in table order.
columns = [feature for _, group in _feature_groups for feature in group]

# (directory, column) pairs for the "WithoutZeros" variants of the data.
files_without_zeros = [
    (directory + "WithoutZeros", feature)
    for directory, group in _feature_groups
    for feature in group
]

# (directory, column) pairs for the plain variants. NOTE: there is no
# "PostActivity" entry here, so this list has 39 items while `columns` has 40.
files = [
    (directory, feature)
    for directory, group in _feature_groups[1:]
    for feature in group
]

In [None]:
# 0.90-quantile table per date; the call also draws its correlation matrix.
result_with_quant_90 = plot_correlation_matrix(
    file_names=files_without_zeros,
    columns=columns,
    quantile=0.90,
    start_date=start_date_comments,
    end_date=end_date_comments,
)

In [None]:
%%capture capt

# Quantile tables (one row per 14-day step) for the remaining quantiles.
result_with_quant_25 = plot_correlation_matrix(files_without_zeros, columns, 0.25, start_date_comments, end_date_comments)
result_with_quant_10 = plot_correlation_matrix(files_without_zeros, columns, 0.10, start_date_comments, end_date_comments)
result_with_quant_50 = plot_correlation_matrix(files_without_zeros, columns, 0.50, start_date_comments, end_date_comments)
# Previously bound to the misspelled name `result_with_quant_750`, while the
# cells further down read `result_with_quant_75` and failed with NameError.
result_with_quant_75 = plot_correlation_matrix(files_without_zeros, columns, 0.75, start_date_comments, end_date_comments)

In [None]:
# Mean table per date; the call also draws its correlation matrix.
result_with_mean = plot_mean_correlation_matrix(
    file_names=files_without_zeros,
    columns=columns,
    start_date=start_date_comments,
    end_date=end_date_comments,
)

In [None]:
from collections import Counter

In [None]:
def get_high_corr_features(features, threshold=0.9):
    """Find pairs of columns whose correlation exceeds ``threshold``.

    Only correlations strictly greater than ``threshold`` qualify, so strongly
    negative correlations are not reported. Each unordered pair appears once,
    keyed in the order the columns occur in ``features.corr()``.

    Args:
        features: DataFrame whose columns are the features to compare.
        threshold: Lower bound (exclusive) on the correlation coefficient.

    Returns:
        A dict items view of ``((column_a, column_b), correlation)`` entries.
    """
    corr = features.corr()
    high_corr = {}
    for index, row in corr.iterrows():
        # `Series.items()` replaces `iteritems()`, which pandas 2.0 removed.
        for col_name, val in row.items():
            # `not val > threshold` also skips NaN correlations.
            if col_name == index or not val > threshold:
                continue
            # The matrix is symmetric; keep only the first orientation seen.
            if (col_name, index) not in high_corr:
                high_corr[index, col_name] = val
    return high_corr.items()

In [None]:
def corr_features_selection(features, threshold=0.9) -> set:
    """Greedily pick which of the highly correlated features to drop.

    Works on the ``((feature_a, feature_b), correlation)`` pairs produced by
    ``get_high_corr_features``. The most correlated remaining pair is handled
    first: one feature of the pair is kept, and every feature still paired
    with the kept one is marked for removal.

    Args:
        features: DataFrame whose columns are the candidate features.
        threshold: Correlation threshold forwarded to
            ``get_high_corr_features``.

    Returns:
        The set of column names marked for removal.
    """
    features_corr = get_high_corr_features(features, threshold=threshold)
    # Materialise the pairs as a list ordered by correlation, highest first.
    features_corr = sorted(features_corr, key=lambda x: x[1], reverse=True)
    removed = set()
    # NOTE: `chosen` is filled below but never read or returned.
    chosen = set()
    while len(features_corr) > 0:
        # Re-sorted on every pass; the head is the most correlated pair left.
        features_corr = sorted(features_corr, key=lambda x: x[1], reverse=True)
        a, b = features_corr[0][0]
        if a in removed and b in removed:
            # Both ends are already dropped, so this pair needs no decision.
            features_corr.pop(0)
        else:
            if a in removed:
                chosen_feature = b
            elif b in removed:
                chosen_feature = a
            else:
                # Count in how many remaining pairs each feature takes part.
                x = [elem[0][0] for elem in features_corr]
                x += [elem[0][1] for elem in features_corr]
                counter = Counter(x)
                if counter[a] == counter[b]:
                    # Tie: keep the feature with the larger standard deviation
                    # (despite the `_var` names these hold np.std values);
                    # `b` wins when the two are equal.
                    a_var = np.std(features[a])
                    b_var = np.std(features[b])
                    chosen_feature = a if a_var > b_var else b
                else:
                    # Keep the feature that appears in more remaining pairs.
                    chosen_feature = a if counter[a] > counter[b] else b
            chosen.add(chosen_feature)
            # Everything still paired with the kept feature gets dropped.
            for i,f in enumerate(features_corr):
                x = f[0][0]
                y = f[0][1]
                if x == chosen_feature:
                    removed.add(y)
                elif y == chosen_feature:
                    removed.add(x)
            # Pairs involving the kept feature are resolved; discard them.
            features_corr = list(filter(lambda x: x[0][0] != chosen_feature and x[0][1] != chosen_feature, features_corr))
    return removed

In [None]:
def get_chosen(features, threshold=0.9):
    """Return ``features`` without the columns rejected by the correlation filter.

    The columns to drop come from ``corr_features_selection``; the remaining
    columns keep their original order.
    """
    dropped = corr_features_selection(features, threshold=threshold)
    kept = [name for name in features.columns if name not in dropped]
    return features[kept]

In [None]:
%%capture capt

# Apply the correlation filter (default threshold) to every quantile table.
chosen_10 = get_chosen(features=result_with_quant_10)
chosen_25 = get_chosen(features=result_with_quant_25)
chosen_50 = get_chosen(features=result_with_quant_50)
chosen_75 = get_chosen(features=result_with_quant_75)
chosen_90 = get_chosen(features=result_with_quant_90)

In [None]:
# Apply the correlation filter (default threshold) to the mean table.
chosen_mean = get_chosen(features=result_with_mean)

In [None]:
# Names of the columns kept for the mean table.
chosen_mean.columns.tolist()

In [None]:
# Names of the columns kept for the 0.10 quantile table.
chosen_10.columns.tolist()

In [None]:
# Names of the columns kept for the 0.25 quantile table.
chosen_25.columns.tolist()

In [None]:
# Names of the columns kept for the 0.50 quantile table.
chosen_50.columns.tolist()

In [None]:
# Names of the columns kept for the 0.75 quantile table.
chosen_75.columns.tolist()

In [None]:
# Names of the columns kept for the 0.90 quantile table.
chosen_90.columns.tolist()

### Porównanie kwantyli

In [None]:
# %%capture capt
# result_with_quant_10 = count_quantile(files_without_zeros, columns, 0.10, start_date_comments, end_date_comments)
# result_with_quant_25 = count_quantile(files_without_zeros, columns, 0.25, start_date_comments, end_date_comments)
# result_with_quant_50 = count_quantile(files_without_zeros, columns, 0.50, start_date_comments, end_date_comments)
# result_with_quant_75 = count_quantile(files_without_zeros, columns, 0.75, start_date_comments, end_date_comments)
# result_with_quant_90 = count_quantile(files_without_zeros, columns, 0.90, start_date_comments, end_date_comments)

In [None]:
def merge_all_dfs(dfs):
    """Join a sequence of DataFrames side by side on their index.

    The first frame is the starting point and each following frame is merged
    onto the running result with ``pd.merge`` using both indexes as keys.
    A one-element sequence returns that element unchanged.
    """
    combined = dfs[0]
    for frame in dfs[1:]:
        combined = pd.merge(combined, frame, left_index=True, right_index=True)
    return combined

In [None]:
# One wide table holding every feature at every quantile, aligned by row index.
dfs = [
    result_with_quant_10,
    result_with_quant_25,
    result_with_quant_50,
    result_with_quant_75,
    result_with_quant_90,
]
merged = merge_all_dfs(dfs)

In [None]:
def plot_quantiles(df, column_name):
    """Plot the five quantile series of one feature on a shared axis.

    Expects ``df`` to contain the columns ``<column_name>_10.0``, ``_25.0``,
    ``_50.0``, ``_75.0`` and ``_90.0``; each is drawn as a line in its own
    colour on the current matplotlib axes, then the figure is shown.
    """
    ax = plt.gca()
    suffix_colors = (
        ("_10.0", 'blue'),
        ("_25.0", 'red'),
        ("_50.0", 'green'),
        ("_75.0", 'c'),
        ("_90.0", 'y'),
    )
    for suffix, line_color in suffix_colors:
        df.plot(figsize=(34,17), kind='line', y=column_name + suffix,
                color=line_color, ax=ax)

    plt.xticks(fontsize=18)
    plt.yticks(fontsize=18)
    plt.legend(loc=0, fontsize=25)

    plt.show()

In [None]:
# One figure per feature, comparing its five quantile series.
for column_name in columns:
    plot_quantiles(df=merged, column_name=column_name)

In [None]:
# Summary statistics of every merged column, with columns ordered by name.
stats = merged.describe()
stats = stats.reindex(columns=sorted(stats.columns))

In [None]:
# Display the summary statistics table (columns sorted by name).
stats

In [None]:
# For every feature, find the quantile whose series has the largest standard
# deviation and tally how often each quantile wins.
quant = ["_10.0", "_25.0", "_50.0", "_75.0", "_90.0"]
result = dict.fromkeys(quant, 0)
for col in columns:
    # A feature with no std above 0.0 (all zero or NaN) is credited to the
    # first quantile, because `key` starts at quant[0].
    maxi = 0.0
    key = quant[0]
    for q in quant:
        current_std = stats[col + q]["std"]
        if current_std > maxi:
            maxi = current_std
            key = q
    result[key] += 1

result

 
        

In [None]:
# Standard deviation of every feature's series, one list per quantile,
# in the order of `columns`.
q10 = [stats[col + "_10.0"]["std"] for col in columns]
q25 = [stats[col + "_25.0"]["std"] for col in columns]
q50 = [stats[col + "_50.0"]["std"] for col in columns]
q75 = [stats[col + "_75.0"]["std"] for col in columns]
q90 = [stats[col + "_90.0"]["std"] for col in columns]

ax = plt.gca()

# One line per quantile; the x axis is the feature's position in `columns`.
plt.plot(q10, "blue", label="q10")
plt.plot(q25, "red", label="q25")
plt.plot(q50, "green", label="q50")
plt.plot(q75, "c", label="q75")
plt.plot(q90, "y", label="q90")
plt.title("Porównanie stddev cech dla poszczególnych kwantyli")
plt.legend()
plt.show()

