In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import datetime
from datetime import timedelta 

#### Generate features with only active users

In [None]:
# Date range walked by the slot loops in this notebook: slots begin at
# start_date_comments and are produced while the slot start is before end_date_comments.
start_date_comments = datetime.date(year=2009, month=12, day=22)
end_date_comments = datetime.date(year=2013, month=10, day=22)

In [None]:
# Semicolon-separated feature catalogue; later cells read its "NazwaFolderu" and
# "NazwaCechy" columns to locate each feature's per-slot CSV files.
feature_table = pd.read_csv("FeatureTableCSV.csv", sep=";")

In [None]:
# Display the loaded feature table for a quick visual check.
feature_table

In [None]:
def save_data_to_file(folder_name, file_name, data):
    """Write ``data`` as a CSV file to ``folder_name/file_name``.

    The target folder is created when it does not exist yet, including any
    missing parent directories.

    Parameters
    ----------
    folder_name : str
        Directory the file is written to.
    file_name : str
        Name of the CSV file inside ``folder_name``.
    data : pandas.DataFrame
        Table to store; its index is not written.

    Raises
    ------
    FileExistsError
        If ``folder_name`` exists but is not a directory.
    """
    if not os.path.isdir(folder_name):
        # makedirs (unlike mkdir) copes with nested paths; exist_ok keeps the call
        # safe if the directory appears between the check and the creation.
        os.makedirs(folder_name, exist_ok=True)
        print("Directory " , folder_name ,  " Created ")
    data.to_csv(os.path.join(folder_name, file_name), index=False)

In [None]:
def merge_stats(start_date, table=None):
    """Join every feature's per-slot CSV into one table keyed by ``user_id``.

    For each row of the feature table the file
    ``<NazwaFolderu>/feature_<start_date>.csv`` is read and its ``user_id``
    column plus the column named by ``NazwaCechy`` are kept. The per-feature
    tables are outer-merged on ``user_id`` and missing values are filled with 0.

    Parameters
    ----------
    start_date : datetime.date
        Slot start date; ``str(start_date)`` is used in the file name.
    table : pandas.DataFrame, optional
        Feature table with ``NazwaFolderu`` and ``NazwaCechy`` columns.
        Defaults to the notebook-level ``feature_table``.

    Returns
    -------
    pandas.DataFrame
        One row per user, one column per feature, NaN replaced by 0.

    Raises
    ------
    IndexError
        If the feature table has no rows.
    """
    if table is None:
        table = feature_table

    statistics_to_join = []
    # Access the columns by label: positional ``series[0]`` on a label-indexed
    # Series is deprecated in recent pandas.
    for folder, feature_name in zip(table["NazwaFolderu"], table["NazwaCechy"]):
        slot_path = os.path.join(folder, "feature_" + str(start_date) + ".csv")
        slot_stats = pd.read_csv(slot_path)
        statistics_to_join.append(slot_stats[['user_id', feature_name]])

    merged_df = statistics_to_join[0]
    for stats in statistics_to_join[1:]:
        merged_df = pd.merge(merged_df, stats, how='outer', on=['user_id'])
    return merged_df.fillna(0)

In [None]:
def create_active_users_data(start_date, end_date, slot_days=14,
                             output_folder="All_Data_In_Slots_Joined"):
    """Build and store the joined feature table for every time slot.

    Starting at ``start_date`` and stepping ``slot_days`` days at a time while
    the slot start is before ``end_date``, the features of the slot are merged
    with ``merge_stats`` and written to
    ``<output_folder>/<slot start>_joined.csv``. The slot start is printed as
    a progress indicator.

    Parameters
    ----------
    start_date : datetime.date
        Start of the first slot.
    end_date : datetime.date
        Exclusive upper bound for slot start dates.
    slot_days : int, optional
        Length of a slot in days (default 14).
    output_folder : str, optional
        Directory the joined CSV files are written to.

    Raises
    ------
    ValueError
        If ``slot_days`` is not positive (the loop would never terminate).
    """
    if slot_days <= 0:
        raise ValueError("slot_days must be positive, got " + str(slot_days))

    step = timedelta(days=slot_days)
    slot_start = start_date
    while slot_start < end_date:
        data = merge_stats(slot_start)
        print(str(slot_start))
        save_data_to_file(output_folder, str(slot_start) + "_joined.csv", data)
        slot_start += step

In [None]:
# Write one joined feature CSV per slot for the configured date range.
create_active_users_data(start_date_comments, end_date_comments)

#### Analysis of features

In [None]:
def non_zero_standard_deviation(features, feature_name):
    """Return the standard deviation of the non-zero values of one feature.

    Parameters
    ----------
    features : pandas.DataFrame
        Table containing the column ``feature_name``.
    feature_name : str
        Column to analyse.

    Returns
    -------
    float
        Standard deviation of the values different from 0, or ``0.0`` when it
        is undefined (fewer than two non-zero values).
    """
    data = features[feature_name]
    non_zero_data = data.loc[data != 0.0]
    std = non_zero_data.std()
    # Series.std() reports an undefined result as NaN, never as None, so the
    # fallback has to test for NaN.
    if pd.isna(std):
        return 0.0
    return std

In [None]:
def zero_standard_deviation(features, feature_name):
    """Return the standard deviation of a feature column, zeros included."""
    return features[feature_name].std()

In [None]:
def precentage_of_non_zero_values(features, feature_name):
    """Return the share of entries in ``feature_name`` that differ from 0, in percent."""
    column = features[feature_name]
    is_non_zero = column != 0.0
    non_zero_count = len(column.loc[is_non_zero])
    total_count = len(column)
    return non_zero_count / total_count * 100

In [None]:
def plot_precentage_of_non_zero_values(result, feature_name):
    """Plot the percentage of non-zero values per slot and save the figure.

    The chart is written to ``FeatureAnalysisFigs/PROCENTAGE_<feature_name>``
    (the directory is created when missing), shown, and then closed.

    Parameters
    ----------
    result : sequence of float
        Values to plot, drawn against their position in the sequence.
    feature_name : str
        Feature name, used as the title and in the output file name.
    """
    output_dir = "FeatureAnalysisFigs"
    # savefig does not create missing directories.
    os.makedirs(output_dir, exist_ok=True)

    fig = plt.figure(figsize=(15,10))
    plt.rc('xtick',labelsize=16)
    plt.rc('ytick',labelsize=16)
    plt.plot(result, color='g', linewidth=3)
    plt.title(f"{feature_name}", fontsize=20)
    plt.ylabel('Percentage of non zero values', fontsize=20)
    plt.xlabel('Slot number', fontsize=20)
    plt.savefig(os.path.join(output_dir, "PROCENTAGE_" + feature_name), dpi=300)
    plt.show()
    # Release the figure so repeated calls in a loop do not accumulate open figures.
    plt.close(fig)

In [None]:
def plot_zero_standard_deviation(result, feature_name):
    """Plot the per-slot standard deviation (zeros included) and save the figure.

    The chart is written to ``FeatureAnalysisFigs/STD_<feature_name>`` (the
    directory is created when missing), shown, and then closed.

    Parameters
    ----------
    result : sequence of float
        Values to plot, drawn against their position in the sequence.
    feature_name : str
        Feature name, used as the title and in the output file name.
    """
    output_dir = "FeatureAnalysisFigs"
    # savefig does not create missing directories.
    os.makedirs(output_dir, exist_ok=True)

    fig = plt.figure(figsize=(15,10))
    plt.rc('xtick',labelsize=16)
    plt.rc('ytick',labelsize=16)
    plt.plot(result, color='b', linewidth=3)
    plt.title(f"{feature_name}", fontsize=20)
    plt.ylabel('Standard deviation', fontsize=20)
    plt.xlabel('Slot number', fontsize=20)
    # dpi=300 matches the resolution used by the other plot helpers in this notebook.
    plt.savefig(os.path.join(output_dir, "STD_" + feature_name), dpi=300)
    plt.show()
    # Release the figure so repeated calls in a loop do not accumulate open figures.
    plt.close(fig)

In [None]:
def plot_non_zero_standard_deviation(result, feature_name):
    """Plot the per-slot standard deviation of non-zero values and save the figure.

    The chart is written to ``FeatureAnalysisFigs/NONZERO_STD_<feature_name>``
    (the directory is created when missing), shown, and then closed.

    Parameters
    ----------
    result : sequence of float
        Values to plot, drawn against their position in the sequence.
    feature_name : str
        Feature name, used in the title and in the output file name.
    """
    output_dir = "FeatureAnalysisFigs"
    # savefig does not create missing directories.
    os.makedirs(output_dir, exist_ok=True)

    fig = plt.figure(figsize=(15,10))
    plt.rc('xtick',labelsize=20)
    plt.rc('ytick',labelsize=20)
    plt.plot(result, color='r', linewidth=3)
    plt.title(f"None zero standard deviation of {feature_name}", fontsize=20, fontweight='bold')
    plt.ylabel('Standard deviation of none zero values', fontsize=20)
    plt.xlabel('Slot number', fontsize=20)
    plt.savefig(os.path.join(output_dir, "NONZERO_STD_" + feature_name), dpi=300)
    plt.show()
    # Release the figure so repeated calls in a loop do not accumulate open figures.
    plt.close(fig)

In [None]:
def plot_histogram(res, feature_name):
    """Plot a 100-bin histogram of a feature on a log-scaled y axis and save it.

    The chart is written to ``FeatureAnalysisFigs/HIST_<feature_name>`` (the
    directory is created when missing), shown, and then closed.

    Parameters
    ----------
    res : pandas.Series, pandas.DataFrame or array-like
        Values of the feature; flattened to one dimension before plotting.
    feature_name : str
        Feature name, used for the legend, the title and the output file name.
    """
    output_dir = "FeatureAnalysisFigs"
    # savefig does not create missing directories.
    os.makedirs(output_dir, exist_ok=True)

    # Flatten so hist always receives a single 1-D dataset, whether the caller
    # passes a Series or a single-column DataFrame.
    values = np.asarray(res).ravel()

    fig = plt.figure(figsize=(15,10))
    plt.rc('xtick',labelsize=20)
    plt.rc('ytick',labelsize=20)
    # 'nonpositive' is the current spelling of the former 'nonposy' keyword.
    plt.yscale('log', nonpositive='clip')
    plt.hist(values, label=feature_name, histtype='bar', bins=100, alpha=0.5, edgecolor='black', facecolor='blue')
    plt.legend(prop={'size': 16})
    plt.title(f"Histogram of {feature_name}", fontsize=20, fontweight='bold')
    plt.savefig(os.path.join(output_dir, "HIST_" + feature_name), dpi=300)
    plt.show()
    # Release the figure so repeated calls in a loop do not accumulate open figures.
    plt.close(fig)

In [None]:
def generate_feature_charts_and_statistics(feature_table, start_date, end_date,
                                           feature_names=None,
                                           include_zero_std=False,
                                           include_percentage=False,
                                           slot_days=14):
    """Plot per-slot statistics and an overall histogram for each feature.

    For every feature the joined slot files
    ``All_Data_In_Slots_Joined/<slot start>_joined.csv`` are read for all slot
    starts in ``[start_date, end_date)``. The non-zero standard deviation per
    slot is always plotted, followed by a histogram of the feature's values
    across all slots. The feature name is printed before its charts.

    Parameters
    ----------
    feature_table : pandas.DataFrame
        Feature catalogue; its ``NazwaCechy`` column supplies the feature names
        when ``feature_names`` is not given.
    start_date : datetime.date
        Start of the first slot.
    end_date : datetime.date
        Exclusive upper bound for slot start dates.
    feature_names : list of str, optional
        Restrict the analysis to these features instead of the whole catalogue.
    include_zero_std : bool, optional
        Also plot the per-slot standard deviation with zeros included.
    include_percentage : bool, optional
        Also plot the per-slot percentage of non-zero values.
    slot_days : int, optional
        Length of a slot in days (default 14).

    Raises
    ------
    ValueError
        If ``slot_days`` is not positive (the slot loop would never terminate).
    """
    if slot_days <= 0:
        raise ValueError("slot_days must be positive, got " + str(slot_days))
    if feature_names is None:
        feature_names = list(feature_table['NazwaCechy'])
    step = timedelta(days=slot_days)

    for f_name in feature_names:
        print(f_name)

        # Only the analysed column is parsed from each slot file.
        slot_values = []
        start = start_date
        while start < end_date:
            slot_path = "All_Data_In_Slots_Joined/" + str(start) + "_joined.csv"
            slot_values.append(pd.read_csv(slot_path, usecols=[f_name]))
            start += step

        non_zero_stddev_res = [non_zero_standard_deviation(features, f_name)
                               for features in slot_values]
        plot_non_zero_standard_deviation(non_zero_stddev_res, f_name)

        if include_zero_std:
            zero_stddev_res = [zero_standard_deviation(features, f_name)
                               for features in slot_values]
            plot_zero_standard_deviation(zero_stddev_res, f_name)

        if include_percentage:
            precentage_res = [precentage_of_non_zero_values(features, f_name)
                              for features in slot_values]
            plot_precentage_of_non_zero_values(precentage_res, f_name)

        # Concatenate once instead of growing a frame inside the slot loop.
        if slot_values:
            histogram_res = pd.concat([features[f_name] for features in slot_values],
                                      ignore_index=True)
        else:
            histogram_res = pd.Series(dtype=float)
        plot_histogram(histogram_res, f_name)

In [None]:
# Produce the per-slot non-zero standard deviation chart and the histogram for every feature.
generate_feature_charts_and_statistics(feature_table, start_date_comments, end_date_comments)

In [None]:
def generate_stats_for_entire_data(feature_table, start_date, end_date, slot_days=14):
    """Print the standard deviation of every feature over all slots combined.

    All joined slot files ``All_Data_In_Slots_Joined/<slot start>_joined.csv``
    with a slot start in ``[start_date, end_date)`` are stacked into one table.
    For each feature named in ``feature_table['NazwaCechy']`` the standard
    deviation with zeros and the standard deviation of the non-zero values
    are printed.

    Parameters
    ----------
    feature_table : pandas.DataFrame
        Feature catalogue with a ``NazwaCechy`` column.
    start_date : datetime.date
        Start of the first slot.
    end_date : datetime.date
        Exclusive upper bound for slot start dates.
    slot_days : int, optional
        Length of a slot in days (default 14).

    Raises
    ------
    ValueError
        If ``slot_days`` is not positive (the slot loop would never terminate).
    """
    if slot_days <= 0:
        raise ValueError("slot_days must be positive, got " + str(slot_days))
    step = timedelta(days=slot_days)

    # Collect the slot tables first and concatenate once; concatenating inside
    # the loop re-copies the accumulated data on every iteration.
    slot_frames = []
    slot_start = start_date
    while slot_start < end_date:
        slot_frames.append(pd.read_csv("All_Data_In_Slots_Joined/" + str(slot_start) + "_joined.csv"))
        slot_start += step
    results = pd.concat(slot_frames, ignore_index=True) if slot_frames else pd.DataFrame()

    for f_name in list(feature_table['NazwaCechy']):
        print("TYPE       | " + f_name )
        print("STD ZERO   | " + str(zero_standard_deviation(results, f_name)))
        print("STD NONZERO| " + str(non_zero_standard_deviation(results, f_name)))


In [None]:
# Print, for every feature, the standard deviation with and without zeros over all slots combined.
generate_stats_for_entire_data(feature_table, start_date_comments, end_date_comments)