In [55]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math

# Read the two raw answer tables. The relative paths are resolved against the
# current working directory before being handed to pandas.
dp_csv_path = os.path.realpath('data/dp_raw_old.csv')
se_csv_path = os.path.realpath('data/se_raw_old.csv')
df_dp = pd.read_csv(dp_csv_path)
df_se = pd.read_csv(se_csv_path)

# Print both schemas so the column-name differences between the files are visible.
print(df_dp.columns, "\n\n")
print(df_se.columns)

Index(['user_id', 'question_id', 'question_title', 'question_text',
       'answer_id', 'answer_text', 'answer_score', 'user_tag_count',
       'user_tag_score', 'answer_score_mean', 'sum_class',
       'answer_score_median', 'mean_class', 'median_class'],
      dtype='object') 


Index(['user_id', 'question_id', 'question_title', 'question_text',
       'answer_id', 'answer_text', 'answer_score', 'user_answer_count',
       'user_answer_score', 'answer_score_mean', 'sum_class',
       'answer_score_median', 'median_class', 'mean_class'],
      dtype='object')


## RENAME AND REORDER COLUMNS

In [56]:
# Common schema: after renaming, both frames expose exactly these columns,
# in exactly this order.
ordered_columns = ['user_id', 'question_id', 'question_title', 'question_text',
                   'answer_id', 'answer_text', 'answer_score', 'user_answer_count',
                   'user_sum_score', 'user_mean_score', 'user_median_score',
                   'sum_class', 'mean_class', 'median_class']

# df_dp uses "tag"-based names for the per-user count and summed score.
dp_renames = {'user_tag_score': 'user_sum_score',
              'user_tag_count': 'user_answer_count',
              'answer_score_mean': 'user_mean_score',
              'answer_score_median': 'user_median_score'}
df_dp = df_dp.rename(columns=dp_renames)[ordered_columns]

print(df_dp.columns, "\n\n")


# df_se already has 'user_answer_count'; only the score columns are renamed.
se_renames = {'user_answer_score': 'user_sum_score',
              'answer_score_mean': 'user_mean_score',
              'answer_score_median': 'user_median_score'}
df_se = df_se.rename(columns=se_renames)[ordered_columns]

print(df_se.columns)

Index(['user_id', 'question_id', 'question_title', 'question_text',
       'answer_id', 'answer_text', 'answer_score', 'user_answer_count',
       'user_sum_score', 'user_mean_score', 'user_median_score', 'sum_class',
       'mean_class', 'median_class'],
      dtype='object') 


Index(['user_id', 'question_id', 'question_title', 'question_text',
       'answer_id', 'answer_text', 'answer_score', 'user_answer_count',
       'user_sum_score', 'user_mean_score', 'user_median_score', 'sum_class',
       'mean_class', 'median_class'],
      dtype='object')


In [70]:
use_dp = True

# Choose which of the two frames the rest of the notebook works on.
df = df_dp if use_dp else df_se

# Rows whose user has at least 5 answers are kept; rows below that threshold
# are set aside in a separate frame.
answer_counts = df['user_answer_count']
df_f = df[answer_counts >= 5].copy()
df_e = df[answer_counts < 5].copy()

# Same objects under more descriptive names.
df_filtered = df_f
df_eliminated = df_e

# One entry per distinct user_id among the kept rows.
df_users = df_filtered.groupby('user_id')['user_id'].max()
print("num of answers after filtering: {}".format(len(df_filtered)))
print("num of users after filtering: {}\n".format(len(df_users)))

# Per-user score columns and the class-label column that belongs to each.
scorings = ['user_median_score', 'user_mean_score', 'user_sum_score']
labelings = ['median_class', 'mean_class', 'sum_class']

user_scores_dict = {}

# Sign breakdown and range of every scoring column, counted per kept row.
for scoring in scorings:
    score_column = df_filtered[scoring]

    print('********** {} *********'.format(scoring))

    print("num of positives: {}".format(int((score_column > 0).sum())))
    print("num of negatives: {}".format(int((score_column < 0).sum())))
    print("num of zeros: {}".format(int((score_column == 0).sum())))
    print("min score: {}".format(min(score_column)))
    print("max score: {}\n".format(max(score_column)))

num of answers after filtering: 22894
num of users after filtering: 1942

********** user_median_score *********
num of positives: 20881
num of negatives: 0
num of zeros: 2013
min score: 0.0
max score: 68.0

********** user_mean_score *********
num of positives: 22816
num of negatives: 0
num of zeros: 78
min score: 0.0
max score: 407.6

********** user_sum_score *********
num of positives: 22816
num of negatives: 0
num of zeros: 78
min score: 0
max score: 4391



In [71]:
def generateLogbins(minvalue,maxvalue,factor,uselinear=True):
    '''Generate a list of bin edges for logarithmic binning.

    Parameters:
        minvalue: smallest data value to be binned. Only used when
            uselinear is False, where it sets the first bin edge.
        maxvalue: largest data value to be binned. Edges are appended until
            the last edge is >= maxvalue.
        factor: multiplicative step between consecutive log-bin edges.
        uselinear: if True, start with the unit-width edges -0.5, 0.5, ...,
            10.5 (integers 0..10 each sit at a bin center) and continue
            multiplicatively from 10.5. If False, the first edge is
            2*minvalue/(1+factor), which places minvalue at the arithmetic
            center of the first bin.

    Returns:
        list of bin edges in ascending order.

    Raises:
        ValueError: if edges still have to be added but the sequence cannot
            grow towards maxvalue (factor <= 1, or a starting edge <= 0).
            Previously this case looped forever.
    '''
    if uselinear:
        # for degree distributions, the first integer values are often
        # binned in linear bins. If so, set uselinear=True
        bins=[-0.5,0.5,1.5,2.5,3.5,4.5,5.5,6.5,7.5,8.5,9.5,10.5]
    else:
        # set the first bin lower limit s.t. first data point
        # falls in the bin center
        bins=[minvalue*2.0/(1+factor)]

    # Multiplying a non-positive edge, or multiplying by a factor <= 1, never
    # moves the last edge up to maxvalue: fail fast instead of never returning.
    if bins[-1]<maxvalue and (factor<=1 or bins[-1]<=0):
        raise ValueError(
            "cannot generate log bins up to {}: need factor > 1 and a positive "
            "starting edge (factor={}, starting edge={})".format(
                maxvalue, factor, bins[-1]))

    while bins[-1]<maxvalue:
        # generate the rest of (log) bin limits. The next bin limit
        # is always the previous limit times the factor
        bins.append(bins[-1]*factor)

    return bins

# Log-spaced bin edges for every scoring column: the first bin is centered on
# a score of 1 and each edge is 2.5x the previous one, up to the column's max.
bins_dict = {}

for scoring in scorings:
    bins_dict[scoring] = generateLogbins(minvalue=1, maxvalue=max(df_filtered[scoring]),
                                         factor=2.5, uselinear=False)

# The template has two placeholders, so exactly two arguments are passed
# (a stray third '\n' argument used to be supplied and silently ignored).
for scoring in scorings:
    print("******* {} bins ******\n{}\n".format(scoring, bins_dict[scoring]))

******* user_median_score bins ******
[0.5714285714285714, 1.4285714285714284, 3.571428571428571, 8.928571428571427, 22.32142857142857, 55.80357142857142, 139.50892857142856]

******* user_mean_score bins ******
[0.5714285714285714, 1.4285714285714284, 3.571428571428571, 8.928571428571427, 22.32142857142857, 55.80357142857142, 139.50892857142856, 348.7723214285714, 871.9308035714284]

******* user_sum_score bins ******
[0.5714285714285714, 1.4285714285714284, 3.571428571428571, 8.928571428571427, 22.32142857142857, 55.80357142857142, 139.50892857142856, 348.7723214285714, 871.9308035714284, 2179.827008928571, 5449.5675223214275]



In [72]:
def findbin(bins, value):
    """Return the index of the bin in `bins` that `value` belongs to.

    `bins` is a sequence of bin edges; the bisection below is only meaningful
    if the edges are in ascending order. The result is the largest index i
    with bins[i] <= value, clamped to the range 0 .. len(bins)-2. Values below
    the first edge therefore map to 0 and values at or above the last edge map
    to the last bin. With fewer than three edges the result is always 0.
    """
    low, high = 0, len(bins) - 1

    # Bisect until the two markers are adjacent; `low` is then the bin index.
    while high - low > 1:
        middle = (low + high + 1) // 2  # midpoint, rounded up
        if value >= bins[middle]:
            low = middle
        else:
            high = middle

    return low

In [73]:
# Raw bin index of every kept row, per scoring column. The lists are kept in
# `labels` and also written into the matching class column of df_filtered.
labels = {}

for scoring, labeling in zip(scorings, labelings):
    bin_edges = bins_dict[scoring]
    labels[scoring] = [findbin(bin_edges, score) for score in df_filtered[scoring]]
    df_filtered[labeling] = labels[scoring]

    # Number of rows that landed in each bin.
    class_ids, class_sizes = np.unique(labels[scoring], return_counts=True)
    class_histogram = dict(zip(class_ids, class_sizes))
    print("\n{}: {}".format(scoring, class_histogram))


user_median_score: {0: 12692, 1: 8965, 2: 1159, 3: 73, 5: 5}

user_mean_score: {0: 6255, 1: 9502, 2: 5102, 3: 1638, 4: 292, 5: 94, 6: 6, 7: 5}

user_sum_score: {0: 163, 1: 476, 2: 2257, 3: 5452, 4: 5432, 5: 4137, 6: 2882, 7: 1697, 8: 195, 9: 203}


## MERGE SMALL CLASSES

In [74]:
# Merge sparsely populated classes.
#
# The merged labels are recomputed from the raw bin indices stored in `labels`
# on every run, so this cell is idempotent: re-running it can no longer clamp
# or shift classes that were already merged.

# (lowest kept raw bin, highest kept raw bin) per class column. Raw bins
# outside that range are folded into the nearest kept bin, then the classes
# are renumbered so that the lowest kept bin becomes class 0.
if use_dp:
    merge_ranges = {'median_class': (0, 2),
                    'mean_class': (0, 3),
                    'sum_class': (1, 7)}
else:
    merge_ranges = {'median_class': (0, 3),
                    'mean_class': (0, 4),
                    'sum_class': (2, 10)}

for scoring, labeling in zip(scorings, labelings):
    lowest_bin, highest_bin = merge_ranges[labeling]
    # Align the raw label list with df_filtered's (non-contiguous) index.
    raw_bins = pd.Series(labels[scoring], index=df_filtered.index)
    df_filtered[labeling] = raw_bins.clip(lower=lowest_bin, upper=highest_bin) - lowest_bin

# append eliminated answers with -1 label
for labeling in labelings:
    df_eliminated[labeling] = -1
df_merged = pd.concat([df_filtered, df_eliminated])

# Per class: number of answers and number of distinct users.
for labeling in labelings:
    print("\n********** BY {} **********".format(labeling))
    x = df_merged.groupby(labeling)['answer_id'].count().reset_index(name ='answers')
    y = df_merged.groupby(labeling)['user_id'].nunique().reset_index(name ='users')
    # Both frames come from the same groupby key, so their rows line up.
    x['users'] = y['users']
    print(x.head(15))



********** BY median_class **********
   median_class  answers  users
0            -1    36246  27239
1             0    12692   1093
2             1     8965    726
3             2     1237    123

********** BY mean_class **********
   mean_class  answers  users
0          -1    36246  27239
1           0     6255    612
2           1     9502    804
3           2     5102    359
4           3     2035    167

********** BY sum_class **********
   sum_class  answers  users
0         -1    36246  27239
1          0      639    108
2          1     2257    349
3          2     5452    672
4          3     5432    457
5          4     4137    226
6          5     2882     92
7          6     2095     38


In [75]:
# Write the merged table (kept + eliminated rows) to the output file that
# matches the dataset selected by use_dp.
output_path = './data/dp_raw_new.csv' if use_dp else './data/se_raw_new.csv'
df_merged.to_csv(output_path, sep=',', index=False)