In [10]:
import pandas as pd
from scipy import stats
import numpy as np

In [3]:
# Load the experiment results table. Later cells read its 'dataset', 'method',
# 'timeSlice', 'GT_predict' and 'TRUE' columns.
# The path is relative, matching the "Data Output/..." paths used by the cells
# that write results, so the notebook is not tied to one user's home directory.
# NOTE(review): assumes the kernel's working directory is the project root
# (the folder that contains "Data Output") -- confirm before running.
data = pd.read_csv("Data Output/Word2VecOnlyWithStemming.csv")

In [7]:
# Restrict to a single run: the "Normal" dataset at the word2vec_0.20 setting.
sample = data.loc[
    (data['dataset'] == "Normal")
    & (data['method'] == "word2vec_0.20")
]

In [9]:
# Fit an exponential distribution to the GT predictions of the sampled run,
# holding the location parameter fixed at 0 (floc=0). The bare expression on
# the last line displays the fitted parameter tuple.
e = stats.expon.fit(sample.GT_predict, floc=0)
e

(0, 0.021386718749999131)

For each run, calculate:
1. How well the GT estimate matches the true frequency of new ideas in each time slice (the correlation coefficient between `GT_predict` and `TRUE`)
2. The OLS slope of `GT_predict` regressed on `timeSlice`, as a rough estimate of how quickly the predicted frequency of new ideas declines over time

In [7]:
# Build one summary row per (dataset, method) run:
#   corr_true     -- r-value of regressing TRUE on GT_predict
#   decline_slope -- slope of regressing GT_predict on timeSlice
# Progress is printed with single-argument print(...) calls so the cell runs
# unchanged on both Python 2 and Python 3 and emits the same text as before.
evaluation = []
for dataset, dataset_data in data.groupby("dataset"):
    for method, method_data in dataset_data.groupby("method"):
        print("%s %s" % (dataset, method))

        # linregress returns (slope, intercept, rvalue, pvalue, stderr);
        # index 2 is the correlation coefficient.
        corr_true = stats.linregress(method_data['GT_predict'], method_data['TRUE'])[2]
        # Index 0 is the fitted slope.
        decline_slope = stats.linregress(method_data['timeSlice'], method_data['GT_predict'])[0]

        print("Correlation with \"TRUE\":  %s" % corr_true)
        print("Slope of the decline:  %s" % decline_slope)

        # Run labels look like "<method>_<threshold>", e.g. "word2vec_0.20";
        # split once and reuse both parts. The threshold stays a string.
        method_parts = method.split("_")
        evaluation.append({'dataset': dataset,
                           'method': method_parts[0],
                           'threshold': method_parts[1],
                           'run': method,
                           'corr_true': corr_true,
                           'decline_slope': decline_slope})
evaluation = pd.DataFrame(evaluation)
evaluation

Boring word2vec_0.20
Correlation with "TRUE":  0.549539554188
Slope of the decline:  -0.0011163372859
Boring word2vec_0.30
Correlation with "TRUE":  0.688932742744
Slope of the decline:  -0.003030171278
Boring word2vec_0.40
Correlation with "TRUE":  0.872242256709
Slope of the decline:  -0.00625322793149
Boring word2vec_0.50
Correlation with "TRUE":  0.773201879344
Slope of the decline:  -0.00833860342556
Boring word2vec_0.60
Correlation with "TRUE":  0.809106280985
Slope of the decline:  -0.010942687747
Boring word2vec_0.70
Correlation with "TRUE":  0.85793192614
Slope of the decline:  -0.0141358366271
Boring word2vec_0.80
Correlation with "TRUE":  0.849933796918
Slope of the decline:  -0.0132028985507
Boring word2vec_0.90
Correlation with "TRUE":  0.830987005029
Slope of the decline:  -0.012881027668
Exciting word2vec_0.20
Correlation with "TRUE":  0.309674552916
Slope of the decline:  -0.000101844532279
Exciting word2vec_0.30
Correlation with "TRUE":  0.389884619598
Slope of the dec

Unnamed: 0,corr_true,dataset,decline_slope,method,run,threshold
0,0.54954,Boring,-0.001116,word2vec,word2vec_0.20,0.2
1,0.688933,Boring,-0.00303,word2vec,word2vec_0.30,0.3
2,0.872242,Boring,-0.006253,word2vec,word2vec_0.40,0.4
3,0.773202,Boring,-0.008339,word2vec,word2vec_0.50,0.5
4,0.809106,Boring,-0.010943,word2vec,word2vec_0.60,0.6
5,0.857932,Boring,-0.014136,word2vec,word2vec_0.70,0.7
6,0.849934,Boring,-0.013203,word2vec,word2vec_0.80,0.8
7,0.830987,Boring,-0.012881,word2vec,word2vec_0.90,0.9
8,0.309675,Exciting,-0.000102,word2vec,word2vec_0.20,0.2
9,0.389885,Exciting,-0.000776,word2vec,word2vec_0.30,0.3


In [8]:
# Persist the per-run evaluation table as an Excel workbook.
evaluation.to_excel(
    "Data Output/ParamSearch_Experiment_evaluation_2.xlsx"
)

In [9]:
# Persist the same evaluation table as CSV alongside the Excel copy.
evaluation.to_csv(
    "Data Output/ParamSearch_Experiment_evaluation_2.csv"
)

In [11]:
# For each dataset, show the five runs with the highest decline_slope
# (descending order, so the least negative slopes come first).
for dataset, datasetData in evaluation.groupby("dataset"):
    # Each group is a slice of `evaluation`; sorting it with inplace=True raises
    # SettingWithCopyWarning, so sort into a new frame instead.
    ranked = datasetData.sort_values(by="decline_slope", ascending=False)
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(ranked.head())

   corr_true dataset  decline_slope    method            run threshold
0   0.549540  Boring      -0.001116  word2vec  word2vec_0.20      0.20
1   0.688933  Boring      -0.003030  word2vec  word2vec_0.30      0.30
2   0.872242  Boring      -0.006253  word2vec  word2vec_0.40      0.40
3   0.773202  Boring      -0.008339  word2vec  word2vec_0.50      0.50
4   0.809106  Boring      -0.010943  word2vec  word2vec_0.60      0.60
    corr_true   dataset  decline_slope    method            run threshold
8    0.309675  Exciting      -0.000102  word2vec  word2vec_0.20      0.20
9    0.389885  Exciting      -0.000776  word2vec  word2vec_0.30      0.30
10   0.749868  Exciting      -0.003179  word2vec  word2vec_0.40      0.40
11   0.737692  Exciting      -0.005971  word2vec  word2vec_0.50      0.50
14   0.768849  Exciting      -0.008663  word2vec  word2vec_0.80      0.80
    corr_true   dataset  decline_slope    method            run threshold
16   0.288769  NewAtEnd      -0.000195  word2vec  word2v

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from IPython.kernel.zmq import kernelapp as app


# TODO
1. Compute exponent: transform GT_predict to log scale and then compute slope
2. Also get normal slope
3. Maybe compute percent change

In [11]:
# Add a base-10 log of GT_predict as a new column, for plotting on a
# logarithmic scale and for estimating the exponential slope.
data['GT_predict_log'] = np.log10(data.GT_predict)

In [12]:
# Display the log-transformed column to eyeball the values.
data['GT_predict_log']

0      -1.040959
1      -1.173925
2      -1.040959
3      -1.065502
4      -1.431798
5      -1.455932
6      -1.481486
7      -1.522879
8      -1.552842
9      -1.522879
10     -1.795880
11     -1.769551
12     -1.769551
13     -1.744727
14     -1.657577
15     -1.744727
16     -1.677781
17     -1.721246
18     -1.721246
19     -1.698970
20     -1.721246
21     -1.721246
22     -1.744727
23     -1.769551
24     -1.769551
25     -1.795880
26     -1.795880
27     -1.744727
28     -1.698970
29     -1.721246
          ...   
1770   -0.351640
1771   -0.364516
1772   -0.371611
1773   -0.371611
1774   -0.378824
1775   -0.374688
1776   -0.392545
1777   -0.392545
1778   -0.401209
1779   -0.392545
1780   -0.402305
1781   -0.408935
1782   -0.417937
1783   -0.422508
1784   -0.424812
1785   -0.437707
1786   -0.447332
1787   -0.449772
1788   -0.458421
1789   -0.488117
1790   -0.488117
1791   -0.489455
1792   -0.492144
1793   -0.498941
1794   -0.511449
1795   -0.517126
1796   -0.514279
1797   -0.5157

In [13]:
# Save the table, including the GT_predict_log column, under a new file name.
data.to_csv(
    "Data Output/Word2VecOnlyWithStemming_withlog.csv"
)