In [10]:
import pandas as pd
import numpy as np
import scipy.stats,pdb,time

In [2]:

def ic(df_factor,df_retpct,days=3,methods ='pearson',return_pvalue=False):
    """Compute a row-by-row information coefficient (IC) of factors vs. returns.

    For every row position ``i`` the factor values of row ``i`` are correlated,
    across columns, with the column-wise NaN-ignoring mean of ``df_retpct``
    rows ``i`` .. ``i + days - 1``. The window starts at row ``i`` itself and
    is truncated at the end of the frame, so the last rows average fewer than
    ``days`` rows.

    Parameters
    ----------
    df_factor : pd.DataFrame
        Factor values; one correlation is produced per row.
    df_retpct : pd.DataFrame
        Return values. Must have the same number of rows, the same index and
        the same number of columns as ``df_factor``. Columns are paired by
        position, not by label.
    days : int, default 3
        Number of ``df_retpct`` rows averaged for each factor row. Expected to
        be a positive integer.
    methods : {'pearson', 'spearman'}, default 'pearson'
        Correlation estimator: ``scipy.stats.pearsonr`` or
        ``scipy.stats.spearmanr``.
    return_pvalue : bool, default False
        When true, also return the p-value reported by the estimator.

    Returns
    -------
    pd.Series or (pd.Series, pd.Series)
        The IC per row, indexed like ``df_factor``; followed by the matching
        p-values when ``return_pvalue`` is true. Rows with fewer than two
        columns where both the factor and the averaged return are non-NaN
        yield NaN for both values.

    Raises
    ------
    TypeError
        If either input is not a ``pd.DataFrame``.
    AssertionError
        If the two frames differ in row count, index or column count.
    NotImplementedError
        If ``methods`` is not one of the supported names.
    """
    if not (isinstance(df_factor,pd.DataFrame) and isinstance(df_retpct,pd.DataFrame)):
        raise TypeError("Input must be pd.DataFrame.")

    n_rows = df_factor.shape[0]
    assert n_rows == df_retpct.shape[0], "row counts differ"
    assert (df_factor.index == df_retpct.index).all(), "indexes differ"
    # Columns are paired positionally, so a width mismatch can never work.
    assert df_factor.shape[1] == df_retpct.shape[1], "column counts differ"
    sr_index = df_factor.index

    # Resolve the estimator once instead of on every loop iteration.
    if methods == 'pearson':
        corr_func = scipy.stats.pearsonr
    elif methods == 'spearman':
        corr_func = scipy.stats.spearmanr
    else:
        raise NotImplementedError("methods must be 'pearson' or 'spearman'")

    arr_factor = df_factor.values
    arr_retpct = df_retpct.values

    vec_ic = []
    vec_pvalue = []
    for ii in range(n_rows):
        # Window of return rows starting at ii, clipped at the last row.
        jj = min(ii + days, n_rows)
        vec1 = arr_factor[ii,:]
        vec2 = np.nanmean(arr_retpct[ii:jj,:],axis=0)

        # Keep only columns where BOTH sides are usable; a column whose whole
        # return window is NaN averages to NaN and must be dropped as well.
        valid = ~(np.isnan(vec1) | np.isnan(vec2))
        if valid.sum() < 2:
            # A correlation is undefined with fewer than two observations.
            vec_ic.append(np.nan)
            vec_pvalue.append(np.nan)
            continue

        corr, pvalue = corr_func(vec1[valid],vec2[valid])
        vec_ic.append(corr)
        vec_pvalue.append(pvalue)

    sr_ic = pd.Series(vec_ic,index=sr_index)
    if return_pvalue:
        return sr_ic,pd.Series(vec_pvalue,index=sr_index)
    return sr_ic

In [13]:
# Two independent 3000 x 3000 frames of uniform random values, used as
# stand-ins for a factor table and a return table in the cells below.
dfa = pd.DataFrame(np.random.random_sample(size=(3000, 3000)))
dfb = pd.DataFrame(np.random.random_sample(size=(3000, 3000)))
# Rank-correlation IC per row, using the default averaging window.
sr = ic(df_factor=dfa, df_retpct=dfb, methods='spearman')

In [51]:
%timeit sr = ic(dfa,dfb,methods = 'spearman')

1 loop, best of 3: 4.52 s per loop


In [4]:
%timeit sr = ic(dfa,dfb,methods = 'pearson')

1 loop, best of 3: 678 ms per loop


In [14]:
sr

0       0.011736
1      -0.000020
2      -0.015653
3      -0.018236
4      -0.004440
5      -0.000689
6       0.005790
7      -0.020361
8       0.024530
9      -0.020141
10     -0.005840
11     -0.033519
12     -0.000273
13     -0.027759
14      0.011309
15      0.039405
16      0.003205
17      0.035117
18     -0.000622
19     -0.005202
20     -0.031152
21      0.000101
22      0.009407
23      0.000981
24     -0.007257
25      0.006235
26      0.037974
27      0.011647
28     -0.002294
29      0.005631
          ...   
2970    0.000611
2971   -0.000381
2972    0.009886
2973    0.017941
2974    0.012698
2975   -0.021355
2976   -0.023461
2977    0.004955
2978    0.006543
2979    0.014411
2980    0.030443
2981    0.000056
2982    0.035319
2983   -0.008035
2984   -0.028305
2985   -0.006103
2986    0.004563
2987    0.016395
2988   -0.007124
2989   -0.027156
2990    0.021683
2991    0.036508
2992    0.020784
2993   -0.008160
2994   -0.012201
2995   -0.002270
2996   -0.016041
2997    0.0139

In [6]:
sr.mean()

-0.00036402674195635187

In [12]:
%tb
import unittest

class TestIC(unittest.TestCase):
    """Timing check for ``ic`` on a 3000 x 3000 random input."""

    def setUp(self):
        """No per-test fixture is needed."""

    def tearDown(self):
        """Nothing to clean up."""

    def test_perfomance(self):
        """Run ``ic`` once with its defaults and require it to take at most 1 second."""
        shape = (3000,3000)
        factors = pd.DataFrame(np.random.random_sample(shape))
        returns = pd.DataFrame(np.random.random_sample(shape))

        t_begin = time.time()
        ic(factors,returns)
        elapsed = time.time() - t_begin

        print('\n perfomance in (3000,3000) use: '+str(elapsed)+' s')
        # Wall-clock threshold: the outcome depends on the machine running it.
        self.assertLessEqual(elapsed,1)

# Run the tests defined in this notebook. A custom argv is passed so that
# unittest does not parse the kernel's own command line ('ignored' fills the
# program-name slot, '-v' asks for verbose output); exit=False stops
# unittest.main from calling sys.exit() when the run finishes.
obj = unittest.main(argv=['ignored', '-v'], exit=False)

No traceback available to show.
test_perfomance (__main__.TestIC) ... 


 perfomance in (3000,3000) use: 0.793999910355 s


ok

----------------------------------------------------------------------
Ran 1 test in 1.092s

OK
