In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
#from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn import neighbors
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVR
from sklearn.svm import SVC
import operator
from sklearn.metrics import roc_auc_score
import pandas.io.data
from sklearn.qda import QDA
import datetime

In [153]:
out, nasdaq, djia, frankfurt, london, paris, hkong, nikkei, australia]
    

def count_missing(dataframe):
    """
    Number of missing (NaN) cells in dataframe.

    Obtained as the total number of cells (rows * columns) minus the number
    of non-null cells reported by DataFrame.count().
    """
    n_rows, n_cols = dataframe.shape[0], dataframe.shape[1]
    non_null_cells = dataframe.count().sum()
    return (n_rows * n_cols) - non_null_cells

    
def addFeatures(dataframe, adjclose, returns, n):
    """
    adds two n-day features to dataframe IN PLACE (nothing is returned):
    - NAME + "Time" + n: percentage change of the adjclose column between
      day i and day i-n, i.e. the n-day return
    - NAME + "RolMean" + n: moving average over n days of the returns column

    NAME is the column label without its first 9 characters (adjclose) or
    its first 7 characters (returns); those lengths match the prefixes
    'AdjClose_' and 'Return_'.

    dataframe -- frame holding both columns
    adjclose  -- label of the adjusted close column (AdjClose_*)
    returns   -- label of the daily return column (Return_*)
    n         -- window length in rows, n >= 2
    """

    return_n = adjclose[9:] + "Time" + str(n)
    dataframe[return_n] = dataframe[adjclose].pct_change(n)

    roll_n = returns[7:] + "RolMean" + str(n)
    daily_returns = dataframe[returns]
    if hasattr(daily_returns, 'rolling'):
        # Newer pandas no longer ships the top-level pd.rolling_mean;
        # Series.rolling(n).mean() is the equivalent call there.
        dataframe[roll_n] = daily_returns.rolling(n).mean()
    else:
        # Older pandas without Series.rolling
        dataframe[roll_n] = pd.rolling_mean(daily_returns, n)
    
def mergeDataframes(datasets, index, target='CLASSIFICATION'):
    """
    merges the datasets in the list with an outer join on their index.

    Every frame is first reduced to its columns from position index onwards;
    the first frame is then joined with all the remaining ones.

    datasets -- list of DataFrames; datasets[0] is the left side of the join
    index    -- positional column offset, columns before it are dropped
    target   -- 'CLASSIFICATION' (default) or 'REGRESSION'; the merge is the
                same for both

    Raises ValueError for any other target (the function used to fall
    through and return None in that case).
    """
    if target not in ('CLASSIFICATION', 'REGRESSION'):
        raise ValueError("unknown target %r, expected 'CLASSIFICATION' or 'REGRESSION'" % (target,))

    subset = [dataset.iloc[:, index:] for dataset in datasets[1:]]
    return datasets[0].iloc[:, index:].join(subset, how = 'outer')
        
# def mergeDataframes(datasets, index, cut):
#     """
#     merges datasets in the list 
#     """
#     subset = []
#     subset = [dataset.iloc[:, index:] for dataset in datasets[1:]]
    
#     first = subset[0].merge(subset[1:], how = 'outer')
#     finance = datasets[0].iloc[:, index:].merge(first, how = 'left') 
#     finance = finance[finance.index > cut]
#     return finance

def applyTimeLag(dataset, lags, delta, back, target='CLASSIFICATION'):
    """
    apply time lags to the columns selected according to delta.

    Every (2*max(delta)-1)-th column of dataset, starting with the first, is
    copied once per entry of lags, shifted down by that many rows, into a new
    column named COLUMN + str(lag). The columns are added to dataset IN PLACE.
    NOTE(review): the stride presumably picks the Return_* column of each
    merged source (one return column plus two features per delta value,
    delta = 2..max(delta)) -- confirm against the merged column layout.

    dataset -- merged DataFrame, modified in place
    lags    -- list of day lags to apply
    delta   -- list of deltas used to build the features; only max(delta)
               is read
    back    -- not used here; the tail cut below is fixed at one row
    target  -- 'CLASSIFICATION' (default) or 'REGRESSION'; same processing
               for both

    Returns the lagged dataset without its first max(lags) rows and without
    its last row. Raises ValueError for any other target (the function used
    to fall through and return None in that case).
    """
    if target not in ('CLASSIFICATION', 'REGRESSION'):
        raise ValueError("unknown target %r, expected 'CLASSIFICATION' or 'REGRESSION'" % (target,))

    maxLag = max(lags)

    # Column labels are captured before any lagged column is appended
    columns = dataset.columns[::(2*max(delta)-1)]
    for column in columns:
        for lag in lags:
            newcolumn = column + str(lag)
            dataset[newcolumn] = dataset[column].shift(lag)

    # shift(lag) leaves NaN in the first lag rows, hence the head cut
    return dataset.iloc[maxLag:-1,:]
#    elif target == 'REGRESSION':
#        maxLag = max(lags)
#        
#        columns = dataset.columns[::(2*max(delta)-1)]
#        for column in columns:
#            for lag in lags:
#                newcolumn = column + str(lag)
#                dataset[newcolumn] = dataset[column].shift(lag)
#
#        return dataset.iloc[maxLag:-1,:]       


def performCV(X_train, y_train, folds, method, parameters):
    """
    given complete dataframe, number of folds, the % split to generate 
    train and test set and features to perform prediction --> splits
    dataframein test and train set. Takes train set and splits in k folds.
    - Train on fold 1, test on 2
    - Train on fold 1-2, test on 3
    - Train on fold 1-2-3, test on 4
    ....
    returns mean of test accuracies
    """
    print ''
    print 'Parameters --------------------------------> ', parameters
    print 'Size train set: ', X_train.shape
    k = int(np.floor(float(X_train.shape[0])/folds))
    print 'Size of each fold: ', k
    acc = np.zeros(folds-1)
    for i in range(2, folds+1):
        print ''
        split = float(i-1)/i
        print 'Splitting the first ' + str(i) + ' chuncks at ' + str(i-1) + '/' + str(i) 
        data = X_train[:(k*i)]
        output = y_train[:(k*i)]
        print 'Size of train+test: ', data.shape
        index = int(np.floor(data.shape[0]*split))
        X_tr = data[:index]        
        y_tr = output[:index]
        
        X_te = data[(index+1):]
        y_te = output[(index+1):]        
        
        acc[i-2] = performClassification(X_tr, y_tr, X_te, y_te, method, parameters)
        print 'Accuracy on fold ' + str(i) + ': ', acc[i-2]
    return acc.mean()   

def performTimeSeriesSearchGrid(X_train, y_train, folds, method, grid):
    """
    parameters is a dictionary with: keys --> parameter , values --> list of values of parameter
    """
    print ''
    print 'Performing Search Grid CV...'
    print 'Algorithm: ', method
    param = grid.keys()
    finalGrid = {}
    if len(param) == 1:
        for value_0 in grid[param[0]]:
            parameters = [value_0]
            accuracy = performCV(dataset, folds, split, features, method, parameters)
            finalGrid[accuracy] = parameters
        final = sorted(finalGrid.iteritems(), key=operator.itemgetter(0), reverse=True)  
        print ''
        print 'Final CV Results: ', final        
        return final[0]
        
    elif len(param) == 2:
        for value_0 in grid[param[0]]:
            for value_1 in grid[param[1]]:
                parameters = [value_0, value_1]
                accuracy = performCV(dataset, folds, split, features, method, parameters)
                finalGrid[accuracy] = parameters
        final = sorted(finalGrid.iteritems(), key=operator.itemgetter(0), reverse=True)
        print ''
        print 'Final CV Results: ', final
        return final[0]


##################
################## MERGING SENTIMENT

def mergeSentimenToStocks(stocks, sentiment_path='/home/francesco/BigData/Project/CSV/sentiment.csv'):
    """
    left-joins the sentiment table onto the stocks dataframe.

    stocks         -- DataFrame to enrich; all of its rows are kept
    sentiment_path -- CSV file with a 'date' column, used as join index;
                      defaults to the location used so far

    Returns a new DataFrame, stocks itself is not modified.

    NOTE(review): the 'date' column is read without date parsing, so it will
    only match a stocks index holding the same plain values -- confirm
    against the index type of the frames passed in.
    """
    df = pd.read_csv(sentiment_path, index_col = 'date')
    final = stocks.join(df, how='left')
    return final
       
        
###############################################################################    
###############################################################################    
###############################################################################
######## CLASSIFICATION    
    
#####IDEAS --> MULTIPLY EACH RETURN BY 100, QDA, AUC
#####    
    
def prepareDataForClassification(dataset, start_test):
    """
    generates the binary column to be predicted, attaches it to the dataframe
    and splits features and labels in train and test set.

    The label column 'UpDown' is added to dataset IN PLACE:
    1 ('Up') where Return_Out >= 0, 0 ('Down') otherwise. These are the codes
    the previous LabelEncoder step produced when both classes were present;
    they no longer depend on which classes occur in the data.

    Features are all columns except the first one and the last one ('UpDown').
    NOTE(review): this presumes Return_Out is the first column -- confirm
    against the column order produced by the merge.

    dataset    -- DataFrame with a 'Return_Out' column, modified in place
    start_test -- index value where the test set starts; rows with
                  index < start_test are train, index >= start_test are test

    Returns X_train, y_train, X_test, y_test
    """
    # One vectorised assignment instead of the chained
    # dataset.UpDown[mask] = ... writes, which raised SettingWithCopyWarning
    # and compared a column already holding strings with 0.
    dataset['UpDown'] = (dataset['Return_Out'] >= 0).astype(int)

    features = dataset.columns[1:-1]
    X = dataset[features]
    y = dataset['UpDown']

    X_train = X[X.index < start_test]
    y_train = y[y.index < start_test]

    X_test = X[X.index >= start_test]
    y_test = y[y.index >= start_test]

    return X_train, y_train, X_test, y_test

def prepareDataForModelSelection(X_train, y_train, start_validation):
    """
    splits the train set in a (smaller) train set and a validation set,
    the latter being what feature and model selection are evaluated on.

    Rows with index < start_validation stay in the train part, rows with
    index >= start_validation form the validation part.

    Returns X, y, X_val, y_val
    """
    features_before = X_train.index < start_validation
    labels_before = y_train.index < start_validation
    features_after = X_train.index >= start_validation
    labels_after = y_train.index >= start_validation

    return (X_train[features_before], y_train[labels_before],
            X_train[features_after], y_train[labels_after])
    
  
def performClassification(X_train, y_train, X_test, y_test, method, parameters):
    """
    performs classification on returns using serveral algorithms
    """
    #print ''
    print 'Performing ' + method + ' Classification...'    
    print 'Size of train set: ', X_train.shape
    print 'Size of test set: ', X_test.shape
   
    if method == 'RF':   
        return performRFClass(X_train, y_train, X_test, y_test)
        
    elif method == 'KNN':
        return performKNNClass(X_train, y_train, X_test, y_test)
    
    elif method == 'SVM':   
        return performSVMClass(X_train, y_train, X_test, y_test)
    
    elif method == 'ADA':
        return performAdaBoostClass(X_train, y_train, X_test, y_test, parameters)
    
    elif method == 'GTB': 
        return performGTBClass(X_train, y_train, X_test, y_test)

    elif method == 'QDA': 
        return performQDAClass(X_train, y_train, X_test, y_test)
    
def performRFClass(X_train, y_train, X_test, y_test):
    """
    Random Forest binary classification (100 trees, n_jobs=-1).
    Fits on the train set and returns the accuracy on the test set.
    """
    forest = RandomForestClassifier(n_jobs=-1, n_estimators=100)
    forest.fit(X_train, y_train)
    return forest.score(X_test, y_test)
        
def performKNNClass(X_train, y_train, X_test, y_test):
    """
    K nearest neighbours binary classification with default settings.
    Fits on the train set and returns the accuracy on the test set.
    """
    knn = neighbors.KNeighborsClassifier()
    knn.fit(X_train, y_train)
    return knn.score(X_test, y_test)

def performSVMClass(X_train, y_train, X_test, y_test):
    """
    Support vector binary classification (SVC with default settings).
    Fits on the train set and returns the accuracy on the test set.
    """
    svm = SVC()
    svm.fit(X_train, y_train)
    return svm.score(X_test, y_test)
    
def performAdaBoostClass(X_train, y_train, X_test, y_test, parameters):
    """
    AdaBoost binary classification.

    parameters -- sequence read by position: parameters[0] is the number of
                  estimators, parameters[1] the learning rate

    Fits on the train set and returns the accuracy on the test set.
    """
    n_estimators = parameters[0]
    learning_rate = parameters[1]
    booster = AdaBoostClassifier(n_estimators=n_estimators, learning_rate=learning_rate)
    booster.fit(X_train, y_train)
    return booster.score(X_test, y_test)
    
def performGTBClass(X_train, y_train, X_test, y_test):
    """
    Gradient tree boosting binary classification (100 estimators).
    Fits on the train set and returns the accuracy on the test set.
    """
    booster = GradientBoostingClassifier(n_estimators=100)
    booster.fit(X_train, y_train)
    return booster.score(X_test, y_test)

def performQDAClass(X_train, y_train, X_test, y_test):
    """
    Quadratic discriminant analysis (QDA) binary classification.
    Fits on the train set and returns the accuracy on the test set.
    """
    qda = QDA()
    qda.fit(X_train, y_train)
    return qda.score(X_test, y_test)









##############################################################################
##############################################################################
##############################################################################   
##############################################################################
####### REGRESSION
    
def performRegression(dataset, split):
    """
    performs regression on returns using serveral algorithms
    """

    features = dataset.columns[1:]
    index = int(np.floor(dataset.shape[0]*split))
    train, test = dataset[:index], dataset[index:]
    print 'Size of train set: ', train.shape
    print 'Size of test set: ', test.shape
    
    output = 'Return_SP500'

    #print 'Accuracy RFC: ', performRFReg(train, test, features, output)
   
    #print 'Accuracy SVM: ', performSVMReg(train, test, features, output)
   
    #print 'Accuracy BAG: ', performBaggingReg(train, test, features, output)
   
    #print 'Accuracy ADA: ', performAdaBoostReg(train, test, features, output)
   
    #print 'Accuracy BOO: ', performGradBoostReg(train, test, features, output)

    print 'Accuracy KNN: ', performKNNReg(train, test, features, output)


def performRFReg(train, test, features, output):
    """
    Random Forest regression (100 trees, n_jobs=-1).

    Fits on train, predicts test, plots the observed test values together
    with the predictions (red) and returns (mean squared error, r2 score)
    on the test set.
    """
    model = RandomForestRegressor(n_estimators=100, n_jobs=-1)
    model.fit(train[features], train[output])
    predictions = model.predict(test[features])
    observed = test[output]

    plt.plot(observed)
    plt.plot(predictions, color='red')
    plt.show()

    return mean_squared_error(observed, predictions), r2_score(observed, predictions)

def performSVMReg(train, test, features, output):
    """
    Support vector regression (SVR with default settings).

    Fits on train, predicts test, plots the observed test values together
    with the predictions (red) and returns (mean squared error, r2 score)
    on the test set.
    """
    model = SVR()
    model.fit(train[features], train[output])
    predictions = model.predict(test[features])
    observed = test[output]

    plt.plot(observed)
    plt.plot(predictions, color='red')
    plt.show()

    return mean_squared_error(observed, predictions), r2_score(observed, predictions)
    
def performBaggingReg(train, test, features, output):
    """
    Bagging Regression (BaggingRegressor with default settings).

    Fits on train, predicts test, plots the observed test values together
    with the predictions (red) and returns (mean squared error, r2 score)
    on the test set.
    """
    # Imported here because the module level import of BaggingRegressor is
    # commented out, which made this function fail with a NameError.
    from sklearn.ensemble import BaggingRegressor

    clf = BaggingRegressor()
    clf.fit(train[features], train[output])
    Predicted = clf.predict(test[features])
    
    plt.plot(test[output])
    plt.plot(Predicted, color='red')
    plt.show()        
    
    return mean_squared_error(test[output],Predicted), r2_score(test[output], Predicted)  

def performAdaBoostReg(train, test, features, output):
    """
    AdaBoost regression (AdaBoostRegressor with default settings).

    Fits on train, predicts test, plots the observed test values together
    with the predictions (red) and returns (mean squared error, r2 score)
    on the test set.
    """
    model = AdaBoostRegressor()
    model.fit(train[features], train[output])
    predictions = model.predict(test[features])
    observed = test[output]

    plt.plot(observed)
    plt.plot(predictions, color='red')
    plt.show()

    return mean_squared_error(observed, predictions), r2_score(observed, predictions)

def performGradBoostReg(train, test, features, output):
    """
    Gradient Boosting Regression (GradientBoostingRegressor with default settings).

    Fits on train, predicts test, plots the observed test values together
    with the predictions (red) and returns (mean squared error, r2 score)
    on the test set.
    """
    
    clf = GradientBoostingRegressor()
    # Fit on the TRAIN features: the model used to be fitted on
    # test[features] against train[output], i.e. on mismatched rows.
    clf.fit(train[features], train[output])
    Predicted = clf.predict(test[features])
    
    plt.plot(test[output])
    plt.plot(Predicted, color='red')
    plt.show()    
    
    return mean_squared_error(test[output],Predicted), r2_score(test[output], Predicted)

def performKNNReg(train, test, features, output):
    """
    K nearest neighbours regression (KNeighborsRegressor with default settings).

    Fits on train, predicts test, plots the observed test values together
    with the predictions (red) and returns (mean squared error, r2 score)
    on the test set.
    """
    model = KNeighborsRegressor()
    model.fit(train[features], train[output])
    predictions = model.predict(test[features])
    observed = test[output]

    plt.plot(observed)
    plt.plot(predictions, color='red')
    plt.show()

    return mean_squared_error(observed, predictions), r2_score(observed, predictions)



In [160]:
# def getStock(symbol, start, end):
#     """
#     downloads stock which is gonna be the output of prediciton
#     """
#     out =  pd.io.data.DataReader(symbol, start, end)
#     out = out.to_frame()
#     out.columns.values[-1] = 'AdjClose'
#     out.columns = out.columns + '_Out'
#     out['Return_Out'] = out['AdjClose_Out'].pct_change()
#     return out

def getStock(symbol, start, end):
    """
    downloads the stock which is going to be the output of the prediction.

    symbol     -- ticker passed to get_data_yahoo
    start, end -- date range of the download

    Returns the downloaded frame with its last column renamed to 'AdjClose',
    every column label suffixed with '_Out', and an extra 'Return_Out'
    column holding the row-to-row percentage change of 'AdjClose_Out'.
    """
    out =  pd.io.data.get_data_yahoo(symbol, start, end)

    # Build the new labels in a list instead of writing into
    # out.columns.values, which modifies the Index's backing array in place.
    labels = list(out.columns)
    labels[-1] = 'AdjClose'
    out.columns = [label + '_Out' for label in labels]
    out['Return_Out'] = out['AdjClose_Out'].pct_change()
    return out

def loadDatasets(path_directory): 
    """
    reads the index datasets saved as CSV files in path_directory.

    Each file is read with its first column as index and date parsing
    switched on. The frames are returned as a list in this order:
    sp, nasdaq, djia, treasury, hkong, frankfurt, paris, nikkei, london,
    australia.
    """
    file_stems = ['sp', 'nasdaq', 'djia', 'treasury', 'hkong',
                  'frankfurt', 'paris', 'nikkei', 'london', 'australia']

    frames = []
    for stem in file_stems:
        name = path_directory + '/' + stem + '.csv'
        frames.append(pd.read_csv(name, index_col=0, parse_dates=True))
    return frames

def applyRollMeanDelayedReturns(datasets, delta):
    """
    runs addFeatures on every dataset for every n in delta.

    The last two columns of each dataset, as they are before any feature is
    added, are passed to addFeatures as adjclose (second to last) and
    returns (last). Datasets are modified in place, nothing is returned.
    """
    for frame in datasets:
        adjclose, returns = frame.columns[-2], frame.columns[-1]
        for n in delta:
            addFeatures(frame, adjclose, returns, n)
        

def performFeatureSelection(maxdeltas, maxlags, cut, start_test, path_datasets, method, folds, parameters):
    """
    Performs Feature selection for a specific algorithm
    """
    accuracies = []
    for maxlag in range(3, maxlags + 2):
        lags = range(2, maxlag) 
        print ''
        print '============================================================='
        print 'Maximum time lag applied', max(lags)
        print ''
        for maxdelta in range(3, maxdeltas + 2):
            datasets = loadDatasets(path_datasets)
            delta = range(2, maxdelta) 
            print 'Delta days accounted: ', max(delta)
            
            start = datetime.datetime(1993, 1, 1)
            end = datetime.datetime(2015, 8, 31)
            out = getStock('GE', start, end)
            datasets.insert(0, out)  
            applyRollMeanDelayedReturns(datasets, delta)
            
            #finance = mergeDataframes(datasets, 6, cut)
            finance = mergeDataframes(datasets, 6, "CLASSIFICATION")
            print 'Size of data frame: ', finance.shape
            print 'Number of NaN after merging: ', count_missing(finance)
            finance = finance.interpolate(method='linear')
            print 'Number of NaN after time interpolation: ', count_missing(finance)
            finance = finance.fillna(finance.mean())
            print 'Number of NaN after mean interpolation: ', count_missing(finance)    
            
            back = -1
            finance.Return_Out = finance.Return_Out.shift(back)
#TIME LAG???  
            finance = applyTimeLag(finance, lags, delta, back, target)

            #finance = applyTimeLag(finance, lags, delta)
            print 'Number of NaN after temporal shifting: ', count_missing(finance)
            print 'Size of data frame after feature creation: ', finance.shape
            X_train, y_train, X_test, y_test  = prepareDataForClassification(finance, start_test)
            accuracy = performCV(X_train, y_train, folds, method, parameters)
            print accuracy
            accuracies.append((lags,delta,accuracy))
            print ''
    return accuracies

In [187]:
# Single classification run: build the feature frame for the GE stock plus the index
# datasets, split it at start_test and score a random forest.

#Setup and load datasets
target = 'CLASSIFICATION'
lags = range(2, 11)
print 'Maximum time lag applied', max(lags)
start = datetime.datetime(1993, 1, 1)
end = datetime.datetime(2015, 8, 31)
out = getStock('GE', start, end)
datasets = loadDatasets('/Users/Kevin/Desktop/StocksProject/datasets')

#Insert GE stock as first dataset
datasets.insert(0, out)    
delta = range(2, 11)
print 'Max Delta days accounted: ', max(delta)
    
#Add features to each stock, number of features added depends on deltas
#(same loop as applyRollMeanDelayedReturns(datasets, delta); frames are modified in place)
for dataset in datasets:
    columns = dataset.columns    
    adjclose = columns[-2]
    returns = columns[-1]
    for n in delta:
        addFeatures(dataset, adjclose, returns, n)

#Merge our augmented dataframes (columns from position 6 onwards, outer join on the index)
finance = mergeDataframes(datasets, 6, target)

#Sanity check
print 'Size of data frame: ', finance.shape
print 'Number of NaN after merging: ', count_missing(finance)
print '% of NaN after merging: ', (count_missing(finance)/float(finance.shape[0]*finance.shape[1]))*100, '%'

#Interpolate is a built-in function that replaces missing values (i.e. nan) with estimates. With method='time'
#it is "time aware": it looks at the dates (i.e. the index) to determine what value to fill in
finance = finance.interpolate(method = 'time')
print 'Number of NaN after time interpolation: ', count_missing(finance)

#Still some nan left, fill with the column mean
finance = finance.fillna(finance.mean())
print 'Number of NaN after mean interpolation: ', count_missing(finance)    

#Why the shift: shift(-1) moves the next row's Return_Out onto the current row, so the features of a
#day get paired with the following day's return (performFeatureSelection does exactly this).
#NOTE(review): with the shift and the time lag disabled below, UpDown is built from the same-day
#Return_Out while the features still contain the same-day Out* columns derived from AdjClose_Out and
#Return_Out -- this looks like target leakage; confirm before trusting the accuracy printed below.
back = -1
#finance.Return_Out = finance.Return_Out.shift(back)
#TIME LAG???  
#finance = applyTimeLag(finance, lags, delta, back, target)
#finance = functions.mergeSentimenToStocks(finance)

#Nothing was shifted above, so these two numbers repeat the state after the mean fill
print 'Number of NaN after temporal shifting: ', count_missing(finance)
print 'Size of data frame after feature creation: ', finance.shape

#Create train, test set. Split at start_test
if target == 'CLASSIFICATION':
    start_test = datetime.datetime(2014,4,1)
    X_train, y_train, X_test, y_test  = prepareDataForClassification(finance, start_test)
    
#Original classifier has 40% accuracy, take the opposite of every guess to get 60% accuracy
#First call: baseline trained on random labels (0 with p=.65, 1 with p=.35); no seed is set in this
#cell, so its score changes from run to run. Second call: trained on the real labels.
print performClassification(X_train, np.random.choice(2,len(y_train),p=[.65,.35]), X_test, y_test, "RF", [])
print performClassification(X_train, y_train, X_test, y_test, "RF", [])
#[0 if y==1 else 1 for y in y_test]

Maximum time lag applied 10
Max Delta days accounted:  10
Size of data frame:  (6209, 209)
Number of NaN after merging:  96446
% of NaN after merging:  7.43218094432 %
Number of NaN after time interpolation:  49636
Number of NaN after mean interpolation:  0
Number of NaN after temporal shifting:  0
Size of data frame after feature creation:  (6209, 209)
Performing RF Classification...
Size of train set:  (5796, 208)
Size of test set:  (413, 208)
0.530266343826
Performing RF Classification...
Size of train set:  (5796, 208)
Size of test set:  (413, 208)
0.755447941889


A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [182]:
# Forward-chaining cross validation of the random forest in 10 chunks, without extra parameters.
# NOTE(review): X_train / y_train are not defined in this cell; it uses whatever another cell left in
# the kernel.
#np.random.choice(2,len(y_train)p=[.5,.5])
print performCV(X_train, y_train, 10, "RF", [])


Parameters -------------------------------->  []
Size train set:  (5786, 307)
Size of each fold:  578

Splitting the first 2 chuncks at 1/2
Size of train+test:  (1156, 307)
Performing RF Classification...
Size of train set:  (578, 307)
Size of test set:  (577, 307)
Accuracy on fold 2:  0.540727902946

Splitting the first 3 chuncks at 2/3
Size of train+test:  (1734, 307)
Performing RF Classification...
Size of train set:  (1156, 307)
Size of test set:  (577, 307)
Accuracy on fold 3:  0.538994800693

Splitting the first 4 chuncks at 3/4
Size of train+test:  (2312, 307)
Performing RF Classification...
Size of train set:  (1734, 307)
Size of test set:  (577, 307)
Accuracy on fold 4:  0.500866551127

Splitting the first 5 chuncks at 4/5
Size of train+test:  (2890, 307)
Performing RF Classification...
Size of train set:  (2312, 307)
Size of test set:  (577, 307)
Accuracy on fold 5:  0.46273830156

Splitting the first 6 chuncks at 5/6
Size of train+test:  (3468, 307)
Performing RF Classifica

In [174]:
# Show the first rows of `out`, the output stock frame (defined in another cell)
out.head()

Unnamed: 0_level_0,Open_Out,High_Out,Low_Out,Close_Out,Volume_Out,AdjClose_Out,Return_Out,OutTime2,OutRolMean2,OutTime3,...,OutTime6,OutRolMean6,OutTime7,OutRolMean7,OutTime8,OutRolMean8,OutTime9,OutRolMean9,OutTime10,OutRolMean10
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1993-01-04,85.625038,86.124962,85.250038,85.5,9048000,3.78224,,,,,...,,,,,,,,,,
1993-01-05,85.374962,86.874962,85.250038,86.499962,13648800,3.826475,0.011695,,,,...,,,,,,,,,,
1993-01-06,86.000038,86.625,85.749962,86.124962,12327600,3.809886,-0.004335,0.007309,0.00368,,...,,,,,,,,,,
1993-01-07,86.375038,87.249962,85.5,85.875,15777600,3.798829,-0.002902,-0.007225,-0.003619,0.004386,...,,,,,,,,,,
1993-01-08,85.875,86.25,84.875038,85.749962,14679600,3.793297,-0.001456,-0.004354,-0.002179,-0.008671,...,,,,,,,,,,


In [159]:
# Directory with the index CSV files read by loadDatasets
path='/Users/Kevin/Desktop/StocksProject/datasets'

#Where to split train,test
start_test=datetime.datetime(2014,4,1)

#Only look at data after cut
#NOTE(review): performFeatureSelection receives cut, but its only use there is in a commented-out line
cut = datetime.datetime(1993,1,1)

# Classifier code understood by performClassification ('RF' = random forest)
method="RF"
# Number of chunks used by performCV
folds = 10
# Extra classifier parameters; performClassification only forwards them to 'ADA'
parameters = []
# The two 9s are maxdeltas and maxlags
performFeatureSelection(9, 9, cut, start_test, path, method, folds, parameters)


Maximum time lag applied 2

Delta days accounted:  2
Size of data frame:  (6209, 33)
Number of NaN after merging:  15094
Number of NaN after time interpolation:  7702
Number of NaN after mean interpolation:  0
Number of NaN after temporal shifting:  0
Size of data frame after feature creation:  (6206, 44)

Parameters -------------------------------->  []
Size train set:  (5794, 43)
Size of each fold:  579

Splitting the first 2 chuncks at 1/2
Size of train+test:  (1158, 43)
Performing RF Classification...
Size of train set:  (579, 43)
Size of test set:  (578, 43)
Accuracy on fold 2:  0.532871972318

Splitting the first 3 chuncks at 2/3
Size of train+test:  (1737, 43)
Performing RF Classification...
Size of train set:  (1158, 43)
Size of test set:  (578, 43)
Accuracy on fold 3:  0.522491349481

Splitting the first 4 chuncks at 3/4
Size of train+test:  (2316, 43)
Performing RF Classification...
Size of train set:  (1737, 43)
Size of test set:  (578, 43)
Accuracy on fold 4:  0.5346020761

A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


KeyboardInterrupt: 

In [None]:
# -*- coding: utf-8 -*-
"""
Created on Sat Aug 30 19:29:03 2014
@author: francesco
"""

# Earlier script version of the pipeline, predicting Return_SP500 from the index datasets only.
# NOTE(review): several calls below do not match the functions as defined in this notebook (see the
# notes next to them); check the signatures before running this cell.

#import pandas as pd
#import matplotlib.pyplot as plt
#import numpy as np
#from Project import loadDatasets



target = 'CLASSIFICATION'
#target = 'REGRESSION'

lags = range(2,3)
print 'Maximum time lag applied', max(lags)
print ''

# One full run per maximum delta, from 2 to 10
for maxdelta in range(3,12):
    
    datasets = loadDatasets('/home/francesco/Dropbox/DSR/Project/datasets')

    delta = range(2,maxdelta)
    print 'Delta days accounted: ', max(delta)
    
    for dataset in datasets:
        columns = dataset.columns    
        adjclose = columns[-2]
        returns = columns[-1]
        for n in delta:
            addFeatures(dataset, adjclose, returns, n)
        dataset = dataset.iloc[max(delta):,:] # computing returns and moving means introduces NaN rows at the top. NOTE(review): this only rebinds the loop variable, the frame stored in `datasets` keeps those rows
    
    # NOTE(review): called without a `target` argument -- check against the signature of mergeDataframes
    finance = mergeDataframes(datasets, 6)
    
    print 'Size of data frame: ', finance.shape
    print 'Number of NaN after merging: ', count_missing(finance)
    
    finance = finance.interpolate(method='time')
    # Same quantity as count_missing(finance), spelled out
    print 'Number of NaN after time interpolation: ', finance.shape[0]*finance.shape[1] - finance.count().sum()

    finance = finance.fillna(finance.mean())
    print 'Number of NaN after mean interpolation: ', (finance.shape[0]*finance.shape[1] - finance.count().sum())    

    # shift(-1) puts the next row's return on the current row
    back = -1
    finance.Return_SP500 = finance.Return_SP500.shift(back)
    
    # NOTE(review): called without a `target` argument -- check against the signature of applyTimeLag
    finance = applyTimeLag(finance, lags, delta, back)
    
    print 'Number of NaN after temporal shifting: ', count_missing(finance)
    
    print 'Size of data frame after feature creation: ', finance.shape   
    
    if target == 'CLASSIFICATION':
        # NOTE(review): two arguments, while performClassification as defined in this notebook takes
        # (X_train, y_train, X_test, y_test, method, parameters) -- looks like a call to an older version
        performClassification(finance, 0.8)
        print ''
    
    elif target == 'REGRESSION':
        performRegression(finance, 0.8)
        print ''