In [None]:
import requests
import os
import errno
import zipfile
import csv
import urllib
import glob
import pandas as pd

# Constants
# Credentials are read from the environment so that no secret is stored in the
# notebook. Set FREDDIEMAC_USERNAME / FREDDIEMAC_PASSWORD before running; when
# they are unset both values are empty strings.
USERNAME = os.environ.get('FREDDIEMAC_USERNAME', '')
PASSWORD = os.environ.get('FREDDIEMAC_PASSWORD', '')
# The two quarters that start_execution downloads.
START = 'Q12007'
END = 'Q22007'
# Directory (with trailing slash) that receives archives, extracted files and CSVs.
DIR_NAME = "data/"
login_page_url = 'https://freddiemac.embs.com/FLoan/secure/auth.php'
download_page_url = 'https://freddiemac.embs.com/FLoan/Data/download2.php'
# NOTE(review): the two flags below are not referenced by any code visible in
# this notebook.
REMOVE_UNZIPPED_FILES = False
VERBOSE_MODE=True

# Header for the generated CSV files: every character is one column name.
FIELDS = 'ABCDEFGHIJKL0MNOPQRSTUVWX'

## Directory creation if doesn't exist
def create_directory(dir_name):
    """Create ``dir_name`` (including missing parents) unless it already exists.

    Args:
        dir_name: Path of the directory to create.

    Raises:
        OSError: If the directory cannot be created, e.g. for lack of
            permissions or because ``dir_name`` exists but is not a directory.
    """
    # exist_ok=True makes the call idempotent without a separate (racy)
    # os.path.exists() check.
    os.makedirs(dir_name, exist_ok=True)

## Gets the zipped file for a quarter and extracts its contents into DIR_NAME
def get_data_from_url(quarter):
    """Download one quarter's archive, extract it into DIR_NAME and convert it to CSV.

    Args:
        quarter: Quarter identifier such as ``'Q12007'``; it is interpolated
            into both the download URL and the local file names.

    A corrupt archive is reported on stdout and the quarter is skipped; the
    CSV conversion only runs after a successful extraction.
    """
    # A bare `import urllib` does not guarantee that the `request` submodule
    # is loaded, so import it explicitly.
    import urllib.request

    archive_path = DIR_NAME + str(quarter) + '.zip'
    print('downloading...')
    # NOTE(review): this request does not go through the requests.Session that
    # start_execution logs in with -- confirm the endpoint is reachable without
    # that session's cookies.
    urllib.request.urlretrieve('https://freddiemac.embs.com/FLoan/Data/historical_data1_' + str(quarter) + '.zip',
                               archive_path)
    try:
        # The context manager closes the archive even if extraction fails.
        with zipfile.ZipFile(archive_path, 'r') as zip_ref:
            zip_ref.extractall(DIR_NAME)
    except zipfile.BadZipfile as exc:
        # Report the actual error instead of the exception class.
        print('bad zip file %s: %s' % (archive_path, exc))
        return

    write_into_csv(quarter)

## Creates a consolidated file

## Removes the downloaded archive and extracted text files from the directory
def clean_directory(quarter):
    """Delete the archive and the two extracted text files of ``quarter`` from DIR_NAME.

    Raises whatever ``os.remove`` raises (e.g. when one of the files is missing).
    """
    quarter_tag = str(quarter)
    leftovers = (
        DIR_NAME + quarter_tag + '.zip',
        DIR_NAME + "historical_data1_" + quarter_tag + '.txt',
        DIR_NAME + "historical_data1_time_" + quarter_tag + '.txt',
    )
    for leftover in leftovers:
        os.remove(leftover)

## Creates a consolidated file
def create_csv():
    """Create (or truncate) the consolidated CSV under DIR_NAME and write its header.

    Returns:
        The path of the file that was written.
    """
    fields = 'ABCDEFGHIJKL0MNOPQRSTUVWX'
    path = DIR_NAME + '/joinedRawData.csv'
    # The csv module expects files opened with newline='' so that it controls
    # the line endings itself; the `with` block closes the file.
    with open(path, 'w', newline='') as file:
        # A string row is iterated character by character, giving one header
        # column per letter.
        csv.writer(file).writerow(fields)
    return path
    
# Writes the text file lines into csv
def write_into_csv(quarter):
    """Convert a quarter's pipe-delimited text file into ``DIR_NAME + quarter + '.csv'``.

    Existing commas in the data become underscores, pipes become commas, and a
    trailing delimiter at the end of a line is dropped. A header row built from
    FIELDS (one column per character) is written first. Afterwards the
    downloaded and extracted files of the quarter are removed via
    clean_directory.

    Args:
        quarter: Quarter identifier such as ``'Q12007'``.
    """
    source_path = DIR_NAME + "historical_data1_" + str(quarter) + '.txt'
    destination_path = DIR_NAME + str(quarter) + '.csv'

    # newline='' plus an explicit lineterminator gives the header the same
    # '\n' line ending as the data rows written below.
    with open(source_path, 'r') as sourceFile, \
            open(destination_path, 'w', newline='') as destinationFile:
        csv.writer(destinationFile, lineterminator='\n').writerow(FIELDS)
        # Stream line by line so the whole file never has to be held (and
        # copied several times) in memory.
        for line in sourceFile:
            line = line.replace(",", "_").replace("|", ",")
            if line.endswith(",\n"):
                # Drop the comma produced by a trailing '|' delimiter.
                line = line[:-2] + "\n"
            destinationFile.write(line)
    print('written to csv.')
    clean_directory(quarter)
    
def data_cleaning(pattern='data/*.csv'):
    """Impute and one-hot encode every raw CSV matched by ``pattern``.

    For each matched file ``<name>.csv`` a cleaned copy ``<name>clean.csv`` is
    written next to it:

    * missing values in the categorical columns are replaced by the column's
      most frequent value;
    * missing values in selected numeric columns are replaced by the integer
      mean of the valid values (the sentinel 999 is excluded from the mean);
    * categorical columns are expanded with ``pd.get_dummies`` and any
      remaining missing value becomes 0.

    Files that already end in ``clean.csv`` are skipped, so the function can be
    run repeatedly over the same directory.

    Args:
        pattern: Glob pattern selecting the raw CSV files.
    """
    cat_columns = ['C','G','H','M','N','O','P','Q','T','U','V','W','X']
    num_columns = ['A','B','D','E','F','I','J','K','L','R','0']
    mean_fill_columns = ['E','F','I','J','K','L','R']
    missing_sentinel = 999

    for file in glob.glob(pattern):
        if file.endswith('clean.csv'):
            # Output of an earlier run; cleaning it again would fail or
            # produce '...cleanclean.csv'.
            continue

        df = pd.read_csv(file)
        print('categorical cleaning...')
        for col in cat_columns:
            # mode() ignores NaN and returns the modes sorted, so the first
            # entry is the most frequent value (smallest one on ties).
            modes = df[col].mode()
            if not modes.empty:
                df[col] = df[col].fillna(modes.iloc[0])
        print('numerical cleaning...')
        for col in mean_fill_columns:
            # Both conditions must hold: present AND not the sentinel.
            valid = df.loc[df[col].notna() & (df[col] != missing_sentinel), col]
            if not valid.empty:
                df[col] = df[col].fillna(int(valid.mean()))

        cat = pd.get_dummies(df[cat_columns])
        df = pd.concat([df[num_columns],cat],axis = 1).fillna(0)

        filename = "%sclean.csv"%file[:-4]
        # NOTE(review): the row index is written as an unnamed first column;
        # readers of the clean file get it back as 'Unnamed: 0'.
        df.to_csv(filename)

        print('cleaning finished.')
        
        
## Main Program Execution
def start_execution():
    """Log in, accept the terms, download the START and END quarters and clean them."""
    with requests.Session() as session:
        # Fetch the login page first so the PHPSESSID cookie can be read from
        # the session.
        session.get(login_page_url)
        session_id = session.cookies['PHPSESSID']

        credentials = {
            'username': USERNAME,
            'password': PASSWORD,
            'cookie': session_id,
        }
        session.post(login_page_url, data=credentials)

        terms_form = {
            'accept': 'Yes',
            'action': 'acceptTandC',
            'acceptSubmit': 'Continue',
            'cookie': session_id,
        }
        session.post(download_page_url, data=terms_form)

        create_directory(DIR_NAME)

        # Further quarters (e.g. 'Q32007', 'Q42007', 'Q12008') can be added here.
        for quarter in (START, END):
            get_data_from_url(quarter)

        data_cleaning()

## Calling all the main functions
# Runs the download-and-clean pipeline when this code is executed as the main
# module, then prints a completion message.
if __name__ == "__main__":
    start_execution()
    print('finished.')

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error
from math import sqrt
from sklearn import preprocessing

df_train = pd.read_csv('data/Q12005clean.csv')
df_test = pd.read_csv('data/Q22005clean.csv')
# Concatenating aligns the columns of both quarters; a dummy column that exists
# in only one of them comes out as NaN for the other, which means "absent" (0).
df = pd.concat([df_train,df_test],axis = 0).fillna(0)

# Features must not contain the target ('0'), nor the row index that
# DataFrame.to_csv stored and read_csv returns as 'Unnamed: 0'.
feature_frame = df.drop(columns=['0', 'Unnamed: 0'], errors='ignore')

X_train = feature_frame.head(df_train.shape[0])
y_train = df_train['0']

X_test = feature_frame.tail(df_test.shape[0])
y_test = df_test['0']

# Fit the encoder once on all target values so that train and test labels
# share the same mapping.
lab_enc = preprocessing.LabelEncoder()
lab_enc.fit(pd.concat([y_train, y_test], axis = 0))
y_train_encoded = lab_enc.transform(y_train)
y_test_encoded = lab_enc.transform(y_test)

def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error between two sequences, in percent.

    Both arguments are converted to NumPy arrays; each absolute error is
    expressed relative to the corresponding true value before averaging.
    """
    actual = np.asarray(y_true)
    predicted = np.asarray(y_pred)
    relative_error = np.abs((actual - predicted) / actual)
    return np.mean(relative_error) * 100

def printPerformance(pred, y_true=None):
    """Print the predictions followed by their RMSE, MAPE and MAE.

    Args:
        pred: Predicted target values.
        y_true: True target values to score against. When omitted, the
            notebook-global ``y_test`` that is current at call time is used.
    """
    if y_true is None:
        # Backward-compatible default: score against the global test target.
        y_true = y_test
    print(pred)
    print("RMSE: %.2f"
          % sqrt(mean_squared_error(y_true, pred)))
    print("MAPE: %.2f%%"
          % mean_absolute_percentage_error(y_true, pred))
    print("MAE: %.2f"
          % mean_absolute_error(y_true, pred))
    
#Linear, random forest, and neural network models
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor

# Each estimator is fitted on the training split, then its predictions for the
# test split are reported through printPerformance.
linear = LinearRegression().fit(X_train, y_train)
linearPredict = linear.predict(X_test)
printPerformance(linearPredict)

randomForest = RandomForestRegressor(
    max_depth=18,
    n_estimators=16,
    random_state=2,
).fit(X_train, y_train)
randomForestPredict = randomForest.predict(X_test)
printPerformance(randomForestPredict)

neuralNetwork = MLPRegressor().fit(X_train, y_train)
neuralNetworkPredict = neuralNetwork.predict(X_test)
printPerformance(neuralNetworkPredict)

# Build RF classifier to use in feature selection
from sklearn.ensemble import RandomForestClassifier
from mlxtend.feature_selection import SequentialFeatureSelector as sfs
clf = RandomForestClassifier(n_estimators=100, n_jobs=-1)

#Forward Selection
fs = sfs(clf,
        k_features=9,
        forward=True,   # forward selection, as the heading says
        floating=False,
        n_jobs=-1,
        verbose=2,
        scoring='neg_mean_absolute_error',
        cv=10)
# A classifier needs discrete labels, so fit on the label-encoded target
# (the same target the disabled exhaustive search in this notebook uses).
fs.fit(X_train,y_train_encoded)
# SequentialFeatureSelector exposes k_score_ / k_feature_names_. The score is a
# negated MAE, so flip the sign before formatting it.
print('Best MAE score: %.2f' % -fs.k_score_)
print('Best subset:', fs.k_feature_names_)

# The triple-quoted string below is disabled code: as a bare string expression
# it is evaluated and discarded, so nothing inside it runs.
# NOTE(review): if it is re-enabled, the last two prints reference `efs` (the
# imported class alias) rather than the fitted `es` instance.
'''
#Backward Selection
bs = sfs(clf,
        k_features=9,
        forward=False, 
        floating=False,
        n_jobs=-1,
        verbose=2,
        scoring='neg_mean_absolute_error',
        cv=10)
bs.fit(X_train,y_train)

#Exhaustive Selection

from mlxtend.feature_selection import ExhaustiveFeatureSelector as efs
es = efs(clf, 
          min_features=8,
          max_features=11,
          scoring='neg_mean_absolute_error',
          n_jobs=-1,
          print_progress=True,
          cv=8)
es.fit(X_train,y_train_encoded)
print('Best MAE score: %.2f' % efs.best_score_ * (-1))
print('Best subset:', efs.best_feature_names_)
'''

#Validation
from sklearn.model_selection import cross_val_score

# 3-fold cross-validation of every model on the test split; the scores of each
# model are printed in turn.
for estimator in (linear, randomForest, neuralNetwork):
    scores = cross_val_score(estimator, X_test, y_test, cv=3)
    print(scores)

In [None]:
#selector = SelectKBest(f_regression, k=20).fit(X_train,y_train)
#k_best_features = X_train.columns.values[selector.get_support()]
#
##tpot
#from tpot import TPOTRegressor
#
#pipeline_optimizer = TPOTRegressor(generations=5, population_size=20, cv=5,
#                                    random_state=42, verbosity=2)
#pipeline_optimizer.fit(X_train, y_train)
#print(pipeline_optimizer.score(X_test, y_test))
#pipeline_optimizer.export('tpot_exported_pipeline.py')

#Pipeline Exported
from sklearn.linear_model import LassoLarsCV
from sklearn.model_selection import train_test_split

# NOTE: Make sure that the class is labeled '0' in the data file
tpot_data = df
features = tpot_data.drop('0', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['0'].values, random_state=42)

# Average CV score on the training set was:-6.254833533625527e-26
# NOTE(review): recent scikit-learn releases appear to have dropped the
# `normalize` argument -- confirm against the installed version.
exported_pipeline = LassoLarsCV(normalize=True)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
# printPerformance scores against the notebook-global y_test, so point it at
# the hold-out targets that belong to `results` (the previous y_test comes from
# a different split and has a different length).
y_test = testing_target
printPerformance(results)

In [None]:
#Use downloaddata.py with proper settings to download the data.
# Every cleaned quarter file is read into its own frame; the files are read in
# the order in which they are listed.
# NOTE(review): df5 is read from a Q4 file while its neighbours use Q1 files --
# confirm this is intended.
df0, df1, df2, df3, df4, df5 = (
    pd.read_csv(path) for path in (
        'data/Q12000clean.csv',
        'data/Q12001clean.csv',
        'data/Q12002clean.csv',
        'data/Q12003clean.csv',
        'data/Q12004clean.csv',
        'data/Q42005clean.csv',
    )
)

df6, df7, df8, df9, df10, df11, df12 = (
    pd.read_csv(path) for path in (
        'data/Q12006clean.csv',
        'data/Q12007clean.csv',
        'data/Q12008clean.csv',
        'data/Q12009clean.csv',
        'data/Q12010clean.csv',
        'data/Q12011clean.csv',
        'data/Q12012clean.csv',
    )
)

# Remaining quarters of 2007.
df72, df73, df74 = (
    pd.read_csv(path) for path in (
        'data/Q22007clean.csv',
        'data/Q32007clean.csv',
        'data/Q42007clean.csv',
    )
)


#top features from select k best features.
k_best_features = [
    'D', 'F', 'I', 'K', 'L', 'U',
    'C_Y', 'H_I', 'H_P', 'Q_MH', 'T_P',
    'W_BANKOFAMERICA_NA', 'W_COUNTRYWIDE', 'W_GMACMTGECORP',
    'W_NATLCITYMTGECO', 'W_WASHINGTONMUTUALBANK',
    'X_BANKOFAMERICA_NA', 'X_COUNTRYWIDE',
    'X_GMACMORTGAGE_LLC', 'X_NATLCITYMTGECO',
]


In [None]:
#What if there is a financial crisis...

# Sampling is seeded so the cell is reproducible. A dummy column missing from
# one of the concatenated quarters comes out as NaN and is set to 0 ("absent").
df_train = pd.concat([df7,df72,df73,df74],axis = 0).sample(frac = 0.1, random_state=2).fillna(0)
# NOTE(review): the test frame also contains df72/df73/df74, which are part of
# the training frame -- confirm this overlap is intended.
df_test = pd.concat([df8,df72,df73,df74],axis = 0).sample(frac = 0.1, random_state=2).fillna(0)

#Pipeline random forest
from sklearn.ensemble import RandomForestRegressor
randomForest = RandomForestRegressor(max_depth= 18, n_estimators = 16, random_state=2)

X_train = df_train[k_best_features]
y_train = df_train['0']
X_test = df_test[k_best_features]
y_test = df_test['0']

randomForest.fit(X_train,y_train)
# Report the result instead of discarding the prediction.
printPerformance(randomForest.predict(X_test))

#Two Years Later..
# reindex supplies an all-zero column for any selected feature that this
# quarter's file does not contain.
X_test = df9.reindex(columns=k_best_features, fill_value=0)
y_test = df9['0']
# The model is already fitted on X_train/y_train above; fitting it again on the
# same data with the same random_state would only repeat the work.
printPerformance(randomForest.predict(X_test))

In [None]:
#What if there is an economic boom...

# Seeded sample; a dummy column missing from a quarter is set to 0 ("absent").
df_train = pd.concat([df0,df2,df4,df6,df8,df10,df12],axis = 0).sample(frac = 0.1, random_state=2).fillna(0)

# NOTE(review): the original sliced with an undefined `shape` and the tuple
# `df_test.shape`; presumably the intent was to cap the training rows at the
# size of the earlier df_test -- confirm. Features and target are taken from
# the same (first) rows so they stay aligned.
n_rows = df_test.shape[0]
X_train = df_train[k_best_features].head(n_rows)
y_train = df_train['0'].head(n_rows)
# NOTE(review): df12 is also part of the training sample above -- confirm.
X_test = df12.reindex(columns=k_best_features, fill_value=0)
# printPerformance scores against the global y_test, so it must match X_test.
y_test = df12['0']
randomForest.fit(X_train,y_train)
printPerformance(randomForest.predict(X_test))

In [None]:
#What if there is a regime change from election

# NOTE(review): the original sliced with an undefined `shape` and the tuple
# `df_test.shape`; presumably the intent was to cap the training rows at the
# size of the earlier df_test -- confirm. Features and target are taken from
# the same (first) rows so they stay aligned.
n_rows = df_test.shape[0]
# reindex supplies an all-zero column for any selected feature that a
# quarter's file does not contain.
X_train = df6.reindex(columns=k_best_features, fill_value=0).head(n_rows)
y_train = df6['0'].head(n_rows)
X_test = df12.reindex(columns=k_best_features, fill_value=0)
y_test = df12['0']

randomForest.fit(X_train,y_train)
printPerformance(randomForest.predict(X_test))