In [26]:
#train list is [ageEnteringOffice, days]
def dfToLabels(df):
    """Split a frame into two-feature rows and their regression targets.

    Walks ``df`` row by row and returns a ``(features, targets)`` tuple:
    ``features`` is a list of ``[ageEnteringOffice, Days]`` pairs and
    ``targets`` is the list of matching ``percentAsForeign`` values, both
    in row order.
    """
    records = [record for _, record in df.iterrows()]
    features = [[record.ageEnteringOffice, record.Days] for record in records]
    targets = [record.percentAsForeign for record in records]
    return features, targets

In [57]:
def makePlot(reg,trainList,trainAnswers,testLabels,testAnswers,title):
    """Draw a two-feature regression as a 3D wireframe with the data overlaid.

    The model is evaluated on a 30 x 30 grid spanning 10000-30000 on the
    first feature (x axis) and 200-3500 on the second (y axis); the
    predictions form the wireframe surface. Training points are drawn in
    purple and test points in yellow.

    Parameters
    ----------
    reg : fitted estimator whose ``predict`` accepts rows of two features,
        ordered ``[ageEnteringOffice, Days]`` as produced by ``dfToLabels``.
        A model fitted on three features cannot be plotted here.
    trainList, testLabels : sequences of feature rows; element 0 is plotted
        on the x axis and element 1 on the y axis.
    trainAnswers, testAnswers : target values plotted on the z axis.
    title : figure title.
    """
    fidelity = 30
    dateLinSpace = np.linspace(200,3500,fidelity)
    ageEnteringOfficeLinSpace = np.linspace(10000,30000,fidelity)
    x,y = np.meshgrid(ageEnteringOfficeLinSpace,dateLinSpace)

    # Predict the whole grid in one call. Besides being faster than one call
    # per cell, this avoids storing a 1-element array into a scalar slot,
    # which NumPy has deprecated.
    gridFeatures = np.column_stack((x.ravel(), y.ravel()))
    z = np.asarray(reg.predict(gridFeatures), dtype=float).reshape(x.shape)

    fig = plt.figure()
    # Figure.gca() no longer accepts keyword arguments in current Matplotlib;
    # add_subplot is the supported way to request a 3D axes.
    ax = fig.add_subplot(111, projection='3d')
    ax.plot_wireframe(x,y,z)
    ax.set_xticks(np.arange(5000,30000,5000))
    ax.set_xlabel('Age when taking position (Days)')
    ax.set_ylabel('Days in position')
    ax.scatter([features[0] for features in testLabels],
               [features[1] for features in testLabels],
               testAnswers, c = 'yellow')
    ax.scatter([features[0] for features in trainList],
               [features[1] for features in trainList],
               trainAnswers, c = 'purple')
    # Proxy patches: the wireframe and scatters carry no labels of their own.
    yellow_patch = mpatches.Patch(color='yellow', label='Test Data')
    purple_patch = mpatches.Patch(color='purple', label='Training data')
    blue_patch = mpatches.Patch(color = 'blue', label = 'Predicted')
    ax.legend(handles=[yellow_patch,purple_patch,blue_patch])
    ax.set_title(title)
    plt.show()

In [42]:
def trainModel(df, useAll):
    """Fit a linear regression on 3/4 of ``df`` and score it on the rest.

    Every fourth row (positions 0, 4, 8, ...) is held out as the test set;
    the remaining rows are used for training. The mean squared error on the
    held-out rows is printed.

    Parameters
    ----------
    df : DataFrame providing the columns read by ``dfToLabels`` /
        ``dfToAllLabels``.
    useAll : when truthy, features come from ``dfToAllLabels`` (three
        features); otherwise from ``dfToLabels`` (two features).

    Returns
    -------
    (reg, trainList, trainAnswers, testLabels, testAnswers)
        The fitted model followed by the training features/targets and the
        test features/targets.
    """
    # Split by row position rather than by index label so the 3:1 split does
    # not depend on the frame carrying a fresh 0..n-1 index. For a frame that
    # does have such an index the selected rows are unchanged.
    isTestRow = np.arange(len(df)) % 4 == 0
    trainingDf = df[~isTestRow]
    testDf = df[isTestRow]

    extractLabels = dfToAllLabels if useAll else dfToLabels
    trainList, trainAnswers = extractLabels(trainingDf)
    testLabels, testAnswers = extractLabels(testDf)

    reg = linear_model.LinearRegression()
    reg.fit(trainList,trainAnswers)

    prediction = reg.predict(testLabels)
    # scikit-learn's signature is (y_true, y_pred).
    print('Mean squared error = ', mean_squared_error(testAnswers,prediction))
    return reg,trainList,trainAnswers,testLabels,testAnswers

In [43]:
def dfToAllLabels(df):
    """Split a frame into three-feature rows and their regression targets.

    Returns a ``(features, targets)`` tuple. Each entry of ``features`` is
    ``[ageEnteringOffice, Days, percentApproval]`` for one row of ``df``;
    ``targets`` holds that row's ``percentAsForeign``. Both lists follow the
    frame's row order.
    """
    features, targets = [], []
    for _, record in df.iterrows():
        features.append(
            [record.ageEnteringOffice, record.Days, record.percentApproval]
        )
        targets.append(record.percentAsForeign)
    return features, targets

In [58]:
import numpy as np 
import math
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error
from mpl_toolkits.mplot3d import axes3d
%matplotlib widget
plt.rcParams['figure.figsize'] = [8, 8]


fileName = 'cabinet_member_spending_with_age.csv'
df = pd.read_csv(fileName,header=0)

# Derived columns, computed column-wise by position (.iloc) because plain
# integer keys on a string-labelled row are deprecated in pandas.
# NOTE(review): presumably columns 6 and 7 are the two spending amounts with
# column 7 the foreign one, and column 3 is an approval percentage -- confirm
# against the CSV header and switch to column names.
percentAsForeign = (df.iloc[:, 7] / (df.iloc[:, 6] + df.iloc[:, 7])).tolist()
# Rows with no value in column 3 fall back to 0.9.
percentApproval = (df.iloc[:, 3] / 100).fillna(.9).tolist()
df['percentAsForeign'] = percentAsForeign
df['percentApproval'] = percentApproval

# useAll=False fits on age entering office and days in position only, so the
# title names those two features.
overallRegression,trainList,trainAnswers,testLabels,testAnswers = trainModel(df,False)
makePlot(overallRegression,trainList,trainAnswers,testLabels,testAnswers,
         "Regression based on days in office and \nage")


# NOTE(review): every row whose President is not 'Barack Obama' is treated as
# republican -- verify this holds for the presidents present in the CSV.
# The index is rebuilt so each subset is numbered 0..n-1 again.
demDF = df[df['President'] == 'Barack Obama'].reset_index(drop=True)
repDF = df[df['President'] != 'Barack Obama'].reset_index(drop=True)
demReg,trainList,trainAnswers,testLabels,testAnswers = trainModel(demDF,False)
makePlot(demReg,trainList,trainAnswers,testLabels,testAnswers,
         "Regression based on days in office and \n age for democrats")
repReg,trainList,trainAnswers,testLabels,testAnswers = trainModel(repDF,False)
makePlot(repReg,trainList,trainAnswers,testLabels,testAnswers,
         "Regression based on days in office and \nage for republicans")

# Three-feature model (adds percentApproval); not plotted.
print("Using 3 labels, not accounting for party:")
allLabelReg,trainList,trainAnswers,testLabels,testAnswers = trainModel(df,True)


Mean squared error =  0.00046807943789162134


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Mean squared error =  6.225951975361795e-05


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Mean squared error =  0.0001348810419322613


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Using 3 labels, not accounting for party:
Mean squared error =  0.0004686892259522336
