In [None]:
# This code is prepared by Orhan Erdem
# Please email orhanerdem at gmail.com for errors, suggestions

In [None]:
import math
from pathlib import Path
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, roc_curve, auc
import matplotlib.pylab as plt

import dmba
from dmba import regressionSummary, classificationSummary, liftChart, gainsChart

%matplotlib inline

In [None]:
# The following data set is obtained from https://data.boston.gov/dataset/property-assessment
# and slightly structured and cleaned.

In [None]:
# Load the property-assessment workbook (first sheet) into a DataFrame.
asses_df = pd.read_excel('2023_Roxbury.xlsx', sheet_name=0)

In [None]:
# Preview the first five rows to check that the columns loaded as expected.
asses_df.head(n=5)

In [None]:
# Summary statistics of the raw data, rounded to whole numbers for readability.
asses_df.describe().round(decimals=0)

In [None]:
# Discard every row that has at least one missing value.
asses_df = asses_df.dropna(axis=0, how='any')

In [None]:
# Summary statistics again, this time unrounded; when the cells are run in order
# this reflects the frame after rows with missing values were dropped.
asses_df.describe()

In [None]:
# Create the list of predictor variables: every column except the outcome
# itself and the other columns named in excludeColumns.
outcome = 'TOTAL_VALUE'
excludeColumns = ('TOTAL_VALUE', 'LU', 'HEAT_TYPE', 'PROP_VIEW')
predictors = [col for col in asses_df.columns if col not in excludeColumns]

X = asses_df[predictors]
y = asses_df[outcome]

# Hold out 40% of the rows for validation; the fixed random_state makes the
# split reproducible.
train_X, valid_X, train_y, valid_y = train_test_split(
    X, y, test_size=0.4, random_state=1
)

# Train the linear regression model on the training partition.
reg = LinearRegression().fit(train_X, train_y)

# Evaluate performance: training partition first ...
train_pred = reg.predict(train_X)
regressionSummary(train_y, train_pred)

# ... then the validation partition.
valid_pred = reg.predict(valid_X)
regressionSummary(valid_y, valid_pred)


# Default Data Set (Class Prediction)

In [None]:
# Load the workbook (first sheet) holding the `actual` and `prob` columns
# used by the cells below, then preview its first five rows.
df = pd.read_excel('Default.xlsx', sheet_name=0)
df.head(n=5)

In [None]:
# Sweep the classification cutoff over 0.0, 0.1, ..., 1.0 and record the
# accuracy obtained at each value.
cutoffs = [i * 0.1 for i in range(0, 11)]
accT = []
for cutoff in cutoffs:
    # A record is assigned class 1 only when its probability exceeds the cutoff.
    predicted = [int(p > cutoff) for p in df.prob]
    accT.append(accuracy_score(df.actual, predicted))

# Overall error is the complement of accuracy at every cutoff.
overall_error = [1 - acc for acc in accT]

# Plot both curves against the cutoff value.
line_accuracy, = plt.plot(cutoffs, accT, '-', label='Accuracy')
line_error, = plt.plot(cutoffs, overall_error, '--', label='Overall error')
plt.ylim(0, 1)
plt.xlabel('Cutoff Value')
plt.legend(handles=[line_accuracy, line_error])

plt.show()


In [None]:
# ROC curve coordinates for the predicted probabilities; the third return
# value (the thresholds) is not used.
fpr, tpr, _ = roc_curve(df['actual'], df['prob'])

# Area under the ROC curve.
roc_auc = auc(fpr, tpr)

In [None]:
# Draw the ROC curve on a square figure, with the diagonal as a reference line.
lw = 2
fig, ax = plt.subplots(figsize=(5, 5))

ax.plot(fpr, tpr, color='darkorange', lw=lw,
        label='ROC curve (area = %0.4f)' % roc_auc)
ax.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')

# Leave a little headroom above 1.0 so the curve does not touch the frame.
ax.set_xlim(0.0, 1.0)
ax.set_ylim(0.0, 1.05)
ax.set_xlabel('False Positive Rate (1 - Specificity)')
ax.set_ylabel('True Positive Rate (Sensitivity)')
ax.legend(loc="lower right")

plt.show()
# This code is prepared by Orhan Erdem