In [51]:
import numpy as np
import pandas as pd
from sklearn.metrics import *
from sklearn.linear_model import *
from sklearn.decomposition import PCA
from sklearn.preprocessing import normalize
from sklearn import datasets

Download and read the "Real estate valuation data set.xlsx" dataset from the UCI repository. You can find the description of the features and targets on the UCI repository website. Split the dataset in train and test set (use your choice of splitting). Train a linear regression model and report the performance (use your choice of at least four performance metrics)

In [47]:
def splitFeature(data, splitter):
    y = data[splitter]
    x = data.drop(columns=splitter)
    return y, x

data = pd.read_csv('realEstateData.csv', header=0)
dataFrame = pd.DataFrame(data)

for i in dataFrame.columns:
    dataFrame[i] = (dataFrame[i] - dataFrame[i].min()) / (dataFrame[i].max() - dataFrame[i].min())

training = dataFrame.sample(frac = 0.7)
test = dataFrame.drop(training.index)

yTrain, xTrain = splitFeature(training, 'Y house price of unit area')
yTest, xTest = splitFeature(test, 'Y house price of unit area')

linReg = LinearRegression()
linReg.fit(xTrain, yTrain)

yPred = linReg.predict(xTest)

print('mean squared error is '+str(mean_squared_error(yTest.values, yPred)))
print('max error is '+str(max_error(yTest.values, yPred)))
print('mean absolute error is '+str(mean_absolute_error(yTest.values, yPred)))
print('r2 score is '+str(r2_score(yTest.values, yPred)))

mean squared error is 0.008733803425883306
max error is 0.6996312786186174
mean absolute error is 0.05949888476743391
r2 score is 0.44706619470583675


Apply PCA on the dataset and select the first three principal components. Split the dataset into train and test using the same method used in Q1. Compare the performance of this model with the performance obtained in Q1.  Explain the outcome.

In [48]:
pca = PCA(n_components=3)
pca.fit(dataFrame.values)
pcaData = pca.transform(dataFrame.values)

pcaDataFrame = pd.DataFrame(pcaData)

training = pcaDataFrame.sample(frac = 0.7)
test = pcaDataFrame.drop(training.index)

yTrain, xTrain = splitFeature(test, 2)
yTest, xTest = splitFeature(test, 2)

linReg = LinearRegression()
linReg.fit(xTrain, yTrain)

yPred = linReg.predict(xTest)

print('mean squared error is '+str(mean_squared_error(yTest.values, yPred)))
print('max error is '+str(max_error(yTest.values, yPred)))
print('mean absolute error is '+str(mean_absolute_error(yTest.values, yPred)))
print('r2 score is '+str(r2_score(yTest.values, yPred)))

mean squared error is 0.08237485083505582
max error is 0.6770526006778493
mean absolute error is 0.24050314967659575
r2 score is 0.0031045996620352456


Load "IRIS " datasets from SKlearn and follow this link for the data description (features and target variable). Apply PCA on the dataset and select the first three principal components. Split the dataset in train and test set (use your choice of splitting). Train a logistic regression model and report the performance (use your choice of at least 4 performance metric).

In [111]:
iris = datasets.load_iris()
iris.data = pd.DataFrame(iris.data)
iris.target = pd.DataFrame(iris.target)
irisPCA = pd.DataFrame(PCA(n_components=3).fit_transform(iris.data))

xTrain = irisPCA.sample(frac = 0.7)
yTrain = iris.target.loc[xTrain.index]

xTest = irisPCA.drop(xTrain.index)
true = iris.target.loc[test.index]

logReg = LogisticRegression(penalty = 'none')
logReg.fit(xTrain, np.ravel(yTrain))

yPred = logReg.predict(xTest)

print('mean squared error is '+str(mean_squared_error(true[0].values, yPred)))
print('max error is '+str(max_error(true[0].values, yPred)))
print('accuracy score is '+str(accuracy_score(true[0].values, yPred)))
print('r2 score is '+str(r2_score(true[0].values, yPred)))

mean squared error is 0.2
max error is 1
accuracy score is 0.8
r2 score is 0.605263157894737


Apply L1 or L2 regulariser on the logistic regression model developed using the same train and test data used in Q3 and calculate performance of the new model. Compare performance of this model with the performance reported in Q3. Explain the outcome.

In [112]:
logRegl2 = LogisticRegression(penalty = 'l2')
logRegl2.fit(xTrain, np.ravel(yTrain))

yPred = np.ravel(logRegl2.predict(xTest))

print('mean squared error is '+str(mean_squared_error(true[0].values, yPred)))
print('max error is '+str(max_error(true[0].values, yPred)))
print('accuracy score is '+str(accuracy_score(true[0].values, yPred)))
print('r2 score is '+str(r2_score(true[0].values, yPred)))

mean squared error is 0.2
max error is 1
accuracy score is 0.8
r2 score is 0.605263157894737
