In [1]:
# python version: python3

In [2]:
import numpy as np
import os
np.random.seed(42)
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import tarfile
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

## DATA EXPLORATION & PREPARATION

### part a

In [3]:
#Read data from file.
initdata = pd.read_csv("train.csv")

##
##
##Please Note That: I am doing part B's SalePrice nan drop request here so that before splitting data i can to it all once.
initdata = initdata.dropna(subset=["SalePrice"])
##
##

#I made a general func that seperates whatever I want for multiuse in real life scenarios.
def label_splitter (data,operation):
    if operation == 0:
        without_index = data.drop(columns= ["SalePrice"])
        return without_index
    else:
        only_index = data[["SalePrice"]]
        return only_index
    
train_x_a = label_splitter(initdata,0)
train_y = label_splitter(initdata,1)

In [4]:
train_x_a.head(2)

Unnamed: 0,MSSubClass,LotFrontage,LotArea,Street,OverallQual,OverallCond,MasVnrArea,BsmtFinSF1,BsmtUnfSF,TotalBsmtSF,...,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageCars,GarageArea,MoSold
0,60,65.0,8450,Pave,7,5,196.0,706,150,856,...,0,2,1,3,1,8,0,2,548,2
1,20,80.0,9600,Pave,6,8,0.0,978,284,1262,...,1,2,0,3,1,6,1,2,460,5


In [5]:
train_y.head(2)

Unnamed: 0,SalePrice
0,208500
1,181500


### part b

In [6]:
#Made this a dict so that i can store both column names and their median values together.
nan_columnsdict = {}

#I made a func that fills the dict column names for key and median values for values that i made for nan_columnsdict. 
def findnan_andreplace (data,writerdict):
    x = data.columns[data.isna().any()].tolist()
    for item in x:
        k = data[item].median()
        writerdict[item] = k
    return data.fillna(value=writerdict)
        
train_x_b = findnan_andreplace (train_x_a,nan_columnsdict)

#Here is the list that you asked for:

nan_columns = list(nan_columnsdict.keys())
print (nan_columnsdict)

{'LotFrontage': 70.0, 'MasVnrArea': 0.0}


In [7]:
print(nan_columns)

['LotFrontage', 'MasVnrArea']


In [8]:
train_x_b.head(2)

Unnamed: 0,MSSubClass,LotFrontage,LotArea,Street,OverallQual,OverallCond,MasVnrArea,BsmtFinSF1,BsmtUnfSF,TotalBsmtSF,...,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageCars,GarageArea,MoSold
0,60,65.0,8450,Pave,7,5,196.0,706,150,856,...,0,2,1,3,1,8,0,2,548,2
1,20,80.0,9600,Pave,6,8,0.0,978,284,1262,...,1,2,0,3,1,6,1,2,460,5


### part c

In [9]:
categorical_columns = []
#New df with only categorical columns
ctgr_df = train_x_b.select_dtypes(include = "object")
#Gets column names to list
categorical_columns = ctgr_df.columns.tolist()

In [10]:
print(categorical_columns)

['Street']


### part d

In [11]:
#One hot encoding func that uses pandas get_dummies.
def onehot (data,cat_list):
    for i in range(len(cat_list)):
        list_of_elements = data[cat_list[i]].tolist()
        one_hot_coded = pd.get_dummies(list_of_elements)
        data2 = data.drop(columns=[cat_list[i]])
        merged_df =  pd.concat([data2,one_hot_coded],axis=1, join_axes=[data2.index])
    return merged_df    

train_x_d = onehot(train_x_b,categorical_columns)

In [12]:
train_x_d.head(2)

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,MasVnrArea,BsmtFinSF1,BsmtUnfSF,TotalBsmtSF,1stFlrSF,...,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageCars,GarageArea,MoSold,Grvl,Pave
0,60,65.0,8450,7,5,196.0,706,150,856,856,...,1,3,1,8,0,2,548,2,0,1
1,20,80.0,9600,6,8,0.0,978,284,1262,1262,...,0,3,1,6,1,2,460,5,0,1


### part e

In [13]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

def stdscaler (data):
    #StdScaler
    scaler = StandardScaler()
    df = pd.DataFrame(data.astype("float64"))
    df = scaler.fit_transform(df)
    #Converting it back to pd df
    column_names = data.columns[data.any()].tolist()
    dataset_building_dict = {}
    for i in range (len(column_names)):
        dataset_building_dict[column_names[i]] = df[:,i]
    final_dataset = pd.DataFrame(dataset_building_dict)
    return final_dataset
train_x_e = stdscaler (train_x_d)

In [14]:
train_x_e.head(2)

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,MasVnrArea,BsmtFinSF1,BsmtUnfSF,TotalBsmtSF,1stFlrSF,...,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageCars,GarageArea,MoSold,Grvl,Pave
0,0.073872,-0.235351,-0.196474,0.6329,-0.529618,0.464035,0.589782,-0.939672,-0.486827,-0.802481,...,1.230454,0.178216,-0.208547,0.935889,-0.939129,0.316364,0.357524,-1.601265,-0.063372,0.063372
1,-0.873204,0.475965,-0.095659,-0.090414,2.177118,-0.576236,1.204925,-0.638291,0.475863,0.280104,...,-0.773542,0.178216,-0.208547,-0.307817,0.600426,0.316364,-0.063938,-0.485919,-0.063372,0.063372


## LINEAR REGRESSION TO PREDICT HOUSE PRICES

### part f

In [15]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

#I did checking the values here
lin_reg = LinearRegression()
print (lin_reg.fit(train_x_e, train_y))

#This is the linear reg model
lr_model = lin_reg.predict(train_x_e)
lin_mse = mean_squared_error(train_y, lr_model)

lin_rmse = np.sqrt(lin_mse)

#I printed both mse, and rooted mse
print (lin_mse,lin_rmse)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)
1040038040.0328164 32249.620773472925


### part g

In [16]:
from sklearn.model_selection import cross_val_score
#5-fold cross validation and finding MSE Score
train_mse_score = cross_val_score(lin_reg, train_x_e,train_y,scoring = "neg_mean_squared_error", cv=5)
print(train_mse_score)

[-9.65633604e+08 -7.44218412e+08 -1.66781305e+09 -1.46841147e+09
 -8.84970495e+08]


In [17]:
average_mse = train_mse_score.mean()
print(average_mse)

-1146209404.8646579


### part h

In [18]:
#Using the same funcs that i used for train data.
initdatatest = pd.read_csv("test.csv")
initdatatest = initdatatest.dropna(subset=["SalePrice"])

test_x = label_splitter(initdatatest,0)
test_y = label_splitter(initdatatest,1)

In [19]:
nan_columnsdicttest = {}
test_x = findnan_andreplace (test_x,nan_columnsdicttest)
nan_columnsdicttest

{'LotFrontage': 68.0, 'MasVnrArea': 0.0}

In [20]:
categorical_columnstest = []
ctgr_df_test = test_x.select_dtypes(include = "object")
categorical_columnstest = ctgr_df_test.columns.tolist()
categorical_columnstest

['Street']

In [21]:
test_x = onehot(test_x,categorical_columnstest)

In [22]:
test_x = stdscaler(test_x)

In [23]:
test_x.head(2)

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,MasVnrArea,BsmtFinSF1,BsmtUnfSF,TotalBsmtSF,1stFlrSF,...,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageCars,GarageArea,MoSold,Grvl,Pave
0,-0.871178,0.173992,0.011886,-2.204028,-2.275158,-0.563571,-0.908588,-1.300225,-2.217961,-0.563416,...,-0.73653,-1.088257,-0.218714,-1.547027,-0.978492,0.30186,0.249123,0.236471,-0.066082,0.066082
1,-0.635309,-0.41113,-0.825874,-0.755667,0.401498,-0.563571,-0.908588,0.287067,-0.756925,-1.180596,...,-0.73653,-1.088257,-0.218714,-1.547027,-0.978492,-1.020574,-1.143224,-1.96104,-0.066082,0.066082


In [24]:
test_y.head(2)

Unnamed: 0,SalePrice
0,82000
1,86000


### part i

In [25]:
print (lin_reg.fit(test_x, test_y))

#This is the linear reg model
predicted_values = lin_reg.predict(test_x)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)


In [26]:
print(predicted_values[10:13])

[[107484.31846453]
 [ 88149.13582939]
 [159410.11023751]]


In [27]:
test_mse_score = mean_squared_error(test_y, predicted_values)

print(test_mse_score)

1385750236.38293


## CLASSIFICATION MODEL TO PREDICT HOUSE PRICE CATEGORY

### part j

In [28]:
#Transformation rules:
#house price < 100000  ---> label 1
#100000 <= house price < 200000 ---> label 2
#200000 <= house price < 300000 ---> label 3
#300000 <= house price < 400000 ---> label 4
#400000 <= house price ---> label 5

def num_to_categ (data):
    datax = data.copy()
    i = 5
    datax.loc[datax['SalePrice'] >= 400000] = i
    i = i-1
    datax.loc[datax['SalePrice'] >= 300000] = i
    i = i-1
    datax.loc[datax['SalePrice'] >= 200000] = i
    i = i-1
    datax.loc[datax['SalePrice'] >= 100000] = i
    i = i-1
    datax.loc[datax['SalePrice'] > 5] = i
    return datax

In [29]:
train_y_j = num_to_categ(train_y)
train_y_j.head(2)

Unnamed: 0,SalePrice
0,3
1,2


In [30]:
test_y_j = num_to_categ(test_y)
test_y_j.head(2)

Unnamed: 0,SalePrice
0,1
1,1


### part k

In [31]:
#I will do SGD classification model.
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_predict
#Making label df to list
train_y_k = train_y_j["SalePrice"].tolist()

#ML Model for training, I made max_iter 1000 for better success.
sgd_clf = SGDClassifier(max_iter=1000, random_state=42)
sgd_clf.fit(train_x_e,train_y_k)
model = sgd_clf.fit(train_x_e,train_y_k)

In [32]:
#Here I compare the prediction and real value
print (sgd_clf.predict([train_x_e.iloc[129]]), train_y_k[129] )

[2] 2


### part l

In [33]:
from sklearn.metrics import confusion_matrix as c_m
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

#Getting the scores with y_predict
y_train_pred = cross_val_predict(sgd_clf, train_x_e, train_y_k, cv=5)
accuracy = accuracy_score(train_y_k, y_train_pred)
precision = precision_score(train_y_k, y_train_pred, average = "micro")
recall = recall_score(train_y, y_train_pred, average = "micro")
f1 = f1_score(train_y_k, y_train_pred, average = "micro")
confusion_matrix = c_m(train_y_k, y_train_pred)

In [34]:
print(confusion_matrix)
print("accuracy: {}\nprecision: {}\nrecall: {}\nf1: {}".format(accuracy,precision,recall,f1)) 

[[ 15  62   0   0   0]
 [  1 593  18   1   1]
 [  0 101 101  18   1]
 [  0   3  34  24   6]
 [  0   0   3   6  12]]
accuracy: 0.745
precision: 0.745
recall: 0.0
f1: 0.745


### part m

In [35]:
#Again made labels to list
test_y_m = test_y_j["SalePrice"].tolist()
#Did predict on test values
predicted_values = sgd_clf.predict(test_x)

In [36]:
print(predicted_values[20:23])

[2 3 2]


In [37]:
#Have the scores
accuracy = accuracy_score(test_y_m, predicted_values)
precision = precision_score(test_y_m, predicted_values, average = "micro")
recall = recall_score(test_y_m, predicted_values, average = "micro")
f1 = f1_score(test_y_m, predicted_values, average = "micro")
confusion_matrix = c_m(test_y_m, predicted_values)

In [38]:
print(confusion_matrix)
print("accuracy: {}\nprecision: {}\nrecall: {}\nf1: {}".format(accuracy,precision,recall,f1)) 

[[  8  29   0   0   0]
 [  1 282  13   0   1]
 [  0  40  47  12   0]
 [  0   0   8  11   1]
 [  0   0   0   0   7]]
accuracy: 0.7717391304347826
precision: 0.7717391304347826
recall: 0.7717391304347826
f1: 0.7717391304347826
