In [32]:
# python version: python3

In [33]:
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn

## DATA EXPLORATION & PREPARATION

### part a

In [34]:
train_x_a = pd.read_csv("train.csv").drop("SalePrice", axis=1) #loading data without SalePrice column
train_y = pd.read_csv("train.csv")["SalePrice"] #loading data only SalePrice column

In [35]:
train_x_a.head(2)

Unnamed: 0,MSSubClass,LotFrontage,LotArea,Street,OverallQual,OverallCond,MasVnrArea,BsmtFinSF1,BsmtUnfSF,TotalBsmtSF,...,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageCars,GarageArea,MoSold
0,60,65.0,8450,Pave,7,5,196.0,706,150,856,...,0,2,1,3,1,8,0,2,548,2
1,20,80.0,9600,Pave,6,8,0.0,978,284,1262,...,1,2,0,3,1,6,1,2,460,5


In [36]:
train_y.head(2)

0    208500
1    181500
Name: SalePrice, dtype: int64

### part b

In [37]:
mydata =pd.read_csv("train.csv")
nan_columns = mydata.columns[mydata.isnull().any()].tolist() #taking missing values to list
train_x_b = mydata.fillna(mydata.mean()) #filling this list with mean of mydata

In [38]:
print(nan_columns)

['LotFrontage', 'MasVnrArea']


In [39]:
train_x_b.head(2)

Unnamed: 0,MSSubClass,LotFrontage,LotArea,Street,OverallQual,OverallCond,MasVnrArea,BsmtFinSF1,BsmtUnfSF,TotalBsmtSF,...,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageCars,GarageArea,MoSold,SalePrice
0,60,65.0,8450,Pave,7,5,196.0,706,150,856,...,2,1,3,1,8,0,2,548,2,208500
1,20,80.0,9600,Pave,6,8,0.0,978,284,1262,...,2,0,3,1,6,1,2,460,5,181500


### part c

In [40]:
colsrows= mydata.select_dtypes(include=['object'])   #finding columns which have string values in it's rows
categorical_columns = colsrows.columns

In [41]:
print(categorical_columns)

Index([u'Street'], dtype='object')


### part d

In [42]:
from sklearn.preprocessing import OneHotEncoder,LabelEncoder,MinMaxScaler 

In [43]:
le = LabelEncoder()
categorical_columns_encoded = le.fit_transform(colsrows)  #first I do label encoding then go for one hot encoding
encoder = OneHotEncoder(sparse=False)
categorical_columns_encoded = categorical_columns_encoded.reshape(len(categorical_columns_encoded), 1) 
encoded_1hot = encoder.fit_transform(categorical_columns_encoded) # in this way we get what we wanted like 2 columns of Grvl and Pave and their values as 1 or 0
print encoded_1hot

[[ 0.  1.]
 [ 0.  1.]
 [ 0.  1.]
 ..., 
 [ 0.  1.]
 [ 0.  1.]
 [ 0.  1.]]


In [44]:
train_x_d= pd.DataFrame(data = encoded_1hot, columns=["Grvl","Pave"]) #converted arrays to datafrome 

In [45]:
train_x_d.head(2)

Unnamed: 0,Grvl,Pave
0,0.0,1.0
1,0.0,1.0


### part e

In [46]:

scaler = MinMaxScaler() # it's a scaling function

train_x_d_scaled = pd.DataFrame(scaler.fit_transform(train_x_d), columns=train_x_d.columns) #creating new dataframe as a scaled version

train_x_e = train_x_d_scaled

In [47]:
train_x_e.head(2)

Unnamed: 0,Grvl,Pave
0,0.0,1.0
1,0.0,1.0


## LINEAR REGRESSION TO PREDICT HOUSE PRICES

### part f

In [48]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
lin_reg = LinearRegression()
lr_model = lin_reg.fit(train_x_e, train_y) 
print lr_model
housing_predictions = lin_reg.predict(train_x_e) #predicting house prices with scaled training data
lin_mse = mean_squared_error(train_y, housing_predictions) #figure mse out 
lin_mse

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)


6428554787.008605

# part g

In [49]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor

forest_reg = RandomForestRegressor(random_state=42) 
forest_reg.fit(train_x_e, train_y)  # training RandomForestRegressor
scores = cross_val_score(forest_reg, train_x_e, train_y,
scoring="neg_mean_squared_error", cv=5) #performing 5-fold cross validation here
train_mse_score = np.sqrt(-scores) #having mse score
print(train_mse_score)

[ 76284.22019438  77497.40460278  81682.04431142  91229.67138889
  73381.44854695]


In [50]:
average_mse = train_mse_score.mean()  # average of train mean square error
print(average_mse)

80014.9578089


### part h

In [51]:
test_x = pd.read_csv("test.csv").drop("SalePrice", axis=1) # loading data as I did above without SalePrice
test_y =pd.read_csv("test.csv")["SalePrice"]  # loading data as I did above only SalePrice column
nan_columns = test_x.columns[test_x.isnull().any()].tolist()  #looking for missing values and appending them to list
test_x = test_x.fillna(test_x.mean()) #filling these missing values with mean of data
colsrowstest= test_x.select_dtypes(include=['object'])  #selecting data which has string values
categorical_columns_test = colsrowstest.columns 
categorical_columns_encoded_test = le.fit_transform(colsrowstest) #doing label encoding here
categorical_columns_encoded_test = categorical_columns_encoded_test.reshape(len(categorical_columns_encoded_test), 1) #reshaping label encoded data
encoded_1hot_test = encoder.fit_transform(categorical_columns_encoded_test) # one hot encoding here
test_x = pd.DataFrame(data = encoded_1hot_test, columns=["Grvl","Pave"]) #havig a dataframe has columns as Grvl and Pave and their values as a 1 or 0
test_x = pd.DataFrame(scaler.fit_transform(test_x), columns=test_x.columns) # scaling the new data



In [52]:
test_x.head(2) #first 2 rows of new data

Unnamed: 0,Grvl,Pave
0,0.0,1.0
1,0.0,1.0


In [53]:
test_y.head(2)

0    82000
1    86000
Name: SalePrice, dtype: int64

### part i

In [54]:
predicted_values = lin_reg.predict(test_x) #predicting values by using linear regression

In [55]:
print(predicted_values[10:13])

[ 182501.01204819  182501.01204819  182501.01204819]


In [56]:
lin_mse = mean_squared_error(test_y, predicted_values) #having mean square error
print(lin_mse)

6014352594.27


## CLASSIFICATION MODEL TO PREDICT HOUSE PRICE CATEGORY

### part j

In [57]:
for i in range(len(train_y)):  # here is i'm doing segmentation by using for loop, so i'm going into rows one by one
    if train_y.loc[i] <100000: # and converting their values based on their range
        train_y.loc[i] = 1
    if train_y.loc[i] in range(100000, 200000):
        train_y.loc[i] = 2
    if train_y.loc[i] in range(200000, 300000):
        train_y.loc[i] = 3
    if train_y.loc[i] in range(300000, 400000):
        train_y.loc[i] = 4
    if train_y.loc[i] >= 400000:
        train_y.loc[i] = 5
print(train_y)

0      3
1      2
2      3
3      2
4      3
5      2
6      4
7      3
8      2
9      2
10     2
11     4
12     2
13     3
14     2
15     2
16     2
17     1
18     2
19     2
20     4
21     2
22     3
23     2
24     2
25     3
26     2
27     4
28     3
29     1
      ..
970    2
971    2
972    1
973    2
974    2
975    2
976    1
977    2
978    2
979    2
980    2
981    4
982    2
983    3
984    2
985    2
986    2
987    4
988    2
989    2
990    4
991    2
992    2
993    2
994    4
995    2
996    2
997    2
998    1
999    3
Name: SalePrice, dtype: int64


In [58]:
for k in range(len(test_y)):  #i'm doing same thing as i did above
    if test_y.loc[k] <100000:
        test_y.loc[k] = 1
    if test_y.loc[k] in range(100000, 200000):
        test_y.loc[k] = 2
    if test_y.loc[k] in range(200000, 300000):
        test_y.loc[k] = 3
    if test_y.loc[k] in range(300000, 400000):
        test_y.loc[k] = 4
    if test_y.loc[k] >= 400000:
        test_y.loc[k] = 5
print (test_y)

0      1
1      1
2      3
3      2
4      2
5      2
6      2
7      1
8      3
9      2
10     2
11     2
12     2
13     1
14     2
15     3
16     3
17     2
18     2
19     3
20     2
21     2
22     1
23     2
24     3
25     2
26     2
27     3
28     2
29     2
      ..
430    2
431    2
432    1
433    2
434    2
435    2
436    2
437    4
438    2
439    2
440    2
441    2
442    4
443    2
444    2
445    2
446    2
447    3
448    2
449    1
450    2
451    3
452    2
453    1
454    2
455    2
456    3
457    3
458    2
459    2
Name: SalePrice, dtype: int64


train_y = train_y_j
test_y =test_y_j

In [59]:
train_y.head(2)

0    3
1    2
Name: SalePrice, dtype: int64

In [60]:
test_y.head(2)

0    1
1    1
Name: SalePrice, dtype: int64

### part k

In [64]:
from sklearn import linear_model
from sklearn.linear_model import SGDClassifier # i'm training SGDC classifier here for prediction
sgd_clf = linear_model.SGDClassifier(random_state=42)
sgd_clf.fit(train_x_e, train_y)
prediction = sgd_clf.predict(train_x_e) # here i have a prediction for training data


### part l

In [65]:
from sklearn.metrics import precision_score, recall_score,f1_score,confusion_matrix
scores = cross_val_score(forest_reg, train_x_e, train_y, cv=5,)  # here i do 5-fold cross validation
accuracy = scores #having an accuracy
precision = precision_score(train_y, prediction ,average = "micro" )  # this function for precision
recall = recall_score(train_y, prediction, average = "micro") # this functionfor recall
f1 = f1_score(train_y, prediction, average = "micro") #this function f1 score
confusion_matrix = confusion_matrix(train_y, prediction) # and this function for confusion matrix

In [66]:
print(confusion_matrix)
print("accuracy: {}\nprecision: {}\nrecall: {}\nf1: {}".format(accuracy,precision,recall,f1)) 

[[  0   1  76   0   0]
 [  0   2 612   0   0]
 [  0   1 220   0   0]
 [  0   0  67   0   0]
 [  0   0  21   0   0]]
accuracy: [-0.00356612 -0.00775631 -0.00041636 -0.00175955 -0.01848357]
precision: 0.222
recall: 0.222
f1: 0.222


### part m

In [67]:
sgd_clf = linear_model.SGDClassifier(random_state=42)
sgd_clf.fit(test_x, test_y)
predicted_values = sgd_clf.predict(test_x) #here ı do prediction for test data with same model

In [68]:
print(predicted_values[20:23])

[2 2 2]


In [69]:
scores_test = cross_val_score(forest_reg, test_x, test_y, cv=5,) # i'm doing same things here as i did above
accuracy = scores_test
precision = precision_score(test_y, predicted_values ,average = "micro" )
recall = recall_score(test_y, predicted_values, average = "micro")
f1 = f1_score(test_y, predicted_values, average = "micro")
#confusion_matrix = confusion_matrix(test_y, predicted_values)

In [70]:
print(confusion_matrix)
print("accuracy: {}\nprecision: {}\nrecall: {}\nf1: {}".format(accuracy,precision,recall,f1)) 

[[  0   1  76   0   0]
 [  0   2 612   0   0]
 [  0   1 220   0   0]
 [  0   0  67   0   0]
 [  0   0  21   0   0]]
accuracy: [ -1.35518070e-02  -1.17811115e-02   1.57598214e-04  -1.57885859e-02
  -8.12707033e-06]
precision: 0.645652173913
recall: 0.645652173913
f1: 0.645652173913
