In [202]:
import pandas as pd
import numpy as np


In [203]:
# کامنت های فارسی مشکل راست به چپ داشت، بعضی جاها انگلیسی نوشتم
# because we are using Pandas, and it requires a dictionary of 
# types we add this line to our code 

dtype_dict = {'bathrooms':float, 'waterfront':int, 'sqft_above':int, 'sqft_living15':float, 'grade':int, 
              'yr_renovated':int, 'price':float, 'bedrooms':float, 'zipcode':str, 'long':float, 
              'sqft_lot15':float, 'sqft_living':float, 'floors':str, 'condition':int, 'lat':float, 
              'date':str,'sqft_basement':int, 'yr_built':int, 'id':str, 'sqft_lot':int, 'view':int}

In [204]:
# دیتای خود را ابتدا میخوانیم و به
# sales, traning data, test data
# تقسیم میکنیم تا مدل را بر اساس آن پایه گذاری کنیم. 
# sales قیمت واقعی فروش رفته خانه

sales = pd.read_csv('kc_house_data.csv', dtype=dtype_dict)
train_data = pd.read_csv('kc_house_train_data.csv', dtype=dtype_dict)
test_data = pd.read_csv('kc_house_test_data.csv', dtype=dtype_dict)


In [205]:
# Let's first see what we have	 
sales.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3.0,1.0,1180.0,5650,1,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340.0,5650.0
1,6414100192,20141209T000000,538000.0,3.0,2.25,2570.0,7242,2,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690.0,7639.0
2,5631500400,20150225T000000,180000.0,2.0,1.0,770.0,10000,1,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720.0,8062.0
3,2487200875,20141209T000000,604000.0,4.0,3.0,1960.0,5000,1,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360.0,5000.0
4,1954400510,20150218T000000,510000.0,3.0,2.0,1680.0,8080,1,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800.0,7503.0


In [206]:
# first, we need a method to predict our percieved sales price based on an "input_feature" that 
# will be "sqft_living" later. we only use "sqft_living" since our model is "simple linear regression"
# we denote w0 with intercept (arz az mabda) and w1 with slope(shib) 
# here "predicted value" is the same as f(x) in the lecture slides
def get_predictions(input_feature, intercept, slope):
    # calculate the predicted values:
    predicted_values = intercept + (slope * input_feature)
    return predicted_values

In [207]:
# برای پیدا کردن بهترین خط به محاسبه RSS نیاز داریم
def get_residual_sum_of_squares(input_feature, output, intercept, slope):
    # ابتدا قیمت پیش بینی شده خانه را با توجه به شیب و عرض از مبدا رندم بدست می آوریم
    predicted_values = intercept + (slope * input_feature)
    # then we compute the residuals 
    residuals = output - predicted_values
    # first square the residuals 
    # then add them together
    RSS = (residuals * residuals).sum()
    return(RSS)

In [208]:
# using numpy library, we generate random intercepts and slopes
# then using pandas library, we convert it to a DataFrame to feed to our model
df_intercept = pd.DataFrame (np.random.normal(0,1,(200,1)))
df_slope = pd.DataFrame (np.random.normal(0,1,(200,1)))

print(df_intercept)
print(df_slope)

            0
0    0.503102
1    0.975790
2   -1.018151
3   -0.923664
4   -1.050806
..        ...
195  1.405090
196 -0.948360
197 -0.124653
198  0.297558
199  0.198115

[200 rows x 1 columns]
            0
0    1.906706
1    0.338056
2    1.579914
3    0.673328
4    0.793383
..        ...
195  0.292915
196  0.158281
197 -0.902946
198  0.363289
199  0.978781

[200 rows x 1 columns]


In [209]:
# میدانیم بهترین خط خطی است که کم ترین آر اس اس را داشته باشد 
# در یک حلقه برای تمام شیب ها و عرض از مبدا هایی که تولید کردیم، آر اس اس بدست می آوریم و
#  کمترین RSS را پیدا می کنیم

RSS_list = []
i = 1
while i < 200:


    rss_prices_on_sqft = get_residual_sum_of_squares(test_data['sqft_living'].values,
                                                     test_data['price'].values,
                                                     df_intercept.values, df_slope.values)
    RSS_list.append(rss_prices_on_sqft)
    i += 1

print("Here is the list for all computed RSS\n" + str(RSS_list))
print("\n\n")
# finding the lowest RSS (and as a result best line))
print("Lowest RSS is " + str(min(RSS_list)))





Here is the list for all computed RSS
[3.569968614582315e+17, 3.569968614582315e+17, 3.569968614582315e+17, 3.569968614582315e+17, 3.569968614582315e+17, 3.569968614582315e+17, 3.569968614582315e+17, 3.569968614582315e+17, 3.569968614582315e+17, 3.569968614582315e+17, 3.569968614582315e+17, 3.569968614582315e+17, 3.569968614582315e+17, 3.569968614582315e+17, 3.569968614582315e+17, 3.569968614582315e+17, 3.569968614582315e+17, 3.569968614582315e+17, 3.569968614582315e+17, 3.569968614582315e+17, 3.569968614582315e+17, 3.569968614582315e+17, 3.569968614582315e+17, 3.569968614582315e+17, 3.569968614582315e+17, 3.569968614582315e+17, 3.569968614582315e+17, 3.569968614582315e+17, 3.569968614582315e+17, 3.569968614582315e+17, 3.569968614582315e+17, 3.569968614582315e+17, 3.569968614582315e+17, 3.569968614582315e+17, 3.569968614582315e+17, 3.569968614582315e+17, 3.569968614582315e+17, 3.569968614582315e+17, 3.569968614582315e+17, 3.569968614582315e+17, 3.569968614582315e+17, 3.569968614582315e