In [834]:
import pandas as pd
import numpy as np


In [835]:
# کامنت های فارسی مشکل راست به چپ داشت، بعضی جاها انگلیسی نوشتم
# pd.read_csv accepts a column -> dtype mapping; we define it once here
# so that every CSV below is parsed with the same explicit column types.

# Column name -> Python type, handed to pd.read_csv via its `dtype` argument.
# Note that 'id', 'zipcode', 'date' and 'floors' are read as strings.
dtype_dict = {
    'bathrooms': float,
    'waterfront': int,
    'sqft_above': int,
    'sqft_living15': float,
    'grade': int,
    'yr_renovated': int,
    'price': float,
    'bedrooms': float,
    'zipcode': str,
    'long': float,
    'sqft_lot15': float,
    'sqft_living': float,
    'floors': str,
    'condition': int,
    'lat': float,
    'date': str,
    'sqft_basement': int,
    'yr_built': int,
    'id': str,
    'sqft_lot': int,
    'view': int,
}

In [836]:
# First we load our data, which comes split into
# sales, training data and test data,
# so that we can build the model on it.
# sales holds the actual prices the houses were sold for.

# Read the three CSV files in one pass, applying the same explicit column
# dtypes to each; the results are bound to sales, train_data and test_data
# in that order.
csv_names = ('kc_house_data.csv', 'kc_house_train_data.csv', 'kc_house_test_data.csv')
sales, train_data, test_data = [pd.read_csv(csv_name, dtype=dtype_dict) for csv_name in csv_names]


In [837]:
# Preview the first five rows of the sales table.
sales.head(n=5)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3.0,1.0,1180.0,5650,1,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340.0,5650.0
1,6414100192,20141209T000000,538000.0,3.0,2.25,2570.0,7242,2,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690.0,7639.0
2,5631500400,20150225T000000,180000.0,2.0,1.0,770.0,10000,1,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720.0,8062.0
3,2487200875,20141209T000000,604000.0,4.0,3.0,1960.0,5000,1,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360.0,5000.0
4,1954400510,20150218T000000,510000.0,3.0,2.0,1680.0,8080,1,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800.0,7503.0


In [838]:
# First, we need a function that predicts the sale price from an "input_feature", which
# will be "sqft_living" later. We use only "sqft_living" because our model is simple linear regression.
# We call w0 the intercept (arz az mabda) and w1 the slope (shib).
# Here the "predicted value" is the same as f(x) in the lecture slides.
def get_predictions(input_feature, intercept, slope):
    """Evaluate the line ``intercept + slope * x`` for the given input.

    Parameters
    ----------
    input_feature : scalar or array-like supporting arithmetic with scalars
        The x value(s) to predict for (e.g. a NumPy array or pandas Series).
    intercept : number
        The constant term of the line (w0).
    slope : number
        The coefficient multiplying ``input_feature`` (w1).

    Returns
    -------
    The predicted value(s), with whatever type/shape the arithmetic on
    ``input_feature`` produces.
    """
    return intercept + slope * input_feature

In [839]:
# To find the best line we need to compute the RSS (residual sum of squares).
def get_residual_sum_of_squares(input_feature, output, intercept, slope):
    """Return the residual sum of squares (RSS) of a line against observed data.

    Parameters
    ----------
    input_feature : array-like supporting elementwise arithmetic
        The x values (e.g. a NumPy array or pandas Series).
    output : array-like
        The observed y values; ``output - prediction`` must yield an object
        that has a ``.sum()`` method.
    intercept : number
        The constant term of the candidate line.
    slope : number
        The coefficient of ``input_feature`` in the candidate line.

    Returns
    -------
    The sum over all observations of ``(output - (intercept + slope * x)) ** 2``.
    """
    # Residual = observed value minus the value the line predicts for it
    # (the prediction uses the same formula as get_predictions).
    errors = output - (intercept + slope * input_feature)
    # Square each residual elementwise, then collapse to a single number.
    squared_errors = errors * errors
    return squared_errors.sum()

In [841]:
# We know the best line is the one with the lowest RSS.
# In a loop, we compute the RSS for every slope and intercept we generate and
# keep track of the lowest RSS.

# Random search for the simple-regression line with the lowest RSS.
#
# Each trial draws a random (intercept, slope) pair, scores it with
# get_residual_sum_of_squares, and remembers the best pair seen so far.
# Results left in the notebook namespace: RSS_list (every RSS computed, in
# trial order), best_rss, intercept and slope (the best line found).
#
# NOTE(review): the candidates are scored on test_data, so the reported RSS is
# not a held-out estimate. Model selection normally uses train_data -- confirm
# which data set was intended before changing it.

N_TRIALS = 199       # the original `i = 1; while i < 200` loop ran 199 times
COEF_STD = 1000000   # std-dev of the zero-mean normal both coefficients are drawn from
SEARCH_SEED = 0      # fixed seed so Restart & Run All reproduces the same "best" line

rng = np.random.default_rng(SEARCH_SEED)

# Loop-invariant: extract the underlying arrays from the DataFrame only once.
sqft_living_values = test_data['sqft_living'].values
price_values = test_data['price'].values

RSS_list = []
# Previously this tracker was called `min`, which shadowed the builtin min()
# for every later cell; it also started from a hand-typed float-max constant.
best_rss = float('inf')
for _ in range(N_TRIALS):
    # produce random intercept and slope (intercept is drawn first)
    df_intercept = rng.normal(0, COEF_STD)
    df_slope = rng.normal(0, COEF_STD)
    # calculate the RSS for this particular line
    rss_prices_on_sqft = get_residual_sum_of_squares(sqft_living_values,
                                                     price_values,
                                                     df_intercept,
                                                     df_slope)
    # keep this line if it beats the best RSS seen so far
    if rss_prices_on_sqft < best_rss:
        best_rss = rss_prices_on_sqft
        intercept = df_intercept
        slope = df_slope

    # keeping a record of all our calculated RSS
    RSS_list.append(rss_prices_on_sqft)

print("\n")
# Report the lowest RSS (and, as a result, the best line found).
# Author's note (translated): the final line may have large coefficients, but
# over several test runs the average RSS was close to the RSS expected by the
# University of Washington course.
print("Lowest RSS is " + str(best_rss))
print("The equation of the best line: %+.2fx%+.2f" % (slope, intercept))





Here is the list for all computed RSS
[1.789137090676093e+23, 3.582716844592532e+22, 1.9198549771050885e+22, 1.6173245819889283e+22, 7.027015110160076e+21, 9.97705248944448e+21, 3.5154469576354887e+21, 5.057245858834231e+21, 1.4960864909583157e+22, 2.861997604050873e+22, 1.4267069954015764e+21, 1.0148510412045466e+22, 1.0309006095332128e+22, 4.587859135703266e+22, 1.898471934104854e+22, 2.4900460080806253e+21, 1.3924103982924475e+23, 9.315583715087931e+21, 6.490714849954045e+22, 2.1966058482502356e+22, 5.404940672321679e+20, 4.872339348236772e+22, 4.552081239789467e+21, 1.5686694886353618e+23, 1.3005774423441063e+22, 2.0203886058304113e+22, 6.783433640349855e+22, 6.274985564808934e+20, 1.9520581736690814e+21, 3.3613042458513994e+21, 1.3630780746332618e+23, 6.937029683826714e+17, 7.748263870794065e+22, 3.3955932223922724e+22, 1.1240710368137377e+22, 1.3618623127850525e+22, 2.522645322968589e+22, 9.281086081244318e+22, 3.4721884312736314e+21, 1.7775481807118403e+22, 3.9842499287964755e+2