In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import linear_model

In [2]:
# Load the home price table (columns: town, area, price) and show all rows.
HOMEPRICES_CSV = 'homeprices.csv'
df = pd.read_csv(HOMEPRICES_CSV)
print(df)

               town  area   price
0   monroe township  2600  550000
1   monroe township  3000  565000
2   monroe township  3200  610000
3   monroe township  3600  680000
4   monroe township  4000  725000
5      west windsor  2600  585000
6      west windsor  2800  615000
7      west windsor  3300  650000
8      west windsor  3600  710000
9       robinsville  2600  575000
10      robinsville  2900  600000
11      robinsville  3100  620000
12      robinsville  3600  695000


In [3]:
# Build one indicator column per distinct value of the 'town' column.
town_names = df['town']
dummies = pd.get_dummies(town_names)
print(dummies)

    monroe township  robinsville  west windsor
0              True        False         False
1              True        False         False
2              True        False         False
3              True        False         False
4              True        False         False
5             False        False          True
6             False        False          True
7             False        False          True
8             False        False          True
9             False         True         False
10            False         True         False
11            False         True         False
12            False         True         False


In [5]:
# Place the indicator columns side by side with the original table.
merged = pd.concat((df, dummies), axis=1)
print(merged)

               town  area   price  monroe township  robinsville  west windsor
0   monroe township  2600  550000             True        False         False
1   monroe township  3000  565000             True        False         False
2   monroe township  3200  610000             True        False         False
3   monroe township  3600  680000             True        False         False
4   monroe township  4000  725000             True        False         False
5      west windsor  2600  585000            False        False          True
6      west windsor  2800  615000            False        False          True
7      west windsor  3300  650000            False        False          True
8      west windsor  3600  710000            False        False          True
9       robinsville  2600  575000            False         True         False
10      robinsville  2900  600000            False         True         False
11      robinsville  3100  620000            False         True 

In [9]:
# The text 'town' column is now represented by the indicator columns, so remove it.
final = merged.drop('town', axis=1)
print(final)

    area   price  monroe township  robinsville  west windsor
0   2600  550000             True        False         False
1   3000  565000             True        False         False
2   3200  610000             True        False         False
3   3600  680000             True        False         False
4   4000  725000             True        False         False
5   2600  585000            False        False          True
6   2800  615000            False        False          True
7   3300  650000            False        False          True
8   3600  710000            False        False          True
9   2600  575000            False         True         False
10  2900  600000            False         True         False
11  3100  620000            False         True         False
12  3600  695000            False         True         False


In [10]:
# We have three dummy variables here (one per town). One of them must be dropped in order to avoid the dummy variable trap. TODO: read up on the dummy variable trap.
# Remove the 'west windsor' indicator; those rows are then the ones where both
# remaining town indicators are False.
final = final.drop('west windsor', axis='columns')
print(final)

    area   price  monroe township  robinsville
0   2600  550000             True        False
1   3000  565000             True        False
2   3200  610000             True        False
3   3600  680000             True        False
4   4000  725000             True        False
5   2600  585000            False        False
6   2800  615000            False        False
7   3300  650000            False        False
8   3600  710000            False        False
9   2600  575000            False         True
10  2900  600000            False         True
11  3100  620000            False         True
12  3600  695000            False         True


In [14]:
# Split the table into the feature matrix (every column except 'price')
# and the target series ('price').
X = final.drop('price', axis='columns')
y = final.loc[:, 'price']

print(X)
print(y)

    area  monroe township  robinsville
0   2600             True        False
1   3000             True        False
2   3200             True        False
3   3600             True        False
4   4000             True        False
5   2600            False        False
6   2800            False        False
7   3300            False        False
8   3600            False        False
9   2600            False         True
10  2900            False         True
11  3100            False         True
12  3600            False         True
0     550000
1     565000
2     610000
3     680000
4     725000
5     585000
6     615000
7     650000
8     710000
9     575000
10    600000
11    620000
12    695000
Name: price, dtype: int64


In [15]:
# Linear regression of price on area plus the two remaining town indicators.
model = linear_model.LinearRegression()
model.fit(X=X, y=y)

In [16]:
# Predict on the same rows the model was fit on, to eyeball the fitted
# values against the known prices.
result = model.predict(X)
print(result)

[539709.7398409  590468.71640508 615848.20468716 666607.18125134
 717366.15781551 579723.71533005 605103.20361213 668551.92431735
 706621.15674048 565396.15136531 603465.38378844 628844.87207052
 692293.59277574]


In [17]:
# Predict the price of a home with area 2800 in robinsville.
# The model was fit on a DataFrame, so the query is passed as a one-row DataFrame
# with the same column names (in the same order as the training features).
# This makes the meaning of each value explicit and avoids scikit-learn's
# "X does not have valid feature names" warning raised for bare lists.
print(model.predict(pd.DataFrame({
    'area': [2800],
    'monroe township': [0],
    'robinsville': [1],
})))

[590775.63964739]




In [18]:
# Predict the price of a home with area 3400 in west windsor, the town whose
# indicator column was dropped (both remaining indicators are 0).
# The model was fit on a DataFrame, so the query is passed as a one-row DataFrame
# with the same column names (in the same order as the training features).
# This makes the meaning of each value explicit and avoids scikit-learn's
# "X does not have valid feature names" warning raised for bare lists.
print(model.predict(pd.DataFrame({
    'area': [3400],
    'monroe township': [0],
    'robinsville': [0],
})))

[681241.66845839]




In [19]:
# Score the model on the same X and y it was fit on (an in-sample score,
# not a held-out evaluation).
r_squared = model.score(X, y)
print(r_squared)

0.9573929037221872


In [20]:
# We now repeat the same one-hot encoding, this time using scikit-learn
# instead of pandas.get_dummies. Start again from the original table.
print(df)

               town  area   price
0   monroe township  2600  550000
1   monroe township  3000  565000
2   monroe township  3200  610000
3   monroe township  3600  680000
4   monroe township  4000  725000
5      west windsor  2600  585000
6      west windsor  2800  615000
7      west windsor  3300  650000
8      west windsor  3600  710000
9       robinsville  2600  575000
10      robinsville  2900  600000
11      robinsville  3100  620000
12      robinsville  3600  695000


In [21]:
from sklearn.preprocessing import LabelEncoder
# Encoder used below to turn the town names into integer codes.
# NOTE(review): scikit-learn documents LabelEncoder as intended for target
# values (y) rather than input features; confirm whether this intermediate
# step is still wanted, since OneHotEncoder can encode string columns directly.
le = LabelEncoder()

In [23]:
# Build a copy of the table in which the string town labels are replaced by
# numerical labels; the original df is left untouched.
dfle = df.assign(town=le.fit_transform(df['town']))
print(dfle)

    town  area   price
0      0  2600  550000
1      0  3000  565000
2      0  3200  610000
3      0  3600  680000
4      0  4000  725000
5      2  2600  585000
6      2  2800  615000
7      2  3300  650000
8      2  3600  710000
9      1  2600  575000
10     1  2900  600000
11     1  3100  620000
12     1  3600  695000


In [30]:
# Feature matrix for the scikit-learn route: integer-coded town plus area.
X = dfle[['town', 'area']]
print(X)

    town  area
0      0  2600
1      0  3000
2      0  3200
3      0  3600
4      0  4000
5      2  2600
6      2  2800
7      2  3300
8      2  3600
9      1  2600
10     1  2900
11     1  3100
12     1  3600


In [31]:
# Target series: the sale price.
y = dfle['price']
print(y)

0     550000
1     565000
2     610000
3     680000
4     725000
5     585000
6     615000
7     650000
8     710000
9     575000
10    600000
11    620000
12    695000
Name: price, dtype: int64


In [32]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# One-hot encode the integer-coded 'town' column. Every other column
# (here only 'area') is passed through unchanged.
town_encoder = OneHotEncoder()
ct = ColumnTransformer(
    [('encoder', town_encoder, ['town'])],
    remainder='passthrough',
)

# X is rebound from the DataFrame to the transformed array.
X = ct.fit_transform(X)
print(X)

[[1.0e+00 0.0e+00 0.0e+00 2.6e+03]
 [1.0e+00 0.0e+00 0.0e+00 3.0e+03]
 [1.0e+00 0.0e+00 0.0e+00 3.2e+03]
 [1.0e+00 0.0e+00 0.0e+00 3.6e+03]
 [1.0e+00 0.0e+00 0.0e+00 4.0e+03]
 [0.0e+00 0.0e+00 1.0e+00 2.6e+03]
 [0.0e+00 0.0e+00 1.0e+00 2.8e+03]
 [0.0e+00 0.0e+00 1.0e+00 3.3e+03]
 [0.0e+00 0.0e+00 1.0e+00 3.6e+03]
 [0.0e+00 1.0e+00 0.0e+00 2.6e+03]
 [0.0e+00 1.0e+00 0.0e+00 2.9e+03]
 [0.0e+00 1.0e+00 0.0e+00 3.1e+03]
 [0.0e+00 1.0e+00 0.0e+00 3.6e+03]]


In [33]:
# Drop the first one-hot column (the indicator for town code 0) so that only
# two town indicators remain next to the area column (dummy variable trap).
# NOTE(review): this cell rebinds X from itself, so running it a second time
# removes another column. OneHotEncoder(drop='first') in the previous cell
# would make this manual slice unnecessary; confirm before changing, because
# the following cells rely on this exact column layout.
X = X[ : , 1 : ]
print(X)

[[0.0e+00 0.0e+00 2.6e+03]
 [0.0e+00 0.0e+00 3.0e+03]
 [0.0e+00 0.0e+00 3.2e+03]
 [0.0e+00 0.0e+00 3.6e+03]
 [0.0e+00 0.0e+00 4.0e+03]
 [0.0e+00 1.0e+00 2.6e+03]
 [0.0e+00 1.0e+00 2.8e+03]
 [0.0e+00 1.0e+00 3.3e+03]
 [0.0e+00 1.0e+00 3.6e+03]
 [1.0e+00 0.0e+00 2.6e+03]
 [1.0e+00 0.0e+00 2.9e+03]
 [1.0e+00 0.0e+00 3.1e+03]
 [1.0e+00 0.0e+00 3.6e+03]]


In [34]:
# Refit the existing LinearRegression instance on the array-based features
# (two town indicator columns followed by area); this replaces the earlier fit.
model.fit(X,y)

In [37]:
# Feature order is [town code 1, town code 2, area]. Town code 1 is
# 'robinsville' in the label-encoded table printed earlier, so this asks for
# a robinsville home with area 2800, the same query as in the pandas-based section.
print(model.predict([[1, 0, 2800]]))

[590775.63964739]


In [38]:
################# EXERCISE ################################

In [47]:
# Exercise data: car model, mileage, sell price and age. Note that this
# rebinds df, which previously held the home price table.
CARPRICES_CSV = 'carprices.csv'
df = pd.read_csv(CARPRICES_CSV)
print(df)

                Car Model  Mileage  Sell Price($)  Age(yrs)
0                  BMW X5    69000          18000         6
1                  BMW X5    35000          34000         3
2                  BMW X5    57000          26100         5
3                  BMW X5    22500          40000         2
4                  BMW X5    46000          31500         4
5                 Audi A5    59000          29400         5
6                 Audi A5    52000          32000         5
7                 Audi A5    72000          19300         6
8                 Audi A5    91000          12000         8
9   Mercedez Benz C class    67000          22000         6
10  Mercedez Benz C class    83000          20000         7
11  Mercedez Benz C class    79000          21000         7
12  Mercedez Benz C class    59000          33000         5


In [48]:
# Build one indicator column per distinct car model.
car_models = df['Car Model']
temp = pd.get_dummies(car_models)
print(temp)

    Audi A5  BMW X5  Mercedez Benz C class
0     False    True                  False
1     False    True                  False
2     False    True                  False
3     False    True                  False
4     False    True                  False
5      True   False                  False
6      True   False                  False
7      True   False                  False
8      True   False                  False
9     False   False                   True
10    False   False                   True
11    False   False                   True
12    False   False                   True


In [49]:
# Append the car model indicator columns to the table.
# NOTE(review): df is rebound from itself, so re-running this cell appends
# the indicator columns a second time.
df = pd.concat((df, temp), axis=1)
print(df)

                Car Model  Mileage  Sell Price($)  Age(yrs)  Audi A5  BMW X5  \
0                  BMW X5    69000          18000         6    False    True   
1                  BMW X5    35000          34000         3    False    True   
2                  BMW X5    57000          26100         5    False    True   
3                  BMW X5    22500          40000         2    False    True   
4                  BMW X5    46000          31500         4    False    True   
5                 Audi A5    59000          29400         5     True   False   
6                 Audi A5    52000          32000         5     True   False   
7                 Audi A5    72000          19300         6     True   False   
8                 Audi A5    91000          12000         8     True   False   
9   Mercedez Benz C class    67000          22000         6    False   False   
10  Mercedez Benz C class    83000          20000         7    False   False   
11  Mercedez Benz C class    79000      

In [55]:
# Remove the columns that must not reach the model:
# - 'Car Model': the original text column, already represented by the indicator
#   columns. The printed result of this cell never contained it, but no cell in
#   the notebook dropped it, so a fresh Restart & Run All would leave a string
#   column in df and break the scoring cell further down.
# - 'Mercedez Benz C class': one indicator is dropped to avoid the dummy
#   variable trap; rows with both remaining indicators False are this model.
df = df.drop(columns = ['Car Model', 'Mercedez Benz C class'])
print(df)

    Mileage  Sell Price($)  Age(yrs)  Audi A5  BMW X5
0     69000          18000         6    False    True
1     35000          34000         3    False    True
2     57000          26100         5    False    True
3     22500          40000         2    False    True
4     46000          31500         4    False    True
5     59000          29400         5     True   False
6     52000          32000         5     True   False
7     72000          19300         6     True   False
8     91000          12000         8     True   False
9     67000          22000         6    False   False
10    83000          20000         7    False   False
11    79000          21000         7    False   False
12    59000          33000         5    False   False


In [57]:
# Fit a fresh linear regression of sell price on mileage, age and the two
# remaining car model indicators.
feature_columns = ['Mileage', 'Age(yrs)', 'Audi A5', 'BMW X5']
model = linear_model.LinearRegression()
model.fit(df[feature_columns], df['Sell Price($)'])

In [59]:
# Score on exactly the feature columns (and column order) the model was fit on.
# Selecting them explicitly, instead of "everything except the target", keeps
# this cell working even if df still carries extra columns such as the text
# 'Car Model' column, and guarantees the feature names match the fit.
model.score(df[['Mileage', 'Age(yrs)', 'Audi A5', 'BMW X5']], df['Sell Price($)'])

0.9417050937281082

In [60]:
# Predict the sell price of a car with 45000 mileage, 4 years old, with both
# remaining indicators 0, i.e. the model whose indicator column was dropped
# ('Mercedez Benz C class').
# The model was fit on a DataFrame, so the query is passed as a one-row DataFrame
# with the same column names and order. This makes each value's meaning explicit
# and avoids scikit-learn's "X does not have valid feature names" warning.
model.predict(pd.DataFrame({
    'Mileage': [45000],
    'Age(yrs)': [4],
    'Audi A5': [0],
    'BMW X5': [0],
}))



array([36991.31721061])