In [None]:
import pandas as pd

In [None]:
# Load the home-price data set into a DataFrame.
# NOTE(review): '/content/...' looks like a Google Colab path; adjust it when running elsewhere.
df = pd.read_csv(filepath_or_buffer='/content/homeprices.csv')

In [None]:
# Show the loaded DataFrame to inspect its columns (town, area, price).
df

Unnamed: 0,town,area,price
0,monroe township,2600,550000
1,monroe township,3000,565000
2,monroe township,3200,610000
3,monroe township,3600,680000
4,monroe township,4000,725000
5,west windsor,2600,585000
6,west windsor,2800,615000
7,west windsor,3300,650000
8,west windsor,3600,710000
9,robinsville,2600,575000


In [None]:
# Machine-learning models work on numerical data.
# In our dataset there are categorical variables.
# Categorical variables are of two types.
# Nominal - Categories with no inherent order, like male/female or the cities in a state.
# Ordinal - Categories with a meaningful order, like bachelor's degree < master's degree < PhD.

In [None]:
# The first step which we will do is to convert these categorical variables into numerical.
# For this we will apply one hot encoding.

In [None]:
# One-hot encoding creates one dummy variable (a 0/1 column) per category; we store them in `dummy`.

In [None]:
# One 0/1 indicator column per distinct town.
# dtype=int is explicit because pandas >= 2.0 returns boolean (True/False) dummies by
# default; integer columns keep the 0/1 values shown below on every pandas version.
dummy = pd.get_dummies(df['town'], dtype=int)
dummy

Unnamed: 0,monroe township,robinsville,west windsor
0,1,0,0
1,1,0,0
2,1,0,0
3,1,0,0
4,1,0,0
5,0,0,1
6,0,0,1
7,0,0,1
8,0,0,1
9,0,1,0


In [None]:
# Next step is to concatenate the dummy variables onto the original dataframe.

In [None]:
# pd.concat joins pandas objects; it takes a list of them, and axis=1 ('columns')
# places the dummy columns side by side with the original columns.
merged = pd.concat(objs=[df, dummy], axis=1)

In [None]:
# Inspect the combined frame: the original columns plus one dummy column per town.
merged

Unnamed: 0,town,area,price,monroe township,robinsville,west windsor
0,monroe township,2600,550000,1,0,0
1,monroe township,3000,565000,1,0,0
2,monroe township,3200,610000,1,0,0
3,monroe township,3600,680000,1,0,0
4,monroe township,4000,725000,1,0,0
5,west windsor,2600,585000,0,0,1
6,west windsor,2800,615000,0,0,1
7,west windsor,3300,650000,0,0,1
8,west windsor,3600,710000,0,0,1
9,robinsville,2600,575000,0,1,0


In [None]:
# Now our next step is to drop the original town column, because we already have its
# information in numerical form as dummy variables.

# Note: We also need to drop one dummy variable column because of multicollinearity
# (the "dummy variable trap"): the dummy columns always sum to 1, so any one of them is
# fully determined by the others.
# Note: Even if we do not drop it, scikit-learn's LinearRegression will usually still fit,
# because its least-squares solver tolerates the redundant column; it does not remove the
# column for us, though, and the coefficients become harder to interpret. So it is good
# practice to drop one dummy column ourselves.

In [None]:
# Remove the text column and one dummy column ('west windsor' becomes the baseline town).
final = merged.drop(columns=['town', 'west windsor'])

In [None]:
# Inspect the model-ready frame: area, price and the two remaining town dummies.
final

Unnamed: 0,area,price,monroe township,robinsville
0,2600,550000,1,0
1,3000,565000,1,0
2,3200,610000,1,0
3,3600,680000,1,0
4,4000,725000,1,0
5,2600,585000,0,0
6,2800,615000,0,0
7,3300,650000,0,0
8,3600,710000,0,0
9,2600,575000,0,1


In [None]:
# Create an (unfitted) scikit-learn linear regression estimator.
from sklearn.linear_model import LinearRegression
model = LinearRegression()

In [None]:
# Training features: every column of `final` except the target column 'price'.
x = final.drop(columns='price')
x

Unnamed: 0,area,monroe township,robinsville
0,2600,1,0
1,3000,1,0
2,3200,1,0
3,3600,1,0
4,4000,1,0
5,2600,0,0
6,2800,0,0
7,3300,0,0
8,3600,0,0
9,2600,0,1


In [None]:
# Training target: the 'price' column.
y = final['price']
y

0     550000
1     565000
2     610000
3     680000
4     725000
5     585000
6     615000
7     650000
8     710000
9     575000
10    600000
11    620000
12    695000
Name: price, dtype: int64

In [None]:
# Train the regression on the features (area + town dummies) and the prices.
model.fit(X=x, y=y)

In [None]:
# Predict the price for area 2800 in robinsville (monroe township = 0, robinsville = 1).
# The model was fitted on a DataFrame, so pass a DataFrame with the same column names;
# a bare list triggers scikit-learn's "X does not have valid feature names" warning
# and hides which number belongs to which feature.
model.predict(pd.DataFrame([[2800, 0, 1]], columns=['area', 'monroe township', 'robinsville']))



array([590775.63964739])

In [None]:
# Predict the price for area 3400 in west windsor: both remaining dummies are 0
# because 'west windsor' is the dropped (baseline) dummy column.
# A DataFrame with the training column names avoids scikit-learn's
# "X does not have valid feature names" warning that a bare list triggers.
model.predict(pd.DataFrame([[3400, 0, 0]], columns=['area', 'monroe township', 'robinsville']))



array([681241.66845839])

In [None]:
model.score(x,y) # R^2 (coefficient of determination) on the training data, ~0.957; this is not classification accuracy.

0.9573929037221872

We can do the same thing using scikit-learn's OneHotEncoder. Before that, let us save the model to a file named `model_pickle` using the pickle module, just for learning purposes.

In [None]:
import pickle

# Serialize the fitted model to the file 'model_pickle' in the working directory.
with open('model_pickle', 'wb') as model_file:
    pickle.dump(model, model_file)

In [None]:
# Read the model back from 'model_pickle' into `mp`.
# Only unpickle files you trust: pickle.load can execute arbitrary code.
with open('model_pickle', 'rb') as model_file:
    mp = pickle.load(model_file)

In [None]:
# Check the reloaded model: area 3400 in west windsor (both remaining dummies are 0).
# A DataFrame with the training column names avoids scikit-learn's
# "X does not have valid feature names" warning that a bare list triggers.
mp.predict(pd.DataFrame([[3400, 0, 0]], columns=['area', 'monroe township', 'robinsville']))



array([681241.66845839])

In [None]:
# R^2 of the reloaded model on the same x, y; the value shown below equals the one
# printed for `model.score(x, y)` earlier, i.e. the pickle round-trip preserved the model.
mp.score(x,y)

0.9573929037221872

Okay, so in the code above we saved our model to a file named `model_pickle` and then loaded it back as `mp`. Now we will learn one hot encoding with scikit-learn.

In [None]:
df # Original dataframe.

Unnamed: 0,town,area,price
0,monroe township,2600,550000
1,monroe township,3000,565000
2,monroe township,3200,610000
3,monroe township,3600,680000
4,monroe township,4000,725000
5,west windsor,2600,585000
6,west windsor,2800,615000
7,west windsor,3300,650000
8,west windsor,3600,710000
9,robinsville,2600,575000


In [None]:
# LabelEncoder maps each distinct label to an integer code.
from sklearn.preprocessing import LabelEncoder
le  = LabelEncoder()

In [None]:
# Work on a copy: `dfle = df` would only create a second name for the same object,
# so assigning encoded labels to dfle.town would also overwrite the town names in df.
dfle = df.copy()
le.fit_transform(dfle.town) # It takes label column as input.

array([0, 0, 0, 0, 0, 2, 2, 2, 2, 1, 1, 1, 1])

In [None]:
# Replace the town names with their integer codes
# (per the outputs shown: 0 = monroe township, 1 = robinsville, 2 = west windsor).
dfle['town'] = le.fit_transform(dfle['town'])
dfle

Unnamed: 0,town,area,price
0,0,2600,550000
1,0,3000,565000
2,0,3200,610000
3,0,3600,680000
4,0,4000,725000
5,2,2600,585000
6,2,2800,615000
7,2,3300,650000
8,2,3600,710000
9,1,2600,575000


In [None]:
# Feature matrix as a plain NumPy array: column 0 = encoded town, column 1 = area.
x = dfle.loc[:, ['town', 'area']].values
x

array([[   0, 2600],
       [   0, 3000],
       [   0, 3200],
       [   0, 3600],
       [   0, 4000],
       [   2, 2600],
       [   2, 2800],
       [   2, 3300],
       [   2, 3600],
       [   1, 2600],
       [   1, 2900],
       [   1, 3100],
       [   1, 3600]])

In [None]:
# Target vector: the 'price' column of the label-encoded frame.
y = dfle['price']
y

0     550000
1     565000
2     610000
3     680000
4     725000
5     585000
6     615000
7     650000
8     710000
9     575000
10    600000
11    620000
12    695000
Name: price, dtype: int64

In [None]:
# Create dummy variables here.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# OneHotEncoder no longer accepts `categorical_features` (it raises TypeError on current
# scikit-learn). Selecting which columns to encode is now done with a ColumnTransformer:
#   - column 0 (the encoded town) is one-hot encoded,
#   - remainder='passthrough' keeps the other column (area) unchanged, placed after
#     the one-hot columns,
#   - sparse_threshold=0 makes fit_transform return a dense NumPy array.
# The name `ohe` is kept so that `ohe.fit_transform(x, y)` keeps working.
ohe = ColumnTransformer(
    transformers=[('town', OneHotEncoder(), [0])],
    remainder='passthrough',
    sparse_threshold=0,
)

TypeError: OneHotEncoder.__init__() got an unexpected keyword argument 'categorical_features'

In [None]:
# Fit `ohe` on x and display the transformed feature matrix.
ohe.fit_transform(x,y)