### Regression Demonstration

In [15]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [16]:
salary = pd.read_csv('data/regression-salaries.csv')
salary.head()

Unnamed: 0,Position,Level,Salary
0,Business Analyst,1,45000
1,Junior Consultant,2,50000
2,Senior Consultant,3,60000
3,Manager,4,80000
4,Country Manager,5,110000


In [17]:
startup = pd.read_csv('data/multiple-linear-startup.csv')
startup.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [18]:
startup.columns

Index(['R&D Spend', 'Administration', 'Marketing Spend', 'State', 'Profit'], dtype='object')

In [19]:
startup = startup.rename(columns={"R&D Spend": "Rnd", "Marketing Spend": "Mktg", "Administration" : "Admin"})
startup.head()

Unnamed: 0,Rnd,Admin,Mktg,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


### Preprocessing
* Check for Missing Value
* Encode any categorical values 
* split the data into ratio of 65:35 for training and testing

In [20]:
# Checking for Missing Values
np.sum(startup.isna())

Rnd       0
Admin     0
Mktg      0
State     0
Profit    0
dtype: int64

WE have multiple techniques to encode the categorical data 

* Label Encoding or Ordinal Encoding
* One hot Encoding  ( For Each category - a column/feature is created)
* Dummy Encoding ( Similar to one hot but uses n-1 features for n categories)
* Effect Encoding
* Binary Encoding
* BaseN Encoding
* Hash Encoding
* Target Encoding

In [21]:
startup_encoded = pd.get_dummies(data=startup, drop_first=False)
startup_encoded.head()

Unnamed: 0,Rnd,Admin,Mktg,Profit,State_California,State_Florida,State_New York
0,165349.2,136897.8,471784.1,192261.83,0,0,1
1,162597.7,151377.59,443898.53,191792.06,1,0,0
2,153441.51,101145.55,407934.54,191050.39,0,1,0
3,144372.41,118671.85,383199.62,182901.99,0,0,1
4,142107.34,91391.77,366168.42,166187.94,0,1,0


### One Hot Encoding using sklearn

In [22]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

coltfr = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [3])], remainder='passthrough')
# What to do - Encoding
# Which type of encoding ? One hot
# Index of columne which needs transfomration
# remainder - tells what to do for the columns which are not encoded. 

encoded_startup = np.array(coltfr.fit_transform(startup))

In [30]:
print(encoded_startup)[10]

[[0.0000000e+00 0.0000000e+00 1.0000000e+00 1.6534920e+05 1.3689780e+05
  4.7178410e+05 1.9226183e+05]
 [1.0000000e+00 0.0000000e+00 0.0000000e+00 1.6259770e+05 1.5137759e+05
  4.4389853e+05 1.9179206e+05]
 [0.0000000e+00 1.0000000e+00 0.0000000e+00 1.5344151e+05 1.0114555e+05
  4.0793454e+05 1.9105039e+05]
 [0.0000000e+00 0.0000000e+00 1.0000000e+00 1.4437241e+05 1.1867185e+05
  3.8319962e+05 1.8290199e+05]
 [0.0000000e+00 1.0000000e+00 0.0000000e+00 1.4210734e+05 9.1391770e+04
  3.6616842e+05 1.6618794e+05]
 [0.0000000e+00 0.0000000e+00 1.0000000e+00 1.3187690e+05 9.9814710e+04
  3.6286136e+05 1.5699112e+05]
 [1.0000000e+00 0.0000000e+00 0.0000000e+00 1.3461546e+05 1.4719887e+05
  1.2771682e+05 1.5612251e+05]
 [0.0000000e+00 1.0000000e+00 0.0000000e+00 1.3029813e+05 1.4553006e+05
  3.2387668e+05 1.5575260e+05]
 [0.0000000e+00 0.0000000e+00 1.0000000e+00 1.2054252e+05 1.4871895e+05
  3.1161329e+05 1.5221177e+05]
 [1.0000000e+00 0.0000000e+00 0.0000000e+00 1.2333488e+05 1.0867917e+05
 

TypeError: 'NoneType' object is not subscriptable

In [35]:
df = pd.DataFrame(encoded_startup, 
                  columns =['State_California','State_Florida','State_Newyork','Rnd','Admin','mktg','profit'] )  

df.head()

Unnamed: 0,State_California,State_Florida,State_Newyork,Rnd,Admin,mktg,profit
0,0.0,0.0,1.0,165349.2,136897.8,471784.1,192261.83
1,1.0,0.0,0.0,162597.7,151377.59,443898.53,191792.06
2,0.0,1.0,0.0,153441.51,101145.55,407934.54,191050.39
3,0.0,0.0,1.0,144372.41,118671.85,383199.62,182901.99
4,0.0,1.0,0.0,142107.34,91391.77,366168.42,166187.94


In [None]:
.