In [25]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sea
from prettytable import PrettyTable

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

%matplotlib inline

# NOTE(review): with no category or module filter this hides every warning for the
# rest of the session, including deprecation warnings raised by library calls below.
warnings.filterwarnings('ignore')

In [26]:
# Location of the 50_Startups CSV. The original absolute path is kept as the default so
# the notebook behaves as before on the author's machine; set the STARTUPS_CSV
# environment variable to run it elsewhere without editing this cell.
DATA_PATH = os.environ.get(
    'STARTUPS_CSV',
    '/Users/ankusmanish/Desktop/Training/Datasets/Week9/50_Startups.csv',
)
data = pd.read_csv(DATA_PATH)

In [27]:
# Summarise the loaded frame: column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
R&D Spend          50 non-null float64
Administration     50 non-null float64
Marketing Spend    50 non-null float64
State              50 non-null object
Profit             50 non-null float64
dtypes: float64(4), object(1)
memory usage: 2.0+ KB


In [28]:
# Preview the first rows of the dataset.
data.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [29]:
# Count how many rows fall under each value of the categorical State column.
data['State'].value_counts()

New York      17
California    17
Florida       16
Name: State, dtype: int64

In [30]:
# Scaler instance; it is fitted on the numeric spend columns in the next cell.
sc = StandardScaler()

In [31]:
# Standardise the three numeric spend columns and write the scaled values back into `data`.
# NOTE(review): the scaler is fitted on every row here, before the train/test split that
# happens in a later cell, so statistics of the held-out rows influence the scaling.
# NOTE(review): `data` is overwritten in place; the unscaled values are only recoverable
# by re-reading the CSV.
col = ['R&D Spend', 'Administration', 'Marketing Spend']
mod_data = sc.fit_transform(data[col])
data[col] = mod_data

In [32]:
# Positional split of the frame: every column except the last is a feature,
# the last column is the target.
X = data.iloc[:, :-1].values
# Target kept as an (n_samples, 1) column vector.
y = data.iloc[:, -1].values.reshape(-1, 1)

In [33]:
# Encoder instance; the next cell fits it on column 3 of X.
lb = LabelEncoder()

In [34]:
# Replace column 3 of X (the State column, going by the column order shown by
# data.info()) with integer labels. X is modified in place.
X[:,3] = lb.fit_transform(X[:,3])

In [35]:
# Wrap the current feature array in a DataFrame.
# NOTE(review): `df` is not referenced by any of the cells visible here — confirm it is still needed.
df = pd.DataFrame(X)

In [36]:
# One-hot encode column 3 of X (the label-encoded State column) and pass the remaining
# columns through unchanged. OneHotEncoder's `categorical_features` argument was removed
# in scikit-learn 0.22, so the column selection is delegated to ColumnTransformer.
# The output layout matches the old behaviour: encoded columns first, then the
# untouched columns in their original order.
# sparse_threshold=1.0 keeps the stacked result sparse, so the `.toarray()` call applied
# to the fit_transform output continues to work.
ohe = ColumnTransformer(
    [('state', OneHotEncoder(), [3])],
    remainder='passthrough',
    sparse_threshold=1.0,
)

In [37]:
# Fit the encoder on X and densify the result (the return value exposes `.toarray()`,
# so presumably a SciPy sparse matrix).
# NOTE(review): X is overwritten with its own encoded form, so re-running this cell
# without re-running the cells above would encode the already-encoded matrix.
X = ohe.fit_transform(X).toarray()

In [38]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [39]:
# Candidate regressors, keyed by the display name that `models` prints and tabulates.
# The tree and forest estimators are seeded (same seed as the train/test split) so that
# repeated runs of the notebook report the same scores; previously they were unseeded.
algos = {
    'DecisionTreeRegressor': DecisionTreeRegressor(random_state=42),
    'RandomForestRegressor': RandomForestRegressor(random_state=42),
    'Support Vector Regressor': SVR(),
    'Linear Regression': LinearRegression(),
}

In [40]:
def _single_column_to_1d(y):
    """Return ``y`` as an array, flattening an ``(n, 1)`` column vector to shape ``(n,)``.

    Targets with any other shape (already 1-D, or with several columns) are returned
    unchanged apart from the array conversion.
    """
    y = np.asarray(y)
    if y.ndim == 2 and y.shape[1] == 1:
        return y.ravel()
    return y


def models(X_train, y_train, X_test, y_test, estimators=None):
    """Fit each candidate regressor and print its error on the test split.

    For every estimator: fit on the training data, predict the test data, and print
    MAE, MSE and RMSE. After all estimators have run, a PrettyTable with one RMSE row
    per estimator is printed.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices passed to ``fit`` and ``predict`` respectively.
    y_train, y_test : array-like
        Targets for the two splits. A single-column 2-D target is flattened to 1-D
        first, which avoids scikit-learn's column-vector warning and keeps every
        estimator's predictions 1-D.
    estimators : dict, optional
        Mapping of display name to an object exposing ``fit`` and ``predict``.
        Defaults to the notebook-level ``algos`` dictionary, which was the only
        source of estimators before this parameter existed.

    Returns
    -------
    None
        Results are printed only. The estimator objects are fitted in place.
    """
    if estimators is None:
        estimators = algos

    y_train = _single_column_to_1d(y_train)
    y_test = _single_column_to_1d(y_test)

    # These are regression models, so the summary column is labelled accordingly.
    t = PrettyTable(['Regressor', 'RMSE'])

    for key, model in estimators.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Compute each metric once and reuse it for both the printout and the table.
        mae = mean_absolute_error(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        t.add_row([key, rmse])

        print()
        print(key.upper(), '\n')
        print('MAE : {}'.format(mae))
        print('MSE : {}'.format(mse))
        print('RMSE : {}'.format(rmse))
        print('\n')
        print('-' * 100)
    print(t)


In [41]:
# Fit and score every estimator on the train/test split created above.
models(X_train, y_train, X_test, y_test)


DECISIONTREEREGRESSOR 

MAE : 12941.117333333334
MSE : 325232961.81175995
RMSE : 18034.21641801384


----------------------------------------------------------------------------------------------------

RANDOMFORESTREGRESSOR 

MAE : 7076.3592000000035
MSE : 78798508.1962492
RMSE : 8876.852381123008


----------------------------------------------------------------------------------------------------

SUPPORT VECTOR REGRESSOR 

MAE : 30119.35504163384
MSE : 1434558042.7992117
RMSE : 37875.55996680725


----------------------------------------------------------------------------------------------------

LINEAR REGRESSION 

MAE : 7395.433531523906
MSE : 84826955.035334
RMSE : 9210.154995185152


----------------------------------------------------------------------------------------------------
+--------------------------+-------------------+
|        Classifier        |        RMSE       |
+--------------------------+-------------------+
|  DecisionTreeRegressor   | 18034.21641801384 |
