In this notebook we will use Linear Regression to predict the Z_Score_HelpfulVotes

    ML MODEL: Support Vector Regressor
    TARGET: Z_Score_HelpfulVotes
    SCALER: Standard Scaler
    DATASET: FinalFeatures.csv (Old Dataset)

In [2]:
import numpy as np
import pandas as pd

In [3]:
dataset = pd.read_csv(r'/Users/t_velpac/mission/WorkingCopy/FinalFeatures.csv')

# The below line of code is to keep the z-scores of helpful votes and words and remove the actual values.

print(dataset.columns)
dataset = dataset[['Stars','Z_Score_Words', 'Paragraphs','No.break tags','Percentage_Upper_Case','Percentage_Lower_Case','Avg_len_paragraph_per_review','Z_Score_HelpfulVotes']]
dataset.head()

Index(['Date', 'Stars', 'Helpful Votes', 'Z_Score_HelpfulVotes', 'Words',
       'Z_Score_Words', 'Paragraphs', 'No.break tags', 'Percentage_Upper_Case',
       'Percentage_Lower_Case', 'Avg_len_paragraph_per_review'],
      dtype='object')


Unnamed: 0,Stars,Z_Score_Words,Paragraphs,No.break tags,Percentage_Upper_Case,Percentage_Lower_Case,Avg_len_paragraph_per_review,Z_Score_HelpfulVotes
0,3,6.453577,1,0,3,93,3087.0,-0.235881
1,5,1.394079,3,4,3,91,300.0,0.915696
2,4,3.666459,4,6,4,90,468.5,1.491485
3,4,8.525083,11,20,3,91,394.272727,5.522007
4,5,1.795826,2,1,6,91,492.0,0.339908


In [4]:
X = dataset.iloc[:,0:-1].values
y = dataset.iloc[:,-1].values

In [5]:
"""Splitting the data into training data and testing data"""
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y, test_size=0.2,random_state=0)

"""Scaling the data using StandardScaler from sklearn package"""
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.fit_transform(X_test)

In [6]:
# importing keras and other required classes
import keras
from sklearn.svm import SVR

Using TensorFlow backend.


In [7]:
""" Creating an object of SVR"""
regressor = SVR()

In [8]:
""" Fitting the model to our training data"""
regressor.fit(X_train, y_train)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
  gamma='auto_deprecated', kernel='rbf', max_iter=-1, shrinking=True,
  tol=0.001, verbose=False)

In [9]:
""" Predicting the values for the TEST DATA"""
y_pred = regressor.predict(X_test)

y_compare = pd.DataFrame()
y_compare['Actual Values'] = y_test
y_compare['Predicted Values'] = y_pred

""" Calculating the Mean Squared Error to estimate the efficiency of the ANN"""
# We are calculating this MSE in two steps. Don't get confused.
y_compare['Mean Squared Error'] = (np.diff(y_compare.values) ** 2)
y_compare['Mean Squared Error'] = np.mean(y_compare['Mean Squared Error'])

""" Now calculating the Root Mean Squared Error(RMSE) """
y_compare['Root Mean Squared Error'] = y_compare['Mean Squared Error']**0.5
y_compare

Unnamed: 0,Actual Values,Predicted Values,Mean Squared Error,Root Mean Squared Error
0,0.101746,0.254126,0.516235,0.718495
1,-0.211198,-0.205036,0.516235,0.718495
2,-0.258092,-0.317339,0.516235,0.718495
3,-0.196210,-0.064771,0.516235,0.718495
4,-0.211198,-0.226142,0.516235,0.718495
5,1.425145,1.243588,0.516235,0.718495
6,-0.211198,-0.173897,0.516235,0.718495
7,-0.235881,-0.208536,0.516235,0.718495
8,-0.211198,-0.232042,0.516235,0.718495
9,-0.258092,-0.303635,0.516235,0.718495


In [10]:
""" Now we will also predict the y values on the training set just to calculate MSE and RMSE """

y_pred_train = regressor.predict(X_train)

""" Creating a dataframe to compare the actual values against the predicted values for the training set """
temp_train = {'Actual Values(Training)':y_train, 'Predicted Values(Training)': y_pred_train }
y_compare_train = pd.DataFrame(temp_train)

""" Calculating the Mean Squared Error to estimate the efficiency of the ANN on TRAINING SET"""
# We are calculating this MSE in two steps. Don't get confused.
y_compare_train['Mean Squared Error'] = (np.diff(y_compare_train.values) ** 2)
y_compare_train['Mean Squared Error'] = np.mean(y_compare_train['Mean Squared Error'])

# Now calculating the Root Mean Squared Error(RMSE)
y_compare_train['Root Mean Squared Error'] = y_compare_train['Mean Squared Error']**0.5
y_compare_train

Unnamed: 0,Actual Values(Training),Predicted Values(Training),Mean Squared Error,Root Mean Squared Error
0,-0.211198,-0.261942,0.289169,0.537745
1,-0.211198,-0.233124,0.289169,0.537745
2,-0.258092,-0.267873,0.289169,0.537745
3,-0.211198,-0.155992,0.289169,0.537745
4,-0.258092,-0.215844,0.289169,0.537745
5,-0.258092,-0.333126,0.289169,0.537745
6,-0.211198,-0.224573,0.289169,0.537745
7,-0.258092,-0.205799,0.289169,0.537745
8,-0.258092,-0.276872,0.289169,0.537745
9,-0.258092,-0.323810,0.289169,0.537745
