In this notebook we will use Linear Regression to predict the Z_Score_HelpfulVotes

    ML MODEL: Decision Tree Regressor
    TARGET: Z_Score_HelpfulVotes
    SCALER: Standard Scaler
    DATASET: OriginalFeatures(Corrected).csv

In [1]:
import numpy as np
import pandas as pd

In [2]:
dataset = pd.read_csv(r"/Users/t_velpac/mission/WorkingCopy/final_csv_files/OriginalFeatures(Corrected).csv")

# The below line of code is to keep the z-scores of helpful votes and words and remove the actual values.

print(dataset.columns)
dataset = dataset[['Stars','Words', 'Paragraphs','No.break tags','Percentage_Upper_Case','Percentage_Lower_Case','Avg_len_paragraph_per_review','Z_Score_HelpfulVotes']]
dataset.head()

Index(['Date', 'Stars', 'Helpful_Votes', 'Words', 'Z_Score_Words',
       'Paragraphs', 'Sentiment_Polarity', 'No.break tags',
       'Percentage_Upper_Case', 'Percentage_Lower_Case',
       'Avg_len_paragraph_per_review', 'Z_Score_Paragraphs',
       'Z_Score_HelpfulVotes'],
      dtype='object')


Unnamed: 0,Stars,Words,Paragraphs,No.break tags,Percentage_Upper_Case,Percentage_Lower_Case,Avg_len_paragraph_per_review,Z_Score_HelpfulVotes
0,4,268,8,14,3,90,166.125,6.515421
1,1,130,2,2,2,94,352.5,3.557805
2,1,110,1,0,3,90,556.0,3.557805
3,5,110,3,4,3,92,188.666667,70.01453
4,1,39,1,0,3,96,216.0,2.347872


In [3]:
X = dataset.iloc[:,0:-1].values
y = dataset.iloc[:,-1].values

In [4]:
"""Splitting the data into training data and testing data"""
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y, test_size=0.2,random_state=1)

"""Scaling the data using StandardScaler from sklearn package"""
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.fit_transform(X_test)

In [5]:
# importing keras and other required classes
import keras
from sklearn.tree import DecisionTreeRegressor

Using TensorFlow backend.


In [6]:
""" Creating an object of SVR"""
regressor = DecisionTreeRegressor()

In [7]:
""" Fitting the model to our training data"""
regressor.fit(X_train, y_train)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [8]:
""" Predicting the values for the TEST DATA"""
y_pred = regressor.predict(X_test)

y_compare = pd.DataFrame()
y_compare['Actual Values'] = y_test
y_compare['Predicted Values'] = y_pred

""" Calculating the Mean Squared Error to estimate the efficiency of the Regressor"""
# We are calculating this MSE in two steps. Don't get confused.
y_compare['Mean Squared Error'] = (np.diff(y_compare.values) ** 2)
y_compare['Mean Squared Error'] = np.mean(y_compare['Mean Squared Error'])

""" Now calculating the Root Mean Squared Error(RMSE) """
y_compare['Root Mean Squared Error'] = y_compare['Mean Squared Error']**0.5

# Calculating Mean Absolute Error
y_compare['Mean Abs. Error'] = np.mean(abs(y_compare['Actual Values'] - y_compare['Predicted Values']))

y_compare

Unnamed: 0,Actual Values,Predicted Values,Mean Squared Error,Root Mean Squared Error,Mean Abs. Error
0,-0.051497,-0.061395,1.91056,1.38223,0.157869
1,-0.078543,-0.059136,1.91056,1.38223,0.157869
2,-0.008194,-0.034795,1.91056,1.38223,0.157869
3,-0.091704,-0.091704,1.91056,1.38223,0.157869
4,-0.091704,-0.102283,1.91056,1.38223,0.157869
5,-0.072782,-0.110655,1.91056,1.38223,0.157869
6,-0.051497,-0.059294,1.91056,1.38223,0.157869
7,-0.029573,-0.085480,1.91056,1.38223,0.157869
8,-0.184994,-0.112014,1.91056,1.38223,0.157869
9,-0.048927,-0.066602,1.91056,1.38223,0.157869


In [9]:
""" Now we will also predict the y values on the training set just to calculate MSE and RMSE """

y_pred_train = regressor.predict(X_train)

""" Creating a dataframe to compare the actual values against the predicted values for the training set """
temp_train = {'Actual Values(Training)':y_train, 'Predicted Values(Training)': y_pred_train }
y_compare_train = pd.DataFrame(temp_train)

""" Calculating the Mean Squared Error to estimate the efficiency of the ANN on TRAINING SET"""
# We are calculating this MSE in two steps. Don't get confused.
y_compare_train['Mean Squared Error'] = (np.diff(y_compare_train.values) ** 2)
y_compare_train['Mean Squared Error'] = np.mean(y_compare_train['Mean Squared Error'])

# Now calculating the Root Mean Squared Error(RMSE)
y_compare_train['Root Mean Squared Error'] = y_compare_train['Mean Squared Error']**0.5

# Calculating Mean Absolute Error
y_compare_train['Mean Abs. Error'] = np.mean(abs(y_compare_train['Actual Values(Training)'] - y_compare_train['Predicted Values(Training)']))

y_compare_train

Unnamed: 0,Actual Values(Training),Predicted Values(Training),Mean Squared Error,Root Mean Squared Error,Mean Abs. Error
0,-0.159438,-0.159438,0.013335,0.115479,0.015508
1,-0.379656,-0.379656,0.013335,0.115479,0.015508
2,-0.379656,-0.078067,0.013335,0.115479,0.015508
3,-0.188500,-0.188500,0.013335,0.115479,0.015508
4,-0.050341,-0.069359,0.013335,0.115479,0.015508
5,-0.078494,-0.078494,0.013335,0.115479,0.015508
6,0.048348,0.048348,0.013335,0.115479,0.015508
7,-0.066602,-0.066602,0.013335,0.115479,0.015508
8,-0.032077,-0.032077,0.013335,0.115479,0.015508
9,-0.040846,0.014053,0.013335,0.115479,0.015508
