In this notebook we will use Linear Regression to predict the Z_Score_HelpfulVotes

    ML MODEL: Polynomial Regression
    TARGET: Z_Score_HelpfulVotes
    SCALER: Standard Scaler
    DATASET: FinalFeatures_latest.csv (New Dataset)

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
dataset = pd.read_csv(r"/Users/t_velpac/mission/WorkingCopy/final_csv's/FinalFeatures_latest.csv")

# The below line of code is to keep the z-scores of helpful votes and words and remove the actual values.

print(dataset.columns)
dataset = dataset[['Stars','Z_Score_Words', 'Paragraphs','No.break tags','Percentage_Upper_Case','Percentage_Lower_Case','Avg_len_paragraph_per_review','Z_Score_HelpfulVotes']]
dataset.head()

Index(['Date', 'Stars', 'Helpful Votes', 'Words', 'Z_Score_Words',
       'Paragraphs', 'No.break tags', 'Percentage_Upper_Case',
       'Percentage_Lower_Case', 'Avg_len_paragraph_per_review',
       'Z_Score_HelpfulVotes'],
      dtype='object')


Unnamed: 0,Stars,Z_Score_Words,Paragraphs,No.break tags,Percentage_Upper_Case,Percentage_Lower_Case,Avg_len_paragraph_per_review,Z_Score_HelpfulVotes
0,5,2.548167,9,11,4,89,202.555556,15.173098
1,3,-0.066097,1,0,11,79,284.0,0.219167
2,5,0.72121,1,0,1,96,701.0,0.068058
3,5,1.034114,1,0,4,92,862.0,0.056435
4,5,0.489055,2,1,5,89,280.5,0.056435


Separating the independant variables from the dependant variable which is "Helpful Votes" in this case

In [3]:
X = dataset.iloc[:,0:-1].values
y = dataset.iloc[:,-1].values

""" We will transform the data into Polynomial Features using PolynomialFeatures from sklearn"""
from sklearn.preprocessing import PolynomialFeatures
poly_reg = PolynomialFeatures()
X_poly = poly_reg.fit_transform(X)

In [4]:
"""Splitting the data into training data and testing data"""
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X_poly,y, test_size=0.2,random_state=1)

"""Scaling the data using StandardScaler from sklearn package"""
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.fit_transform(X_test)

In [5]:
""" Fitting Polynomial Regression to the training dataset"""

from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [6]:
""" Predicting the values for the TEST DATA"""
y_pred = lin_reg.predict(X_test)

y_compare = pd.DataFrame()
y_compare['Actual Values'] = y_test
y_compare['Predicted Values'] = y_pred

""" Calculating the Mean Squared Error to estimate the efficiency of the ANN"""
# We are calculating this MSE in two steps. Don't get confused.
y_compare['Mean Squared Error'] = (np.diff(y_compare.values) ** 2)
y_compare['Mean Squared Error'] = np.mean(y_compare['Mean Squared Error'])

""" Now calculating the Root Mean Squared Error(RMSE) """
y_compare['Root Mean Squared Error'] = y_compare['Mean Squared Error']**0.5

# Calculating Mean Absolute Error
y_compare['Mean Abs. Error'] = np.mean(abs(y_compare['Actual Values'] - y_compare['Predicted Values']))

y_compare

Unnamed: 0,Actual Values,Predicted Values,Mean Squared Error,Root Mean Squared Error,Mean Abs. Error
0,-0.036555,-0.070526,1.110849,1.053968,0.071857
1,-0.019120,0.027921,1.110849,1.053968,0.071857
2,-0.036555,-0.035635,1.110849,1.053968,0.071857
3,-0.013308,-0.010488,1.110849,1.053968,0.071857
4,-0.036555,-0.031237,1.110849,1.053968,0.071857
5,-0.030743,-0.016257,1.110849,1.053968,0.071857
6,-0.036555,-0.012210,1.110849,1.053968,0.071857
7,-0.036555,-0.018047,1.110849,1.053968,0.071857
8,-0.019120,-0.036841,1.110849,1.053968,0.071857
9,-0.013308,-0.027614,1.110849,1.053968,0.071857


In [7]:
""" Now we will also predict the y values on the training set just to calculate MSE and RMSE """

y_pred_train = lin_reg.predict(X_train)

""" Creating a dataframe to compare the actual values against the predicted values for the training set """
temp_train = {'Actual Values(Training)':y_train, 'Predicted Values(Training)': y_pred_train }
y_compare_train = pd.DataFrame(temp_train)

""" Calculating the Mean Squared Error to estimate the efficiency of the ANN on TRAINING SET"""
# We are calculating this MSE in two steps. Don't get confused.
y_compare_train['Mean Squared Error'] = (np.diff(y_compare_train.values) ** 2)
y_compare_train['Mean Squared Error'] = np.mean(y_compare_train['Mean Squared Error'])

# Now calculating the Root Mean Squared Error(RMSE)
y_compare_train['Root Mean Squared Error'] = y_compare_train['Mean Squared Error']**0.5

# Calculating Mean Absolute Error
y_compare_train['Mean Abs. Error'] = np.mean(abs(y_compare_train['Actual Values(Training)'] - y_compare_train['Predicted Values(Training)']))

y_compare_train

Unnamed: 0,Actual Values(Training),Predicted Values(Training),Mean Squared Error,Root Mean Squared Error,Mean Abs. Error
0,-0.036555,-0.035049,0.962413,0.981026,0.067304
1,-0.036555,-0.041759,0.962413,0.981026,0.067304
2,-0.036555,0.038025,0.962413,0.981026,0.067304
3,-0.036555,-0.035330,0.962413,0.981026,0.067304
4,-0.036555,-0.042061,0.962413,0.981026,0.067304
5,-0.036555,-0.019687,0.962413,0.981026,0.067304
6,-0.036555,-0.009291,0.962413,0.981026,0.067304
7,-0.036555,-0.058170,0.962413,0.981026,0.067304
8,-0.007496,-0.045981,0.962413,0.981026,0.067304
9,-0.036555,-0.022662,0.962413,0.981026,0.067304
