In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
dataset = pd.read_csv(r'/Users/t_velpac/mission/WorkingCopy/Features_V3.csv')
dataset.drop(columns = ['Z_Score_HelpfulVotes','Z_Score_Words'], inplace=True)
dataset.rename(index=str, columns={"Helpful Votes": "Helpful_Votes"}, inplace=True)
dataset = dataset[dataset.Helpful_Votes != 0]


"""
Calculating Z-Scores again for Helpful Votes and Words
"""
dataset['Z_Score_Words'] = (dataset['Words'] - dataset['Words'].mean()) / dataset['Words'].std(ddof=0)
dataset['Z_Score_Helpful_Votes'] = (dataset['Helpful_Votes'] - dataset['Helpful_Votes'].mean()) / dataset['Helpful_Votes'].std(ddof=0)

"""
Dropping Helpful Votes and Words columns since we have their z-scores
"""
dataset.drop(columns = ["Helpful_Votes", "Words", "Date"], inplace=True)
dataset.head()

Unnamed: 0,Stars,Paragraphs,No.break tags,Percentage_Upper_Case,Percentage_Lower_Case,Avg_len_paragraph_per_review,Z_Score_Words,Z_Score_Helpful_Votes
0,4,8,14,3,90,166.125,1.321147,0.550824
1,1,2,2,2,94,352.5,0.255207,0.263646
2,1,1,0,3,90,556.0,0.100722,0.263646
3,5,3,4,3,92,188.666667,0.100722,6.522436
4,1,1,0,3,96,216.0,-0.447696,0.153843


In [3]:
X = dataset.iloc[:,0:-1].values
y = dataset.iloc[:,-1].values

""" We will transform the data into Polynomial Features using PolynomialFeatures from sklearn"""
from sklearn.preprocessing import PolynomialFeatures
poly_reg = PolynomialFeatures()
X_poly = poly_reg.fit_transform(X)

In [4]:
"""Splitting the data into training data and testing data"""
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X_poly,y, test_size=0.2,random_state=0)

"""Scaling the data using StandardScaler from sklearn package"""
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.fit_transform(X_test)

In [5]:
""" Fitting Polynomial Regression to the training dataset"""

from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [6]:
""" Predicting the values for the TEST DATA"""
y_pred = lin_reg.predict(X_test)

y_compare = pd.DataFrame()
y_compare['Actual Values'] = y_test
y_compare['Predicted Values'] = y_pred

""" Calculating the Mean Squared Error to estimate the efficiency of the ANN"""
# We are calculating this MSE in two steps. Don't get confused.
y_compare['Mean Squared Error'] = (np.diff(y_compare.values) ** 2)
y_compare['Mean Squared Error'] = np.mean(y_compare['Mean Squared Error'])

""" Now calculating the Root Mean Squared Error(RMSE) """
y_compare['Root Mean Squared Error'] = y_compare['Mean Squared Error']**0.5

# Calculating Mean Absolute Error
y_compare['Mean Abs. Error'] = np.mean(abs(y_compare['Actual Values'] - y_compare['Predicted Values']))

y_compare

Unnamed: 0,Actual Values,Predicted Values,Mean Squared Error,Root Mean Squared Error,Mean Abs. Error
0,-0.010862,0.033460,1.380752,1.175054,0.113286
1,-0.053094,-0.011599,1.380752,1.175054,0.113286
2,-0.044648,-0.053855,1.380752,1.175054,0.113286
3,-0.057317,-0.027391,1.380752,1.175054,0.113286
4,-0.057317,-0.035353,1.380752,1.175054,0.113286
5,-0.057317,-0.028186,1.380752,1.175054,0.113286
6,-0.053094,-0.053520,1.380752,1.175054,0.113286
7,-0.057317,-0.033964,1.380752,1.175054,0.113286
8,-0.057317,-0.036673,1.380752,1.175054,0.113286
9,0.314325,-0.027957,1.380752,1.175054,0.113286


In [7]:
""" Now we will also predict the y values on the training set just to calculate MSE and RMSE """

y_pred_train = lin_reg.predict(X_train)

""" Creating a dataframe to compare the actual values against the predicted values for the training set """
temp_train = {'Actual Values(Training)':y_train, 'Predicted Values(Training)': y_pred_train }
y_compare_train = pd.DataFrame(temp_train)

""" Calculating the Mean Squared Error to estimate the efficiency of the ANN on TRAINING SET"""
# We are calculating this MSE in two steps. Don't get confused.
y_compare_train['Mean Squared Error'] = (np.diff(y_compare_train.values) ** 2)
y_compare_train['Mean Squared Error'] = np.mean(y_compare_train['Mean Squared Error'])

# Now calculating the Root Mean Squared Error(RMSE)
y_compare_train['Root Mean Squared Error'] = y_compare_train['Mean Squared Error']**0.5

# Calculating Mean Absolute Error
y_compare_train['Mean Abs. Error'] = np.mean(abs(y_compare_train['Actual Values(Training)'] - y_compare_train['Predicted Values(Training)']))

y_compare_train

Unnamed: 0,Actual Values(Training),Predicted Values(Training),Mean Squared Error,Root Mean Squared Error,Mean Abs. Error
0,-0.044648,-0.064204,0.89816,0.947713,0.087366
1,-0.048871,-0.031437,0.89816,0.947713,0.087366
2,-0.057317,-0.031739,0.89816,0.947713,0.087366
3,-0.057317,-0.033673,0.89816,0.947713,0.087366
4,-0.048871,-0.055628,0.89816,0.947713,0.087366
5,-0.048871,-0.030075,0.89816,0.947713,0.087366
6,0.225637,-0.061340,0.89816,0.947713,0.087366
7,-0.048871,-0.015415,0.89816,0.947713,0.087366
8,-0.048871,-0.030711,0.89816,0.947713,0.087366
9,-0.057317,-0.030929,0.89816,0.947713,0.087366
