In [1]:
# Import necessary packages and functions
import pandas as pd
import seaborn as sb
import numpy  as np
import matplotlib.pyplot as plt
%matplotlib notebook

from sklearn.model_selection import train_test_split

# Import breast cancer data
# from sklearn.datasets import load_breast_cancer



# Classification Modeling

## (4) Classification Method Used:
#### K-Nearest Neighbors | Logistic Regression | Support Vector Machines | Decision Tree Classifier

## Data Set Used:
#### (Pima Indians Data set ) https://www.kaggle.com/uciml/pima-indians-diabetes-database/data

#### Read in data set and check for NA values

In [5]:
# Read-in Pima Indians Data set
pima = pd.read_csv('pima-indians-diabetes.csv', header= None, names= ['Preg', 'Glucose', 'BP', 'SkinThickness', 'Insulin', 'BMI', 'DPF', 'Age', 'Outcome'])

# view head of data
print(pima.head())
print()

# View description of data
# print(pima.describe())

# Check for NA values
pima.isnull().any().any()


   Preg  Glucose  BP  SkinThickness  Insulin   BMI    DPF  Age  Outcome
0     6      148  72             35        0  33.6  0.627   50        1
1     1       85  66             29        0  26.6  0.351   31        0
2     8      183  64              0        0  23.3  0.672   32        1
3     1       89  66             23       94  28.1  0.167   21        0
4     0      137  40             35      168  43.1  2.288   33        1



False

#### Convert data set into pandas DataFrame and seperate features from target variable

In [6]:
# Convert data set into pandas DataFrame type
pimadf = pd.DataFrame(pima)

# View data type
# print(type(pimadf))

#### Visualize Data: Feature Relationships | Feature Distributions

In [67]:
# Create a Scatter Plot Matrix of Feature Variables
# sb.pairplot(pimadf[pimadf.columns[:8]])

In [68]:
# Histogram of distributions
# pimadf[pimadf.columns[:8]].hist()

#### Shuffle data and create a Train/Test split

In [7]:
pimadf = pimadf.reindex(np.random.permutation(pimadf.index))
# pimadf.head()

In [8]:
# Seperate features from target variable
X_col_names = ['Preg', 'Glucose', 'BP', 'SkinThickness', 'Insulin', 'BMI', 'DPF', 'Age']
X_pima = pimadf[X_col_names]
y_pima = pimadf['Outcome']

In [9]:
# Split data into train/test set; training set has a 70% split
X_train, X_test, y_train, y_test = train_test_split(X_pima, y_pima,train_size= .7 , random_state=0) # Random_state set to 0 for reproducability

#### Null Accuracy Baseline

In [73]:
# What is our accuracy if we guess by most frequent target variable or 
# guess by the distribution of target variables or
# guess randomly

# Import Dummy Variable Classifier
from sklearn.dummy import DummyClassifier

# Fit dummy classifier with data (* Most frequent)
dummy_majority = DummyClassifier(strategy= 'most_frequent').fit(X_train, y_train)

# Review score
print("Null Accuracy when choosing most frequent target variable:" + " " + str(dummy_majority.score(X_test, y_test)))
print(" ")


# Fit dummy classifier with data (* Target variable distribution)
dummy_majority = DummyClassifier(strategy= 'stratified').fit(X_train, y_train)

# Review score
print("Null Accuracy when choosing target variable by distribution:" + " " + str(dummy_majority.score(X_test, y_test)))
print(" ")

# Fit dummy classifier with data (* Random guess)
dummy_majority = DummyClassifier(strategy= 'uniform').fit(X_train, y_train)

# Review score
print("Null Accuracy when choosing target variables by random:" + " " + str(dummy_majority.score(X_test, y_test)))

Null Accuracy when choosing most frequent target variable: 0.6623376623376623
 
Null Accuracy when choosing target variable by distribution: 0.5454545454545454
 
Null Accuracy when choosing target variables by random: 0.5497835497835498


# K-Nearest Neighbors

In [76]:
# Import KNN classifier
from sklearn.neighbors import KNeighborsClassifier

# Import scaling function from sklearn
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

# Scale Dependent variables and fit to KNN classifier
X_train_scaled = scaler.fit_transform(X_train)
# we must apply the scaling to the test set that we computed for the training set
X_test_scaled = scaler.transform(X_test)

# Instantiate kNN Model and fit training data
knn = KNeighborsClassifier(n_neighbors = 5)
knn.fit(X_train_scaled, y_train)
knn_predict = knn.predict(X_test_scaled)

# Evaluate training set and test set score
print('Accuracy of K-NN classifier on training set: {:.2f}'
     .format(knn.score(X_train_scaled, y_train)))
print('Accuracy of K-NN classifier on test set: {:.2f}'
     .format(knn.score(X_test_scaled, y_test)))


Accuracy of K-NN classifier on training set: 0.82
Accuracy of K-NN classifier on test set: 0.71


#### How sensitive is k-NN classification accuracy to the choice of the 'k' parameter?

In [48]:
k_range = range(1,20)
scores = []

for k in k_range:
    knn = KNeighborsClassifier(n_neighbors = k)
    knn.fit(X_train, y_train)
    scores.append(knn.score(X_test, y_test))

## Uncomment to view graph    
# plt.figure()
# plt.xlabel('k')
# plt.ylabel('accuracy')
# plt.scatter(k_range, scores)
# plt.xticks([0,5,10,15,20]);

#### KNN Classifier w/ feature scaling and updated n_neighbors value

In [89]:
# Set n_neighbors= 10; fit new knn to data, view accuracies
knn = KNeighborsClassifier(n_neighbors= 10)
knn.fit(X_train_scaled, y_train)

knn_predict = knn.predict(X_test_scaled)

print('Accuracy of K-NN classifier on training set: {:.2f}'
     .format(knn.score(X_train_scaled, y_train)))
print('Accuracy of K-NN classifier on test set: {:.2f}'
     .format(knn.score(X_test_scaled, y_test)))

Accuracy of K-NN classifier on training set: 0.79
Accuracy of K-NN classifier on test set: 0.73


#### Evaluation

In [98]:
# Perform Confusion Matrix and 5 or 10 Fold-Cross-Validation and Classification Report
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Fit Cross-Validation (CV) model with fitted classifier model and full, origional data set (X_pima & y_pima)
cv_scores = cross_val_score(knn, X_pima, y_pima, cv= 5)

print('Cross-Validation score (5-fold): {:.3f}' .format(np.mean(cv_scores)))
print(" ")
print('Confusion Matrix\n', confusion_matrix(y_test, knn.predict(X_test_scaled)))
print(" ")
print("Classification Report:\n", classification_report(y_test, knn_predict))

Cross-Validation score (5-fold): 0.734
 
Confusion Matrix
 [[139  14]
 [ 48  30]]
 
Classification Report:
              precision    recall  f1-score   support

          0       0.74      0.91      0.82       153
          1       0.68      0.38      0.49        78

avg / total       0.72      0.73      0.71       231



## Results:
#### • First, I read in the data, performed summary statistsics and checked for NA values.
#### • Second, I scaled the data by fitting and transforming the X_train set & transforming the X_train set.
#### • Next, I fitted the K-NN model with our training data and evaluated the training and tests set scores.
### ° K-NN model acheived subpar classsification accuracy:
####    Training accuracy= 77%    |  Test accuracy= 74%

# Logistic Regression (Classification)

#### Perform another train/ test split on data

In [99]:
# Split data into train/test set; training set has a 70% split
X_train, X_test, y_train, y_test = train_test_split(X_pima, y_pima,train_size= .7 , random_state=1) # Random_state set to 1 for reproducability

In [100]:
# Import Logistic Regression Model
from sklearn.linear_model import LogisticRegression

# Import scaling function from sklearn and scale data
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Instantiate Logistic Model and fit training data
logreg = LogisticRegression(C= 1.0)
logreg.fit(X_train_scaled, y_train)

# Evaluate training set and test set score
print('Accuracy of Logistic Regression classifier (C= 1.0, default) on training set: {:.2f}'
     .format(logreg.score(X_train_scaled, y_train)))
print('Accuracy of Logistic Regression classifier (C= 1.0, default) on test set: {:.2f}'
     .format(logreg.score(X_test_scaled, y_test)))
print(" ")

# Instantiate Logistic Model and fit training data
logreg = LogisticRegression(C= 0.1)
logreg.fit(X_train_scaled, y_train)

# Evaluate training set and test set score
print('Accuracy of Logistic Regression classifier (C= 0.1) on training set: {:.2f}'
     .format(logreg.score(X_train_scaled, y_train)))
print('Accuracy of Logistic Regression classifier (C= 0.1) on test set: {:.2f}'
     .format(logreg.score(X_test_scaled, y_test)))
print(" ")

# Instantiate Logistic Model and fit training data
logreg = LogisticRegression(C= 100)
logreg.fit(X_train_scaled, y_train)

# Evaluate training set and test set score
print('Accuracy of Logistic Regression classifier (C= 100.0) on training set: {:.2f}'
     .format(logreg.score(X_train_scaled, y_train)))
print('Accuracy of Logistic Regression classifier (C= 100.0) on test set: {:.2f}'
     .format(logreg.score(X_test_scaled, y_test)))


# logreg
# C= 1; Train= 77, Test= 76
# C= 0.1; Train= 67, Test= 63
# C= 100; Train= 78, Test= 77


Accuracy of Logistic Regression classifier (C= 1.0, default) on training set: 0.78
Accuracy of Logistic Regression classifier (C= 1.0, default) on test set: 0.77
 
Accuracy of Logistic Regression classifier (C= 0.1) on training set: 0.66
Accuracy of Logistic Regression classifier (C= 0.1) on test set: 0.66
 
Accuracy of Logistic Regression classifier (C= 100.0) on training set: 0.78
Accuracy of Logistic Regression classifier (C= 100.0) on test set: 0.79


#### Evaluation

## Results:
#### • First, I performed another train/ test split on the data set.
#### • Second, I scaled the data by fitting and transforming the X_train set & transforming the X_train set
#### • Next, I fitted the Logistic Regression model with our training data and evaluated the training and tests set scores
#### • Finally, I tested the accuracy of the model in relationship to varying 'C' hyperperamater values.
### ° Highest classification accuracy is acheived when hyperperamater C is set to 100:
####    C= 1.0 (training accuracy= 77%)    |  C= 1.0 (test accuracy= 76%)
####    C= 0.1 (training accuracy= 67%)    |  C= 0.1 (test accuracy= 63%)
####    C= 100.0 (training accuracy= 78%)    |  C= 100.0 (test accuracy= 77%)

# Support Vectore Machines (SVM)

#### Perform another train/ test split on data

In [43]:
# Split data into train/test set; training set has a 70% split
X_train, X_test, y_train, y_test = train_test_split(X_pima, y_pima,train_size= .7 , random_state=2) # Random_state set to 2 for reproducability

In [30]:
# Import Support Vector Machine (SVM) Model
from sklearn.svm import SVC

# Import scaling function from sklearn and scale data
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Instantiate SVM Model and fit training data
svm = SVC()
svm.fit(X_train_scaled, y_train)

# Evaluate training set and test set score
print('Accuracy of Support Vector Machine classifier (C= 1.0, default) on training set: {:.2f}'
     .format(svm.score(X_train_scaled, y_train)))
print('Accuracy of Support Vector Machine classifier (C= 1.0, default) on test set: {:.2f}'
     .format(svm.score(X_test_scaled, y_test)))
print(" ")

# Instantiate SVM Model and fit training data (different value for hyperparameter C)
svm = SVC(C= 0.1)
svm.fit(X_train_scaled, y_train)

# Evaluate training set and test set score
print('Accuracy of Support Vector Machine classifier (C= 0.1, default) on training set: {:.2f}'
     .format(svm.score(X_train_scaled, y_train)))
print('Accuracy of Support Vector Machine classifier (C= 0.1, default) on test set: {:.2f}'
     .format(svm.score(X_test_scaled, y_test)))
print(" ")

# Instantiate SVM Model and fit training data (different value for hyperparameter C)
svm = SVC(C= 100)
svm.fit(X_train_scaled, y_train)

# Evaluate training set and test set score
print('Accuracy of Support Vector Machine classifier (C= 100, default) on training set: {:.2f}'
     .format(svm.score(X_train_scaled, y_train)))
print('Accuracy of Support Vector Machine classifier (C= 100, default) on test set: {:.2f}'
     .format(svm.score(X_test_scaled, y_test)))

Accuracy of Support Vector Machine classifier (C= 1.0, default) on training set: 0.77
Accuracy of Support Vector Machine classifier (C= 1.0, default) on test set: 0.73
 
Accuracy of Support Vector Machine classifier (C= 0.1, default) on training set: 0.67
Accuracy of Support Vector Machine classifier (C= 0.1, default) on test set: 0.61
 
Accuracy of Support Vector Machine classifier (C= 100, default) on training set: 0.79
Accuracy of Support Vector Machine classifier (C= 100, default) on test set: 0.76


#### Evaluation

## Results:
#### • First, I performed another train/ test split on the data set.
#### • Second, I scaled the data by fitting and transforming the X_train set & transforming the X_train set
#### • Next, I fitted the Support Vector Machine model with our training data and evaluated the training and tests set scores
#### • Finally, I tested the accuracy of the model in relationship to varying 'C' hyperperamater values.
### ° Highest classification accuracy is acheived when hyperperamater C is set to 100:
####    C= 1.0 (training accuracy= 75%)    |  C= 1.0 (test accuracy= 80%)
####    C= 0.1 (training accuracy= 64%)    |  C= 0.1 (test accuracy= 68%)
####    C= 100.0 (training accuracy= 78%)    |  C= 100.0 (test accuracy= 81%)

# Decision Tree Classifier

#### Perform another train/ test split on data

In [31]:
# Split data into train/test set; training set has a 70% split
X_train, X_test, y_train, y_test = train_test_split(X_pima, y_pima,train_size= .7 , random_state=3) # Random_state set to 3 for reproducability

In [51]:
# Import Decision Tree classifier
from sklearn.tree import DecisionTreeClassifier

# Import scaling function from sklearn
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

# Scale Dependent variables and fit to KNN classifier
X_train_scaled = scaler.fit_transform(X_train)
# we must apply the scaling to the test set that we computed for the training set
X_test_scaled = scaler.transform(X_test)

# Instantiate Decision Tree Model and fit training data (hyperparameter 'max_depth' set= 2)
dtc = DecisionTreeClassifier(max_depth=2)
dtc.fit(X_train, y_train)

# Evaluate training set and test set score
print('Accuracy of K-NN classifier (max_depth= 2) on training set: {:.2f}'
     .format(dtc.score(X_train, y_train)))
print('Accuracy of K-NN classifier (max_depth= 2) on test set: {:.2f}'
     .format(dtc.score(X_test, y_test)))


Accuracy of K-NN classifier (max_depth= 2) on training set: 0.78
Accuracy of K-NN classifier (max_depth= 2) on test set: 0.65


#### Evaluation

## Results:

# *Below is practice code 
# (Attempting to plot accuracies of different hyperperamater values)

In [35]:
# c_values = [0.1, 1.0, 100]
# c_values_len = len(c_values)

# C_range = range(0 ,c_values_len)
# scores = []

# for C in C_range:
#     logreg = LogisticRegression(C= C)
#     logreg.fit(X_train_scaled, y_train)
#     scores.append(logreg.score(X_train_scaled, y_test))

## Uncomment to view graph    
# plt.figure()
# plt.xlabel('C')
# plt.ylabel('accuracy')
# plt.scatter(C_range, scores)
# plt.xticks([0,15,30,45,60,75,90,105]);

In [26]:
#
#
#
#
#
#
#
#
#----------------------- Regression Modeling below... -----------------------------------
#
#
#
#
#
#
#
#
#
#

# Regression Modeling
## (3) Regression Methods Used:
#### Linear Regression | Ridge Regression | Lasso Regression

## Data Set Used:
#### (Combined Cycle Power Plant Data Set) http://archive.ics.uci.edu/ml/datasets/Combined+Cycle+Power+Plant

#### Read in data set and check for NA values

In [10]:
# Read-in Power Plant data set
energy = pd.read_csv('Folds5x2_pp.csv', skiprows=1, names= ['ave_temp', 'exhaust_vacuum', 'ambient_pressure', 'relative_humidity', 'energy_output'])

# View data head and dim
print(energy.head())
print("")
print('Dimenion of data:' + str(energy.shape))


# Check for NA values
energy.isnull().any().any()


   ave_temp  exhaust_vacuum  ambient_pressure  relative_humidity  \
0      8.34           40.77           1010.84              90.01   
1     23.64           58.49           1011.40              74.20   
2     29.74           56.90           1007.15              41.91   
3     19.07           49.69           1007.22              76.79   
4     11.80           40.66           1017.13              97.20   

   energy_output  
0         480.48  
1         445.75  
2         438.76  
3         453.09  
4         464.43  

Dimenion of data:(9568, 5)


False

#### View statistical description of data set

In [112]:
# View statistical description of data set
# print(energy.describe())
print('')
print('Initial Results:')
print("All of the feature variable's values are of mixed scales. Therefore, we will scale the variables using MinMax Scale")


Initial Results:
All of the feature variable's values are of mixed scales. Therefore, we will scale the variables using MinMax Scale


### *Visualizations* -  Distribution (Histogram) | Relationships (Scatterplot)

In [76]:
# View data's distribution
# print(energy.hist())

In [50]:
# Views data's relationships between variables
# pairs = sb.pairplot(energy)

In [56]:
# Scatterplot between energy_output and average_temp
# scatter_TempAge = sb.regplot(x= 'energy_output', y= 'ave_temp', data= energy)

# Scatterplot between exhaust vacuum and average_temp    
# scatter_ExhaustAge = sb.regplot(x= 'exhaust vacuum', y= 'ave_temp', data= energy)

# Scatterplot between relative humidity and average_temp    
# scatter_ExhaustAge = sb.regplot(x= 'relative_humidity', y= 'ave_temp', data= energy)


#### Save the two most promosing variables 
#### (* Reshape the data in order for it to work with Linear Model | ave_temp= (9568, 1) & energy_output= (9568,)* )

In [11]:
# energy_output vs ave_tempt looks to have a strong relationship
# Save energy_output & ave_tempt in their own objects 

ave_temp = energy[energy.columns[0]]
energy_output = energy[energy.columns[4]]
# ave_temp
# energy_output



## Reshape the data in order for it to work with Linear Model | ave_temp= (9568, 1) & energy_output= (9568,)
## *Ignore any potential error messages
ave_temp = ave_temp.reshape(-1, 1)
# ave_temp.shape
# energy_output.shape



#### Split data into train/test sets

In [12]:
# # Shuffle data
# X_energy = ave_temp.reindex(np.random.permutation(ave_temp.index))
# y_energy = energy_output.reindex(np.random.permutation(energy_output.index))

# Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(ave_temp, energy_output,train_size= .7, random_state= 0)

#### Null Accuracy Baseline

In [16]:
# What is our accuracy if we split our data by its mean 

# Import Dummy Variable Classifier
from sklearn.dummy import DummyRegressor
# Import model evaluation metrics
from sklearn.metrics import mean_squared_error, r2_score

# Fit dummy classifier with data (* Most frequent)
dummy_majority = DummyRegressor(strategy= 'mean').fit(X_train, y_train)

y_predict_dummy_mean = dummy_majority.predict(X_test)

# Review Mean Squared Error
print("Mean Squared Error when applying Line-of-best-fit by the mean of data:" + " " + str(mean_squared_error(y_test, y_predict_dummy_mean)))
print(" ")

# Review r2 score
print("r2 score when applying Line-of-best-fit by the mean of data:" + " " + str(r2_score(y_test, y_predict_dummy_mean)))
print(" ")

# # Fit dummy classifier with data (* Target variable distribution)
# dummy_majority = DummyClassifier(strategy= 'stratified').fit(X_train, y_train)

# # Review score
# print("Null Accuracy when choosing target variable by distribution:" + " " + str(dummy_majority.score(X_test, y_test)))
# print(" ")

# # Fit dummy classifier with data (* Random guess)
# dummy_majority = DummyClassifier(strategy= 'uniform').fit(X_train, y_train)

# # Review score
# print("Null Accuracy when choosing target variables by random:" + " " + str(dummy_majority.score(X_test, y_test)))

Mean Squared Error when applying Line-of-best-fit by the mean of data: 294.96782677101066
 
r2 score when applying Line-of-best-fit by the mean of data: -0.012572169246374276
 


## Linear Regression *WITHOUT* feature scaling
#### *Only using the variables ave_temp and energy_output*

In [23]:
# Import Regression model
from sklearn.linear_model import LinearRegression

# fit scaled data to linear model
linreg = LinearRegression().fit(X_train, y_train)

y_predict_linreg = linreg.predict(X_test)

print('Energy dataset')
print('linear regression linear model intercept: {}'
     .format(linreg.intercept_))
print('linear regression linear model coeff:\n{}'
     .format(linreg.coef_))
print(" ")
print('R-squared score (test): {:.3f}'
     .format(r2_score(y_test, y_predict_linreg)))
print('Mean Squared Error score (test): {:.3f}'
     .format(mean_squared_error(y_test, y_predict_linreg)))
print(" ")
print('Number of non-zero features: {}'
     .format(np.sum(linreg.coef_ != 0)))

Energy dataset
linear regression linear model intercept: 257.0218157243215
linear regression linear model coeff:
[-1.61063462 -0.38009644  0.25642869 -0.13453288]
 
R-squared score (test): 0.922
Mean Squared Error score (test): 22.739
 
Number of non-zero features: 4


## Linear Regression *WITH*  feature scaling
#### *Only using the variables ave_temp and energy_output*

In [117]:
# Scale data feature values
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [132]:
# Import Regression model
# from sklearn.linear_model import LinearRegression

# fit scaled data to linear model
linreg = LinearRegression().fit(X_train_scaled, y_train)

print('Energy dataset')
print('linear regression linear model intercept: {}'
     .format(linreg.intercept_))
print('linear regression linear model coeff:\n{}'
     .format(linreg.coef_))
print('R-squared score (training): {:.3f}'
     .format(linreg.score(X_train_scaled, y_train)))
print('R-squared score (test): {:.3f}'
     .format(linreg.score(X_test_scaled, y_test)))
print('Number of non-zero features: {}'
     .format(np.sum(linreg.coef_ != 0)))

Energy dataset
linear regression linear model intercept: 493.16265635905836
linear regression linear model coeff:
[-76.76543094]
R-squared score (training): 0.898
R-squared score (test): 0.901
Number of non-zero features: 1


## Results:
#### • First, I read in the data set, checked for NA values, and did descriptive statistics.
#### • Second, I visualized the data ( Scatterplot & Histogram)
#### • Next, I took the variable (ave_tempt) that had a strong negative relationship w/ or target variable and save it in its own object
#### • I performed a train/test split on the data and fit to a Linear Regression Model
### ° ave_tempt showed to be a decent predictor for energy_output:
####   -  R-squared score (training): 90%   |  R-squared score (test): 90%



# Below is work-in-progress
## I believe the following code is to be used with Ridge regression and/or 'logistic' regression
#### estimatedCoefficients of each feature variable compared to 'outcome'

In [130]:
# a = pd.DataFrame(ave_temp.columns, columns= ['features'])
# # a
# a["estimatedCoefficients"] = linreg.coef_
# print(a)

## Ridge Regression

In [157]:
# View origional data set
# energy.head()

#### Seperate features from target variable

In [76]:
# Seperate features from target variable
X_col_names = ['ave_temp', 'exhaust_vacuum', 'ambient_pressure', 'relative_humidity']
X_energy = energy[X_col_names]
y_energy = energy['energy_output']

#### Split data into train/test set

In [77]:
X_train, X_test, y_train, y_test = train_test_split(X_energy, y_energy, train_size= 70, random_state= 0)

#### * Ridge Regression using all feature variables | feature values scaled*

In [78]:
# For Ridge Regression, feature scaling/ normalizing is necessary 
# Therefore, we will normalize the data set before performing Ridge Regression

# Import evaluation metrics
from sklearn.metrics import r2_score, mean_squared_error

# Import MinMaxScaler and normalize data set
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


# Import Ridge Regression Model and fit scaled data to model 
from sklearn.linear_model import Ridge

linridge = Ridge(alpha= 20.0).fit(X_train_scaled, y_train)

y_predict_linridge = linridge.predict(X_test_scaled)

# View training set and test set accuracies
print('Energy dataset')
print(" ")
print('ridge regression linear model intercept: {}'
     .format(linridge.intercept_))
print('ridge regression linear model coeff:\n{}'
     .format(linridge.coef_))
print(" ")
print('R-squared score (test): {:.3f}'
     .format(r2_score(y_test, y_predict_linridge)))
print('MeanSquaredError score (test): {:.3f}'
     .format(mean_squared_error(y_test, y_predict_linridge)))
print(" ")
print('Number of non-zero features: {}'
     .format(np.sum(linridge.coef_ != 0)))



# print('R-squared score (test): {:.3f}'
#      .format(linridge.score(X_train_scaled, y_train)))
# print('R-squared score (test): {:.3f}'
#      .format(linridge.score(X_test_scaled, y_test)))

Energy dataset
 
ridge regression linear model intercept: 457.70969440850405
ridge regression linear model coeff:
[-9.06965079 -9.51838016  4.70408301  4.50077641]
 
R-squared score (test): 0.555
0.5551450441564407
MeanSquaredError score (test): 129.589
 
Number of non-zero features: 4


#### Ridge regression with regularization parameter: alpha

In [64]:
print('Ridge regression: effect of alpha regularization parameter\n')
for this_alpha in [0, 1, 10, 20, 50, 100, 1000]:
    linridge = Ridge(alpha = this_alpha).fit(X_train_scaled, y_train)
    r2_train = linridge.score(X_train_scaled, y_train)
    r2_test = linridge.score(X_test_scaled, y_test)
    num_coeff_bigger = np.sum(abs(linridge.coef_) > 1.0)
    print('Alpha = {:.2f}\nnum abs(coeff) > 1.0: {}, \
r-squared training: {:.2f}, r-squared test: {:.2f}\n'
         .format(this_alpha, num_coeff_bigger, r2_train, r2_test))

#### Alternative GridSearch Method

In [70]:
# Import GridSearch
from sklearn.model_selection import GridSearchCV
linridge = Ridge()
grid_values = {'alpha': [0, 1, 10, 20, 50, 100, 1000]}

grid_linridge_acc = GridSearchCV(linridge, param_grid= grid_values)
grid_linridge_acc.fit(X_train_scaled, y_train)

# Print best paramater value and its accuracy
print(grid_linridge_acc.best_score_)
print(grid_linridge_acc.best_estimator_)

0.9162179415120615
Ridge(alpha=0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)


#### Re-run Ridge Regression with Alpha value that achieves highest r-squared score = 0

In [60]:
linridge = Ridge(alpha= 0.0).fit(X_train_scaled, y_train)
y_predict_linridge = linridge.predict(X_test_scaled)

# View training set and test set accuracies
print('Energy dataset')
print(" ")
print('ridge regression linear model intercept: {}'
     .format(linridge.intercept_))
print('ridge regression linear model coeff:\n{}'
     .format(linridge.coef_))
print(" ")
print('R-squared score (train): {:.3f}'
     .format(linridge.score(X_train_scaled, y_train)))
print('R-squared score (test): {:.3f}'
     .format(linridge.score(X_test_scaled, y_test)))
print('MeanSquaredError score (test): {:.3f}'
     .format(mean_squared_error(y_test, y_predict_linridge)))
print(" ")
print('Number of non-zero features: {}'
     .format(np.sum(linridge.coef_ != 0)))


Energy dataset
 
ridge regression linear model intercept: 484.7146911030029
ridge regression linear model coeff:
[-43.92200614 -14.9073823    7.40822499  -7.75716593]
 
R-squared score (train): 0.935
R-squared score (test): 0.922
MeanSquaredError score (test): 22.739
 
Number of non-zero features: 4


## Results:
#### • First, I seperated feature variables from target variable
#### • Second, I performed a train/test split on the data and fit to a Ridge Regression Model | hyperperamater 'Alpha' set= 20
### ° Initial results- train/ test set accuracies are lower than Linear Model's accuracies:
####   -  R-squared score (training): 60%   |  R-squared score (test): 55%
#### • Next, I examined the model's accuracies with varying values for hyperperamater 'Alpha'
#### •  An 'Alpha' value of 0 showed to achieve highest accuracy | Re-run model with Alpha= 0
### ° Secondary Results- With hyperperameter set= 0, Ridge Regression achieved promising accuracies:
#### -  R-squared score (training): 93%   |  R-squared score (test): 92%

# Lasso Regression

In [None]:
# View origional data set
# energy.head()

#### Seperate features from target variable

In [61]:
# Seperate features from target variable
X_col_names = ['ave_temp', 'exhaust_vacuum', 'ambient_pressure', 'relative_humidity']
X_energy = energy[X_col_names]
y_energy = energy['energy_output']

#### Split data into train/test set

In [62]:
X_train, X_test, y_train, y_test = train_test_split(X_energy, y_energy, train_size= 70, random_state= 0)

#### * Lasso Regression using all feature variables | feature values scaled*

In [63]:
# For Lasso Regression, feature scaling/ normalizing is necessary 
# Therefore, we will normalize the data set before performing Ridge Regression

# Import MinMaxScaler and normalize data set
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


# Import Ridge Regression Model and fit scaled data to model 
from sklearn.linear_model import Lasso

linlasso = Lasso(alpha= 0.5, max_iter = 10000).fit(X_train_scaled, y_train)
y_predict_linlasso = linlasso.predict(X_test_scaled)

print('Energy dataset')
print(" ")
print('ridge regression linear model intercept: {}'
     .format(linlasso.intercept_))
print('ridge regression linear model coeff:\n{}'
     .format(linlasso.coef_))
print(" ")
print('R-squared score (train): {:.3f}'
     .format(linlasso.score(X_train_scaled, y_train)))
print('R-squared score (test): {:.3f}'
     .format(linlasso.score(X_test_scaled, y_test)))
print('MeanSquaredError score (test): {:.3f}'
     .format(mean_squared_error(y_test, y_predict_linlasso)))
print(" ")
print('Features with non-zero weight (sorted by absolute magnitude):')

for e in sorted (list(zip(list(X_energy), linlasso.coef_)),
                key = lambda e: -abs(e[1])):
    if e[1] != 0:
        print('\t{}, {:.3f}'.format(e[0], e[1]))

Energy dataset
 
ridge regression linear model intercept: 479.87677209703963
ridge regression linear model coeff:
[-33.27633412 -17.32755484   0.           0.        ]
 
R-squared score (train): 0.908
R-squared score (test): 0.894
MeanSquaredError score (test): 30.981
 
Features with non-zero weight (sorted by absolute magnitude):
	ave_temp, -33.276
	exhaust_vacuum, -17.328


## Results:
#### • First, I seperated feature variables from target variable
#### • Second, I performed a train/test split on the data and fit to a Lasso Regression Model | hyperperamater 'Alpha' set= 1
### ° Initial results- train/ test set accuracies are higher than Linear Regression but lower than Ridge Model's accuracies:
####   -  R-squared score (training): 90%   |  R-squared score (test): 89%
####   - (2) feature variables showed to have non-zero values for our model: ave_temp & exhaust_vacuum
####   - Both non-zero features have a negative correlation with our dependent variable energy_output

## Polynomial regression

In [48]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.preprocessing import PolynomialFeatures


X_train, X_test, y_train, y_test = train_test_split(X_energy, y_energy, random_state = 0)
linreg = LinearRegression().fit(X_train, y_train)

print('linear model coeff (w): {}'
     .format(linreg.coef_))
print('linear model intercept (b): {:.3f}'
     .format(linreg.intercept_))
print(" ")
print('R-squared score (test): {:.3f}'
     .format(linreg.score(X_test, y_test)))
print('MeanSquaredError score (test): {:.3f}'
     .format(mean_squared_error(y_test, linreg.predict(X_test))))

print('\nNow we transform the original input data to add\n\
polynomial features up to degree 2 (quadratic)\n')
poly = PolynomialFeatures(degree=2)
X_energy_poly = poly.fit_transform(X_energy)

X_train, X_test, y_train, y_test = train_test_split(X_energy_poly, y_energy, random_state = 0)
linreg = LinearRegression().fit(X_train, y_train)
y_predict_linreg = linreg.predict(X_test)

print('(poly deg 2) linear model coeff (w):\n{}'
     .format(linreg.coef_))
print('(poly deg 2) linear model intercept (b): {:.3f}'
     .format(linreg.intercept_))
print(" ")
print('(poly deg 2) R-squared score (train): {:.3f}'
     .format(linreg.score(X_train, y_train)))
print('(poly deg 2) R-squared score (test): {:.3f}'
     .format(linreg.score(X_test, y_test)))
print('(poly deg 2) MeanSquaredError score (test): {:.3f}\n'
     .format(mean_squared_error(y_test, y_predict_linreg)))

print('\nAddition of many polynomial features often leads to\n\
overfitting, so we often use polynomial features in combination\n\
with regression that has a regularization penalty, like ridge\n\
regression.\n')

X_train, X_test, y_train, y_test = train_test_split(X_energy_poly, y_energy, random_state = 0)
linridge = Ridge().fit(X_train, y_train)
y_predict_linridge = linridge.predict(X_test)

print('(poly deg 2 + ridge) linear model coeff (w):\n{}'
     .format(linridge.coef_))
print('(poly deg 2 + ridge) linear model intercept (b): {:.3f}'
     .format(linridge.intercept_))
print(" ")
print('(poly deg 2 + ridge) R-squared score (train): {:.3f}'
     .format(linridge.score(X_train, y_train)))
print('(poly deg 2) R-squared score (test): {:.3f}'
     .format(linridge.score(X_test, y_test)))
print('(poly deg 2 + ridge) MeanSqaureError score (test): {:.3f}'
     .format(mean_squared_error(y_test, y_predict_linridge)))

linear model coeff (w): [-1.98357941 -0.23219575  0.06559288 -0.15932893]
linear model intercept (b): 451.191
 
R-squared score (test): 0.929
MeanSquaredError score (test): 20.544

Now we transform the original input data to add
polynomial features up to degree 2 (quadratic)

(poly deg 2) linear model coeff (w):
[ 0.00000000e+00 -5.86827173e+00 -2.95152189e+00  1.56380646e+01
  4.09323406e+00  1.59324371e-02  1.26142482e-02  3.22997588e-03
 -7.06194295e-03 -1.36728979e-03  2.45140439e-03  9.48825801e-04
 -7.61221143e-03 -3.76971757e-03 -2.10743929e-03]
(poly deg 2) linear model intercept (b): -7518.264
 
(poly deg 2) R-squared score (train): 0.938
(poly deg 2) R-squared score (test): 0.938
(poly deg 2) MeanSquaredError score (test): 18.034


Addition of many polynomial features often leads to
overfitting, so we often use polynomial features in combination
with regression that has a regularization penalty, like ridge
regression.

(poly deg 2 + ridge) linear model coeff (w):
[ 0.00000000

# Next, I will perform Predictions on test set to see how well our models
# generalize to new data