In [1]:
%matplotlib inline

from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import matplotlib.pylab as plt
# !pip install dmba
from dmba import classificationSummary


# Neural Networks

## 1. Preprocess the data

### a. Create categorical and dummy variables where appropriate.

In [2]:
toyota = pd.read_csv('ToyotaCorolla.csv')

toyota.Model = toyota.Model.astype('category')
toyota.Fuel_Type = toyota.Fuel_Type.astype('category')
toyota.Color = toyota.Color.astype('category')

toyota = toyota.drop(columns=['Id','Model'])

# toyota = pd.get_dummies(data=toyota, columns=["Mfr_Guarantee", "BOVAG_Guarantee", 
#                                      'ABS','Airbag_1','Airbag_2','Airco','Automatic_airco','Boardcomputer',
#                                      'CD_Player','Central_Lock','Powered_Windows','Power_Steering','Radio',
#                                      'Mistlamps','Sport_Model','Backseat_Divider','Metallic_Rim','Radio_cassette',
#                                      'Parking_Assistant','Tow_Bar'], drop_first=True)


In [3]:
toyota = pd.get_dummies(data=toyota, drop_first=True)
toyota.head()

Unnamed: 0,Price,Age_08_04,Mfg_Month,Mfg_Year,KM,HP,Met_Color,Automatic,CC,Doors,...,Fuel_Type_Petrol,Color_Black,Color_Blue,Color_Green,Color_Grey,Color_Red,Color_Silver,Color_Violet,Color_White,Color_Yellow
0,13500,23,10,2002,46986,90,1,0,2000,3,...,0,0,1,0,0,0,0,0,0,0
1,13750,23,10,2002,72937,90,1,0,2000,3,...,0,0,0,0,0,0,1,0,0,0
2,13950,24,9,2002,41711,90,1,0,2000,3,...,0,0,1,0,0,0,0,0,0,0
3,14950,26,7,2002,48000,90,0,0,2000,3,...,0,1,0,0,0,0,0,0,0,0
4,13750,30,3,2002,38500,90,0,0,2000,3,...,0,1,0,0,0,0,0,0,0,0


### b. Scale the data using MinMaxScaler().

In [4]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

In [5]:
scaler.fit(toyota)

MinMaxScaler()

### c.	Partition the data into X & y data frames and train_test_split.

In [6]:
from sklearn.model_selection import train_test_split
train_df, valid_df = train_test_split(toyota, test_size=0.4, random_state=1)

In [7]:
X_train = train_df[['Age_08_04', 'KM', 'Fuel_Type_Diesel','Fuel_Type_Petrol', 'HP', 'Automatic',
                            'Doors', 'Quarterly_Tax', 'Mfr_Guarantee', 'Guarantee_Period', 'Airco',
                            'Automatic_airco', 'CD_Player', 'Powered_Windows', 'Sport_Model', 'Tow_Bar']]
y_train = train_df['Price']


X_valid = valid_df[['Age_08_04', 'KM', 'Fuel_Type_Diesel','Fuel_Type_Petrol', 'HP', 'Automatic',
                            'Doors', 'Quarterly_Tax', 'Mfr_Guarantee', 'Guarantee_Period', 'Airco',
                            'Automatic_airco', 'CD_Player', 'Powered_Windows', 'Sport_Model', 'Tow_Bar']]
y_valid = valid_df['Price']

## 2.	Fit a neural network model to the data. Use a single hidden layer with two nodes.


In [8]:
from sklearn.neural_network import MLPClassifier
clf = MLPClassifier(activation='logistic',solver='lbfgs',hidden_layer_sizes=(2,),random_state=1)
clf.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


MLPClassifier(activation='logistic', hidden_layer_sizes=(2,), random_state=1,
              solver='lbfgs')

## 3. 	Calculate the RMSE for the training and validation data.

In [9]:
from sklearn.metrics import mean_squared_error

y_train_pred = clf.predict(X_train)
mean_squared_error(y_train, y_train_pred, squared=False)

3328.5162047728627

In [10]:
y_valid_pred = clf.predict(X_valid)
mean_squared_error(y_valid, y_valid_pred, squared=False)

3108.1143597112828

## 4.	Repeat the process (steps 2 and 3) for a single hidden layer with 5 nodes.  How does the RMSE change for the training and validation data?

In [11]:
clf2 = MLPClassifier(activation='logistic',solver='lbfgs',hidden_layer_sizes=(5,),random_state=1)
clf2.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


MLPClassifier(activation='logistic', hidden_layer_sizes=(5,), random_state=1,
              solver='lbfgs')

In [12]:
y_train_pred2 = clf2.predict(X_train)
mean_squared_error(y_train, y_train_pred2, squared=False)

3335.240128777626

In [13]:
y_valid_pred2 = clf2.predict(X_valid)
mean_squared_error(y_valid, y_valid_pred2, squared=False)

3102.3001360140906

# Discriminant Analysis

## 1.	Partition the data into training and validation sets, then perform a discriminant analysis on the training data using only the following predictors: 
predictors = ['our', 'C!', 'hpl', 'free', 'hp', 'your', 'you ', 'george', 'CAP_avg', 'CAP_long', 'CAP_tot'] 
(Note: You will need to determine how to fit the model on the train_test_split data, but in class we fit on the entire dataset.

In [14]:
spam = pd.read_csv("Spambase.csv")
spam.head()

Unnamed: 0,make,address,all,W_3d,our,over,remove,internet,order,mail,...,C;,C(,C[,C!,C$,C#,CAP_avg,CAP_long,CAP_tot,Spam
0,0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.778,0.0,0.0,3.756,61,278,1
1,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.132,0.0,0.372,0.18,0.048,5.114,101,1028,1
2,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.01,0.143,0.0,0.276,0.184,0.01,9.821,485,2259,1
3,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.137,0.0,0.137,0.0,0.0,3.537,40,191,1
4,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.135,0.0,0.135,0.0,0.0,3.537,40,191,1


In [15]:
predictors = ['our', 'C!', 'hpl', 'free', 'hp', 'your', 'you ', 'george', 'CAP_avg', 'CAP_long', 'CAP_tot']
train_df, valid_df = train_test_split(spam, test_size=0.4, random_state=1)

In [16]:
X_train = train_df[predictors]
y_train = train_df['Spam']

X_valid = valid_df[predictors]
y_valid = valid_df['Spam']

In [17]:
clf = LinearDiscriminantAnalysis()
clf.fit(X_train, y_train)

LinearDiscriminantAnalysis()

## 2. If we are interested mainly in detecting spam messages, is this model useful? Use the confusion matrix. 

In [18]:
classificationSummary(y_valid, clf.predict(X_valid))

Confusion Matrix (Accuracy 0.8332)

       Prediction
Actual    0    1
     0 1030   68
     1  239  504


**The model is quite useful becuase it has an accuracy of ~83%.**

## 3. In the sample, almost 40% of the e-mail messages were tagged as spam. However, suppose that the actual proportion of spam messages in these e-mail accounts is 10%. Perform the discriminant analysis using these prior probabilities.

In [19]:
priors = np.array([0.9, 0.1])

In [20]:
clf2 = LinearDiscriminantAnalysis(priors=priors)
clf2.fit(X_train, y_train)

LinearDiscriminantAnalysis(priors=array([0.9, 0.1]))