In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Load dataset to a pandas DataFrame
path_to_data = 'https://archive.ics.uci.edu/ml/machine-learning-databases/car/car.data'
column_names = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'accep']
df = pd.read_csv(path_to_data, names=column_names)

target_column = 'accep'
raw_feature_columns = [col for col in column_names if col != target_column]

# Create dummy variables from the feature columns
X = pd.get_dummies(df[raw_feature_columns], drop_first=True)

# Convert target column to binary variable; 0 if 'unacc', 1 otherwise
df[target_column] = np.where(df[target_column] == 'unacc', 0, 1)
y = df[target_column]

# Split the full dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123, test_size=0.3)


# 1. Create a decision stump base model using the Decision Tree Classifier and print its parameters
decision_stump = DecisionTreeClassifier(max_depth = 1)
print(decision_stump.get_params())
# 2. Create an Adaptive Boost Classifier and print its parameters
# ada_classifier = AdaBoostClassifier(base_estimator = decision_stump, n_estimators =5 )
ada_classifier = AdaBoostClassifier(n_estimators =5 )

print(ada_classifier.get_params())

# 3. Fit the Adaptive Boost Classifier to the training data and get the list of predictions
ada_classifier.fit(X_train,y_train)
y_pred = ada_classifier.predict(X_test)

# 4. Calculate the accuracy, precision, recall, and f1-score on the testing data
accuracy = accuracy_score(y_test,y_pred)
precision = precision_score(y_test,y_pred)
recall = recall_score(y_test,y_pred)
f1 = f1_score(y_test,y_pred)

print(f'Test set accuracy:\t{accuracy}')
print(f'Test set precision:\t{precision}')
print(f'Test set recall:\t{recall}')
print(f'Test set f1-score:\t{f1}')

# 5. Remove the comments from the following code block to print the confusion matrix
test_conf_matrix = pd.DataFrame(
    confusion_matrix(y_test, y_pred, labels=[1, 0]), 
    index=['actual yes', 'actual no'], 
    columns=['predicted yes', 'predicted no']
)
print(f'Confusion Matrix:\n{test_conf_matrix.to_string()}')

{'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': 1, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'random_state': None, 'splitter': 'best'}
{'algorithm': 'deprecated', 'estimator': None, 'learning_rate': 1.0, 'n_estimators': 5, 'random_state': None}
Test set accuracy:	0.7957610789980732
Test set precision:	0.7264150943396226
Test set recall:	0.5
Test set f1-score:	0.5923076923076923
Confusion Matrix:
            predicted yes  predicted no
actual yes             77            77
actual no              29           336


In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Load dataset to a pandas DataFrame
path_to_data = 'https://archive.ics.uci.edu/ml/machine-learning-databases/car/car.data'
column_names = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'accep']

df = pd.read_csv(path_to_data, names=column_names)
target_column = 'accep'
raw_feature_columns = [col for col in column_names if col != target_column]

# Create dummy variables from the feature columns
X = pd.get_dummies(df[raw_feature_columns], drop_first=True)

# Convert target column to binary variable; 0 if 'unacc', 1 otherwise
df[target_column] = np.where(df[target_column] == 'unacc', 0, 1)
y = df[target_column]

# Split the full dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123, test_size=0.3)

# 1. Create a Gradient Boosting Classifier and print its parameters
grad_classifier = GradientBoostingClassifier(n_estimators=15)
print(grad_classifier.get_params())
# 2. Fit the Gradient Boosted Trees Classifier to the training data and get the list of predictions
grad_classifier.fit(X_train,y_train)
y_pred = grad_classifier.predict(X_test)
# 3. Calculate the accuracy, precision, recall, and f1-score on the testing data
accuracy = accuracy_score(y_test,y_pred)
precision = precision_score(y_test,y_pred)
recall = recall_score(y_test,y_pred)
f1 = f1_score(y_test,y_pred)
print(f'Test set accuracy:\t{accuracy}')
print(f'Test set precision:\t{precision}')
print(f'Test set recall:\t{recall}')
print(f'Test set f1-score:\t{f1}')

# 4. Remove the comments from the following code block to print the confusion matrix

test_conf_matrix = pd.DataFrame(
   confusion_matrix(y_test, y_pred, labels=[1, 0]), 
   index=['actual yes', 'actual no'], 
   columns=['predicted yes', 'predicted no']
)

print(f'Confusion Matrix:\n{test_conf_matrix.to_string()}')

{'ccp_alpha': 0.0, 'criterion': 'friedman_mse', 'init': None, 'learning_rate': 0.1, 'loss': 'log_loss', 'max_depth': 3, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 15, 'n_iter_no_change': None, 'random_state': None, 'subsample': 1.0, 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}
Test set accuracy:	0.8978805394990366
Test set precision:	0.7885714285714286
Test set recall:	0.8961038961038961
Test set f1-score:	0.8389057750759878
Confusion Matrix:
            predicted yes  predicted no
actual yes            138            16
actual no              37           328


In [5]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

columns = ["sex","length","diam","height","whole","shucked","viscera","shell","age"]
df = pd.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data",names=columns)

y = df.age
X=df.drop(columns=['age'])
num_cols = X.select_dtypes(include=np.number).columns
cat_cols = X.select_dtypes(include=['object']).columns
#create some missing values
for i in range(1000):
    X.loc[np.random.choice(X.index),np.random.choice(X.columns)] = np.nan

x_train, x_test, y_train, y_test = train_test_split(X,y, random_state=0, test_size=0.25)

#1. Create a pipeline `num_vals` to process numerical data

num_vals = Pipeline([("imputer",SimpleImputer()), ("scale",StandardScaler())])

#2. Create a pipeline `cat_vals` to process categorical data
cat_vals = Pipeline([("imputer",SimpleImputer(strategy = 'most_frequent')), ("ohe",OneHotEncoder(drop = 'first'))])


#3. Create a column transformer, `preprocess` with the numerical and categorical pipelines
preprocess = ColumnTransformer(
    transformers=[
        ("num_preprocess", num_vals, num_cols),
        ("cat_preprocess", cat_vals, cat_cols)      
    ]
)


#4. Fit the preprocess transformer to training data
preprocess.fit(x_train)
#Transform the test data
x_transform = preprocess.transform(x_test)






In [7]:
import numpy as np
import pandas as pd

from sklearn import svm, datasets
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score

columns = ["sex","length","diam","height","whole","shucked","viscera","shell","age"]
df = pd.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data",names=columns)

y = df.age
X=df.drop(columns=['age'])
num_cols = X.select_dtypes(include=np.number).columns
cat_cols = X.select_dtypes(include=['object']).columns
#create some missing values
for i in range(1000):
    X.loc[np.random.choice(X.index),np.random.choice(X.columns)] = np.nan

x_train, x_test, y_train, y_test = train_test_split(X,y, random_state=0, test_size=0.25)

cat_vals = Pipeline([("imputer",SimpleImputer(strategy='most_frequent')), ("ohe",OneHotEncoder( drop='first'))])

num_vals = Pipeline([("imputer",SimpleImputer(strategy='mean')), ("scale",StandardScaler())])

preprocess = ColumnTransformer(
    transformers=[
        ("cat_process", cat_vals, cat_cols),
        ("num_process", num_vals, num_cols)
    ]
)
#1. Create a pipeline with `preprocess` and a linear regression model, `regr`
pipeline = Pipeline([("preprocess",preprocess), 
                     ("regr",LinearRegression())])

#2. Fit the pipeline on the training data and predict on the test data

pipeline.fit(x_train, y_train)
y_pred = pipeline.predict(x_test)

#3. Calculate pipeline score and compare to estimator score
#Pipeline score
pipeline_score = pipeline.score(x_test,y_test)
print(pipeline_score)

#r-squared score
r2_score = r2_score(y_test, y_pred)
print(r2_score)


0.49707008235213357
0.49707008235213357


In [8]:
import numpy as np
import pandas as pd

from sklearn import svm, datasets
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn import metrics

columns = ["sex","length","diam","height","whole","shucked","viscera","shell","age"]
df = pd.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data",names=columns)

y = df.age
X=df.drop(columns=['age'])
num_cols = X.select_dtypes(include=np.number).columns
cat_cols = X.select_dtypes(include=['object']).columns
#create some missing values
for i in range(1000):
    X.loc[np.random.choice(X.index),np.random.choice(X.columns)] = np.nan

x_train, x_test, y_train, y_test = train_test_split(X,y, random_state=0, test_size=0.25)

cat_vals = Pipeline([("imputer",SimpleImputer(strategy='most_frequent')), ("ohe",OneHotEncoder(drop='first'))])
num_vals = Pipeline([("imputer",SimpleImputer(strategy='mean')), ("scale",StandardScaler())])

preprocess = ColumnTransformer(
    transformers=[
        ("cat_process", cat_vals, cat_cols),
        ("num_process", num_vals, num_cols)
    ]
)

#Create a pipeline with pregrocess and a linear regression model
pipeline = Pipeline([("preprocess",preprocess), 
                     ("regr",LinearRegression())])

#Very simple parameter grid, with and without the intercept
param_grid = {
    "regr__fit_intercept": [True,False]
}

#------------------------------------------------
#1. Grid search using previous pipeline
gs = GridSearchCV(pipeline, param_grid, scoring='neg_mean_squared_error', cv=5)

#2. fit grid using training data and print best score
gs.fit(x_train, y_train)
best_score = gs.best_score_
best_params = gs.best_params_
print(best_score, best_params)




-5.418881902852317 {'regr__fit_intercept': True}


In [10]:
import numpy as np
import pandas as pd

from sklearn import svm, datasets
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn import metrics

columns = ["sex","length","diam","height","whole","shucked","viscera","shell","age"]
df = pd.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data",names=columns)

y = df.age
X=df.drop(columns=['age'])
num_cols = X.select_dtypes(include=np.number).columns
cat_cols = X.select_dtypes(include=['object']).columns
#create some missing values
for i in range(1000):
    X.loc[np.random.choice(X.index),np.random.choice(X.columns)] = np.nan

x_train, x_test, y_train, y_test = train_test_split(X,y, random_state=0, test_size=0.25)

cat_vals = Pipeline([("imputer",SimpleImputer(strategy='most_frequent')), ("ohe",OneHotEncoder(drop='first'))])
num_vals = Pipeline([("imputer",SimpleImputer(strategy='mean')), ("scale",StandardScaler())])

preprocess = ColumnTransformer(
    transformers=[
        ("cat_preprocess", cat_vals, cat_cols),
        ("num_preprocess", num_vals, num_cols)
    ]
)
#Create a pipeline with preprocess and a linear regression model
pipeline = Pipeline([("preprocess",preprocess), 
                     ("regr",LinearRegression())])

#--------------------------------------
# 1. Update the `search_space` array from the narrative to add a Lasso Regression model as the third dictionary.
search_space = [{'regr': [LinearRegression()], 'regr__fit_intercept': [True,False]},
                {'regr':[Ridge()],
                     'regr__alpha': [0.01,0.1,1,10,100]},
                {'regr':[Lasso()],
                     'regr__alpha': [0.01,0.1,1,10,100]}]


# 2.  Initialize a grid search on `search_space`
gs = GridSearchCV(pipeline, search_space, scoring='neg_mean_squared_error', cv=5)

#3. Find the best pipeline, regression model and its hyperparameters

## i. Fit to training data
gs.fit(x_train, y_train)

## ii. Find the best pipeline
best_pipeline = gs.best_estimator_

## iii. Find the best regression model
best_regression_model = best_pipeline.named_steps['regr']
print('The best regression model is:')
print(best_regression_model)

## iv. Find the hyperparameters of the best regression model
best_model_hyperparameters = best_regression_model.get_params()
print('The hyperparameters of the regression model are:')
print(best_model_hyperparameters)

#4. Access the hyperparameters of the categorical preprocessing step
cat_preprocess_hyperparameters = best_pipeline.named_steps['preprocess'].named_transformers_['cat_preprocess'].named_steps['imputer'].get_params()
print('The hyperparameters of the imputer are:')
print(cat_preprocess_hyperparameters)


The best regression model is:
Lasso(alpha=0.01)
The hyperparameters of the regression model are:
{'alpha': 0.01, 'copy_X': True, 'fit_intercept': True, 'max_iter': 1000, 'positive': False, 'precompute': False, 'random_state': None, 'selection': 'cyclic', 'tol': 0.0001, 'warm_start': False}
The hyperparameters of the imputer are:
{'add_indicator': False, 'copy': True, 'fill_value': None, 'keep_empty_features': False, 'missing_values': nan, 'strategy': 'most_frequent'}
