In [None]:
# Part 1 - Regression

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split


# Seed NumPy's global RNG so the noisy sample below is identical on every run.
np.random.seed(0)
# Number of sample points.
n = 15
# n evenly spaced points on [0, 10], each jittered with Gaussian noise scaled by 1/5.
x = np.linspace(0,10,n) + np.random.randn(n)/5
# Target: sin(x) + x/6 plus Gaussian noise scaled by 1/10.
y = np.sin(x)+x/6 + np.random.randn(n)/10


# Split with train_test_split's default proportions; random_state=0 fixes the shuffle.
# x is 1-D, so the answer_* functions reshape X_train / X_test to a column before use.
X_train, X_test, y_train, y_test = train_test_split(x, y, random_state=0)

In [None]:
# Question 1
# Write a function that fits a polynomial LinearRegression model on the training data X_train for degrees 1, 3, 6, and 9.
# (Use PolynomialFeatures in sklearn.preprocessing to create the polynomial features and then fit a linear regression model) 
# For each model, find 100 predicted values over the interval x = 0 to 10 (e.g. np.linspace(0,10,100)) and store this in 
# a numpy array. The first row of this array should correspond to the output from the model trained on degree 1, the second 
# row degree 3, the third row degree 6, and the fourth row degree 9.



In [None]:
def answer_one():
    """Fit polynomial regressions of degree 1, 3, 6 and 9 and predict on a grid.

    Each model is trained on the module-level ``X_train`` / ``y_train`` and then
    evaluated at 100 evenly spaced points on the interval [0, 10].

    Returns:
        numpy.ndarray of shape (4, 100). Row ``i`` holds the predictions of the
        model whose degree is ``(1, 3, 6, 9)[i]``.
    """
    from sklearn.linear_model import LinearRegression
    from sklearn.preprocessing import PolynomialFeatures

    degrees = (1, 3, 6, 9)

    # Loop-invariant inputs: build them once rather than once per degree.
    # scikit-learn expects 2-D (n_samples, n_features) input, hence the reshape.
    train_column = X_train.reshape(-1, 1)
    grid_column = np.linspace(0, 10, 100).reshape(-1, 1)

    predicted_values = np.zeros((len(degrees), 100))

    for row, degree in enumerate(degrees):
        poly = PolynomialFeatures(degree=degree)
        model = LinearRegression().fit(poly.fit_transform(train_column), y_train)

        # Only transform the grid: the expansion was already fitted on the
        # training data and must not be re-fitted on the points being predicted.
        predicted_values[row, :] = model.predict(poly.transform(grid_column))

    return predicted_values

print(answer_one())

In [None]:
# Question 2
# Write a function that fits a polynomial LinearRegression model on the training data X_train for degrees 0 through 9. 
# For each model compute the $R^2$ (coefficient of determination) regression score on the training data as well as the
# test data, and return both of these arrays in a tuple.

# This function should return one tuple of numpy arrays (r2_train, r2_test). Both arrays should have shape (10,)

In [None]:
def answer_two():
    """Score polynomial regressions of degree 0 through 9 on train and test data.

    For every degree a ``LinearRegression`` is fitted on polynomial features of
    the module-level ``X_train`` / ``y_train``; its R^2 is then computed on the
    training data and on ``X_test`` / ``y_test``.

    Returns:
        tuple ``(r2_train, r2_test)`` of numpy arrays, each of shape (10,),
        where index ``d`` holds the score of the degree-``d`` model.
    """
    # No import from sklearn.metrics is needed: LinearRegression.score already
    # returns R^2, and the private module sklearn.metrics.regression that used
    # to be imported here is not importable in current scikit-learn.
    from sklearn.linear_model import LinearRegression
    from sklearn.preprocessing import PolynomialFeatures

    n_degrees = 10
    r2_train = np.zeros(n_degrees)
    r2_test = np.zeros(n_degrees)

    # scikit-learn expects 2-D (n_samples, n_features) input.
    train_column = X_train.reshape(-1, 1)
    test_column = X_test.reshape(-1, 1)

    for degree in range(n_degrees):
        poly = PolynomialFeatures(degree=degree)

        train_features = poly.fit_transform(train_column)
        # Reuse the expansion fitted on the training data; never fit on test data.
        test_features = poly.transform(test_column)

        model = LinearRegression().fit(train_features, y_train)

        r2_train[degree] = model.score(train_features, y_train)
        r2_test[degree] = model.score(test_features, y_test)

    return (r2_train, r2_test)

print(answer_two())

In [None]:
# Question 3
# Based on the $R^2$ scores from question 2 (degree levels 0 through 9), what degree level corresponds to a model that is
# underfitting? What degree level corresponds to a model that is overfitting? What choice of degree level would provide a 
# model with good generalization performance on this dataset?

# Hint: Try plotting the $R^2$ scores from question 2 to visualize the relationship between degree level and $R^2$. Remember
# to comment out the import matplotlib line before submission.

# This function should return one tuple with the degree values in this order: (Underfitting, Overfitting, Good_Generalization). 
# There might be multiple correct solutions, however, you only need to return one possible solution, for example, (1,2,3).

In [None]:
def answer_three():
    """Pick underfitting, overfitting and well-generalizing polynomial degrees.

    Uses the per-degree R^2 scores returned by ``answer_two()`` (index == degree):

    * Underfitting: the degree with the lowest training score.
    * Overfitting: the degree with the largest train-minus-test gap.
    * Good generalization: the degree with the highest test score.

    Returns:
        tuple ``(Underfitting, Overfitting, Good_Generalization)`` of ints.
    """
    r2_train, r2_test = answer_two()
    r2_train = np.asarray(r2_train)
    r2_test = np.asarray(r2_test)

    # argmin / argmax return the first extreme position, so ties resolve
    # deterministically towards the lower degree.
    Underfitting = int(np.argmin(r2_train))

    Overfitting = int(np.argmax(r2_train - r2_test))

    # The highest test score is used instead of the smallest train/test gap:
    # a small gap alone would also reward a model that is equally poor on
    # both splits.
    Good_Generalization = int(np.argmax(r2_test))

    return (Underfitting, Overfitting, Good_Generalization)

print(answer_three())


In [None]:
# Question 4
# Training models on high degree polynomial features can result in overly complex models that overfit, so we often use 
# regularized versions of the model to constrain model complexity, as we saw with Ridge and Lasso linear regression.

# For this question, train two models: a non-regularized LinearRegression model (default parameters) and a regularized 
# Lasso Regression model (with parameters alpha=0.01, max_iter=10000) both on polynomial features of degree 12. Return the  
# $R^2$ score for both the LinearRegression and Lasso model's test sets.

# This function should return one tuple (LinearRegression_R2_test_score, Lasso_R2_test_score)

In [None]:
def answer_four():
    """Compare plain and Lasso-regularized regression on degree-12 features.

    Both models are trained on degree-12 polynomial features of the
    module-level ``X_train`` / ``y_train`` and scored (R^2) on
    ``X_test`` / ``y_test``.

    Returns:
        tuple ``(LinearRegression_R2_test_score, Lasso_R2_test_score)``.
    """
    # No import from sklearn.metrics is needed: .score() already returns R^2,
    # and the private module sklearn.metrics.regression that used to be
    # imported here is not importable in current scikit-learn.
    from sklearn.preprocessing import PolynomialFeatures
    from sklearn.linear_model import Lasso, LinearRegression

    poly = PolynomialFeatures(degree=12)

    # scikit-learn expects 2-D (n_samples, n_features) input, hence the reshape.
    train_features = poly.fit_transform(X_train.reshape(-1, 1))
    # Reuse the expansion fitted on the training data; never fit on test data.
    test_features = poly.transform(X_test.reshape(-1, 1))

    linear_model = LinearRegression().fit(train_features, y_train)
    LinearRegression_R2_test_score = linear_model.score(test_features, y_test)

    lasso_model = Lasso(alpha=0.01, max_iter=10000).fit(train_features, y_train)
    Lasso_R2_test_score = lasso_model.score(test_features, y_test)

    return (LinearRegression_R2_test_score, Lasso_R2_test_score)

print(answer_four())

In [None]:
#Part 2 - Classification

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Load the mushroom data; the path is relative to the notebook's working directory.
mush_df = pd.read_csv('mushrooms.csv')

# One-hot encode the frame with pandas' default get_dummies behaviour.
mush_df2 = pd.get_dummies(mush_df)

# Features are every dummy column from position 2 onward; the label is column 1.
# NOTE(review): presumably columns 0 and 1 are the two indicator columns of the
# class label, which makes column 1 a binary target -- confirm against mushrooms.csv.
X_mush = mush_df2.iloc[:,2:]
y_mush = mush_df2.iloc[:,1]

# use the variables X_train2, y_train2 for Question 5
X_train2, X_test2, y_train2, y_test2 = train_test_split(X_mush, y_mush, random_state=0)

# For performance reasons in Questions 6 and 7, we will create a smaller version of the
# entire mushroom dataset for use in those questions.  For simplicity we'll just re-use
# the 25% test split created above as the representative subset.
#
# Use the variables X_subset, y_subset for Questions 6 and 7.
# These are plain aliases of the test split (no copy is made).
X_subset = X_test2
y_subset = y_test2


In [None]:
# Question 5
# Using X_train2 and y_train2 from the preceding cell, train a DecisionTreeClassifier with default parameters and
# random_state=0. What are the 5 most important features found by the decision tree?

# As a reminder, the feature names are available in the X_train2.columns property, and the order of the features 
# in X_train2.columns matches the order of the feature importance values in the classifier's feature_importances_ property.

# This function should return a list of length 5 containing the feature names in descending order of importance.

# Note: remember that you also need to set random_state in the DecisionTreeClassifier.

In [None]:
def answer_five():
    """Return the five most important features found by a decision tree.

    A ``DecisionTreeClassifier`` (default parameters, ``random_state=0``) is
    trained on the module-level ``X_train2`` / ``y_train2``.

    Returns:
        list of 5 feature names (entries of ``X_train2.columns``) in
        descending order of importance.
    """
    from sklearn.tree import DecisionTreeClassifier

    clf = DecisionTreeClassifier(random_state=0).fit(X_train2, y_train2)

    # Key the importances by feature name so the names are what gets returned.
    # Picking "column 0" of a two-column frame by position depends on the
    # column order and can hand back the importance values instead.
    importances = pd.Series(clf.feature_importances_, index=X_train2.columns)

    # nlargest sorts descending and keeps first-occurrence order on ties.
    return importances.nlargest(5).index.tolist()

print(answer_five())


In [None]:
# Question 6
# For this question, we're going to use the validation_curve function in sklearn.model_selection to determine training and 
# test scores for a Support Vector Classifier (SVC) with varying parameter values. Recall that the validation_curve function, 
# in addition to taking an initialized unfitted classifier object, takes a dataset as input and does its own internal 
# train-test splits to compute results.

# Because creating a validation curve requires fitting multiple models, for performance reasons this question will use just 
# a subset of the original mushroom dataset: please use the variables X_subset and y_subset as input to the validation curve 
# function (instead of X_mush and y_mush) to reduce computation time.

# The initialized unfitted classifier object we'll be using is a Support Vector Classifier with radial basis kernel. 
# So your first step is to create an SVC object with default parameters (i.e. kernel='rbf', C=1) and random_state=0. 
# Recall that the kernel width of the RBF kernel is controlled using the gamma parameter.

# With this classifier, and the dataset in X_subset, y_subset, explore the effect of gamma on classifier accuracy by using
# the validation_curve function to find the training and test scores for 6 values of gamma from 0.0001 to 10 
# (i.e. np.logspace(-4,1,6)). Recall that you can specify what scoring metric you want validation_curve to use by setting 
# the "scoring" parameter. In this case, we want to use "accuracy" as the scoring metric.

# For each level of gamma, validation_curve will fit 3 models on different subsets of the data, returning two 6x3 
# (6 levels of gamma x 3 fits per level) arrays of the scores for the training and test sets.

#Find the mean score across the three models for each level of gamma for both arrays, creating two arrays of length 6, 
# and return a tuple with the two arrays.

In [None]:
def answer_six():
    """Compute mean train/test accuracy of an RBF SVC over six gamma values.

    Runs ``validation_curve`` with 3-fold cross-validation on the module-level
    ``X_subset`` / ``y_subset`` for ``gamma`` in ``np.logspace(-4, 1, 6)``.

    Returns:
        tuple ``(train_scores_mean, test_scores_mean)`` of numpy arrays of
        length 6, each the per-gamma mean over the 3 folds.
    """
    from sklearn.svm import SVC
    from sklearn.model_selection import validation_curve

    param_range = np.logspace(-4, 1, 6)

    # Spell out the classifier and metric the task specifies instead of
    # relying on library defaults.
    classifier = SVC(kernel='rbf', C=1, random_state=0)

    train_scores, test_scores = validation_curve(
        classifier, X_subset, y_subset,
        param_name='gamma',
        param_range=param_range,
        cv=3,
        scoring='accuracy',
    )

    # Scores are laid out as (n_gammas, n_folds); average across the folds.
    train_scores_mean = np.mean(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)

    return (train_scores_mean, test_scores_mean)
    
print(answer_six())

In [None]:
# Question 7
# Based on the scores from question 6, what gamma value corresponds to a model that is underfitting (and has the worst test 
# set accuracy)? What gamma value corresponds to a model that is overfitting (and has the worst test set accuracy)? 
# What choice of gamma would be the best choice for a model with good generalization performance on this dataset 
# (high accuracy on both training and test set)?

# Hint: Try plotting the scores from question 6 to visualize the relationship between gamma and accuracy. Remember to comment 
# out the import matplotlib line before submission.

# This function should return one tuple with the gamma values in this order: (Underfitting, Overfitting, Good_Generalization)
# Please note there is only one correct solution.

In [None]:
def answer_seven():
    """Pick underfitting, overfitting and well-generalizing gamma values.

    Uses the mean accuracies returned by ``answer_six()``:

    * Underfitting: the gamma with the lowest training accuracy.
    * Overfitting: the gamma with the largest train-minus-test gap.
    * Good generalization: the gamma with the highest test accuracy.

    Returns:
        tuple ``(Underfitting, Overfitting, Good_Generalization)`` of floats.
    """
    # Run the (expensive) validation curve once and unpack both arrays.
    train_scores_mean, test_scores_mean = answer_six()
    train_scores_mean = np.asarray(train_scores_mean)
    test_scores_mean = np.asarray(test_scores_mean)

    # Must be the same grid, in the same order, that answer_six passes as param_range.
    gamma_values = np.logspace(-4, 1, 6)

    # Index the gamma grid with argmin / argmax directly. Sorting a DataFrame
    # without keeping the sorted result leaves the row order untouched, so
    # picking the first / last row afterwards ignores the scores entirely.
    Underfitting = float(gamma_values[np.argmin(train_scores_mean)])

    Overfitting = float(gamma_values[np.argmax(train_scores_mean - test_scores_mean)])

    Good_Generalization = float(gamma_values[np.argmax(test_scores_mean)])

    return (Underfitting, Overfitting, Good_Generalization)

print(answer_seven())