In [1]:
## Intro to Machine Learning

import numpy as np

## Sci Kit Learn library (specifially a Gaussian Naive Bayes line)
from sklearn.naive_bayes import GaussianNB


In [2]:
## Setting X and Y as sample data
## X = features and Y = labels

X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
Y = np.array([1, 1, 1, 2, 2, 2])

In [3]:
## Create Classifier
clf = GaussianNB()

## Fit the Classifier
## This gives the classifier the "training" data and the classifier will learn the patterns we are evaluating
clf.fit(X, Y)

## Use the trained classifier to make predictions
pred = clf.predict([[-0.8, -1]])

GaussianNB(priors=None, var_smoothing=1e-09)

In [None]:
""" 
    This is the code to accompany the Lesson 2 (SVM) mini-project.

    Use a SVM to identify emails from the Enron corpus by their authors:    
    Sara has label 0
    Chris has label 1
"""
    
import sys
import numpy
from time import time
sys.path.append(r"C:\Users\jonat\Desktop\MachineLearningClass\ud120-projects-master\tools")
from email_preprocess import preprocess
from sklearn import svm
from sklearn.metrics import accuracy_score

### features_train and features_test are the features for the training
### and testing datasets, respectively
### labels_train and labels_test are the corresponding item labels
features_train, features_test, labels_train, labels_test = preprocess()


t0 = time()
clf = svm.SVC(kernel = 'linear')
clf.fit(features_train, labels_train)

print("training time:", round(time()-t0, 3), "s")

t0 = time()
clf.predict(features_test, labels_test)
print("training time:", round(time()-t0, 3), "s")

print(accuracy_score(features_test, labels_test))

In [4]:
## Running prediction function
## Using the patterns that were run through the fit function to predict where the data we pass in would fit in to.
print(clf.predict([[-0.8, -1]]))

[1]


In [None]:
## Printing the Accuracy
## Accuracy = no. of points classified correctly / all points in test set

accuracy = clf.score(features_test, labels_test)
return accuracy

## Alternative method to show accuracy

from sklearn.metrics import accuracy_score
print accuracy_score(pred, labels_test)

In [None]:
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
clf.fit(features_train, labels_train)
pred = clf.predict(features_test)

In [14]:
## Coding an SVM

from sklearn.svm import SVC

X = [[0, 0], [1, 1]]
Y = [0, 1]

clf = svm.SVC()
clf.fit(X, Y)

SVC(C = 1.0, cache_size=200, class_weight = None, coef0 = 0.0, degree = 3, gamma = 'auto',
kernel = 'rbf', max_iter = -1, probability = False, random_state = None, shrinking = True,
tol = 0.001, verbose = False)

clf.predict([2., 2.])



ValueError: Expected 2D array, got 1D array instead:
array=[2. 2.].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [None]:
## Decision Tree classifier

## Import statement for Decision Trees
from sklearn import tree

## Import for visual example of Decision boundary
from class_vis import prettyPicture

>>> X = [[0, 0], [1, 1]]
>>> Y = [0, 1]
>>> clf = tree.DecisionTreeClassifier()
>>> clf = clf.fit(X, Y)


## Prediction
clf.predict([[2., 2.]])

## Visualization of
prettyPicture(clf, features_test, labels_test)
output_image("test.png", "png", open("test.png", "rb").read())

In [1]:
## Practice Code

import sys
from class_vis import prettyPicture
from prep_terrain_data import makeTerrainData
from sklearn import tree
import numpy as np
import pylab as pl

features_train, labels_train, features_test, labels_test = makeTerrainData()



#################################################################################


########################## DECISION TREE #################################

clf = tree.DecisionTreeClassifier()
clf = clf.fit(features_train, labels_train)

acc = clf.score(features_test, labels_test)

    
def submitAccuracies():
  return {"acc":round(acc,3)}


ModuleNotFoundError: No module named 'class_vis'

In [3]:
# Regression Notes

# Import statement
from sklearn import linear_model

reg = linear_model.LinearRegression()
reg.fit([[0, 0], [1, 1], [2, 2]], [0, 1, 2])
reg.coef_

array([0.5, 0.5])

In [4]:
# More detailed Regression example

import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score

# Load the diabetes dataset
diabetes_X, diabetes_y = datasets.load_diabetes(return_X_y=True)

# Use only one feature
diabetes_X = diabetes_X[:, np.newaxis, 2]

# Split the data into training/testing sets
diabetes_X_train = diabetes_X[:-20]
diabetes_X_test = diabetes_X[-20:]

# Split the targets into training/testing sets
diabetes_y_train = diabetes_y[:-20]
diabetes_y_test = diabetes_y[-20:]

# Create linear regression object
regr = linear_model.LinearRegression()

# Train the model using the training sets
regr.fit(diabetes_X_train, diabetes_y_train)

# Make predictions using the testing set
diabetes_y_pred = regr.predict(diabetes_X_test)

# The coefficients
print('Coefficients: \n', regr.coef_)
# The mean squared error
print('Mean squared error: %.2f'
      % mean_squared_error(diabetes_y_test, diabetes_y_pred))
# The coefficient of determination: 1 is perfect prediction
print('Coefficient of determination: %.2f'
      % r2_score(diabetes_y_test, diabetes_y_pred))

# Plot outputs
plt.scatter(diabetes_X_test, diabetes_y_test,  color='black')
plt.plot(diabetes_X_test, diabetes_y_pred, color='blue', linewidth=3)

plt.xticks(())
plt.yticks(())

plt.show()

Coefficients: 
 [938.23786125]
Mean squared error: 2548.07
Coefficient of determination: 0.47


<Figure size 640x480 with 1 Axes>

In [2]:
# Feature Scaling
# Min/Max Scaler for sklearn

from sklearn.preprocessing import MinMaxScaler
import numpy

# this is expecting floats instead of ints so I placed decimal points
weights = numpy.array([[115.], [140.], [175.]])

scaler = MinMaxScaler()

# calls fit transform - does 2 things
# 1.) Finds min(x), max(x), etc
# 2.) Transform applies the formula to all the elements in the set of data
rescaled_weight = scaler.fit_transform(weights)
rescaled_weight

array([[0.        ],
       [0.41666667],
       [1.        ]])

In [4]:
# Text Learning
# Bag of Words

from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()

string1 = "hi Katie the self driving care will be late Best Sebastian"
string2 = "hi Sebastian the machine learning class will be great great great Best Katie"
string3 = "Hi Katie the machine learning class will be most excellent"

# for this example we put the strings into a list
email_list = [string1, string2, string3]

# Fit and transform the data
bag_of_words = vectorizer.fit(email_list)
bag_of_words = vectorizer.transform(email_list)

'''print(bag_of_words)'''
# prints the feature number from the bag of words
print(vectorizer.vocabulary_.get("great"))

6


In [3]:
# List of stopwords from NLTK
# NLTK - National Language Tool Kit

#from nltk.corpus import stopwords
import nltk
sw = stopwords.words("english")
nltk.download()

NameError: name 'stopwords' is not defined

In [None]:
# Principle Component Analysis
# PCA

def doPCA():
    from sklearn.decomposition import PCA
    pca = PCA(n_components = 2)
    pca.fit(data)
    return pca

pca = doPCA()
print(pca.explained_variance_ratio_)
first_pc = pca.components_[0]
second_pc = pca.components_[1]
