# Grid Search

In [None]:
from numpy import poly1d
import numpy as np
from scipy import linalg, sparse, misc
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import neighbors, datasets, preprocessing
# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
# Metrics
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score
# Model
from sklearn import linear_model
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV

# With naive bayes
# Load the training corpus here so this cell runs on its own instead of
# relying on a `twenty_train` variable left behind by another cell.
twenty_train = datasets.fetch_20newsgroups(subset='train', shuffle=True)

# Pipeline: raw text -> token counts -> TF-IDF weights -> multinomial NB.
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

# Grid keys follow the `<step name>__<parameter name>` convention, so each
# entry targets one parameter of one pipeline step.
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
    'clf__alpha': (1e-2, 1e-3),
}

# n_jobs=-1 lets the search use all available processors.
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)
gs_clf = gs_clf.fit(twenty_train.data, twenty_train.target)

# Print explicitly: a bare expression is only echoed when it is the last
# statement of a notebook cell, so these values were previously dropped.
print(gs_clf.best_score_)
print(gs_clf.best_params_)

# With SVM
# NOTE: expects `twenty_train` (the 20 newsgroups training set) to be defined
# before this point.
# Same vectorizer/TF-IDF front end as the naive Bayes pipeline; the final step
# is an SGD-trained classifier with hinge loss and an L2 penalty.
text_clf_svm = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf-svm', SGDClassifier(loss='hinge',
                              penalty='l2',
                              alpha=1e-3,
                              max_iter=10,
                              random_state=42)),
])

# The step is named 'clf-svm', so its parameters are addressed as
# 'clf-svm__<parameter>' in the grid.
parameters_svm = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
    'clf-svm__alpha': (1e-2, 1e-3),
}

gs_clf_svm = GridSearchCV(text_clf_svm, parameters_svm, n_jobs=-1)
gs_clf_svm = gs_clf_svm.fit(twenty_train.data, twenty_train.target)

# Print explicitly: a bare expression is only echoed when it is the last
# statement of a notebook cell, so the score was previously dropped.
print(gs_clf_svm.best_score_)
print(gs_clf_svm.best_params_)

# LDA: Latent Dirichlet Allocation

https://towardsdatascience.com/end-to-end-topic-modeling-in-python-latent-dirichlet-allocation-lda-35ce4ed6b3e0

# Linear Regression

In [None]:
from numpy import poly1d
import numpy as np
from scipy import linalg, sparse, misc
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import neighbors, datasets, preprocessing
# Preprocessing
from sklearn.model_selection import train_test_split
# Metrics
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score
# Model
from sklearn import linear_model

# Fit a one-feature linear regression on the diabetes dataset, report its
# test-set metrics and plot the fitted line.
diabetes_X, diabetes_y = datasets.load_diabetes(return_X_y=True)

# Keep only the third feature (column index 2) as a single-column 2-D array
diabetes_X = diabetes_X[:, 2:3]

# Hold out the last 20 samples for testing; train on everything before them
diabetes_X_train, diabetes_X_test = diabetes_X[:-20], diabetes_X[-20:]
diabetes_y_train, diabetes_y_test = diabetes_y[:-20], diabetes_y[-20:]

# Create the model, fit it on the training split, predict the held-out split
regr = linear_model.LinearRegression()
regr.fit(diabetes_X_train, diabetes_y_train)
diabetes_y_pred = regr.predict(diabetes_X_test)

# Report the fitted coefficients
print('Coefficients: \n', regr.coef_)
# Report the mean squared error on the held-out samples
test_mse = mean_squared_error(diabetes_y_test, diabetes_y_pred)
print('Mean squared error: %.2f' % test_mse)
# Report the coefficient of determination: 1 is perfect prediction
test_r2 = r2_score(diabetes_y_test, diabetes_y_pred)
print('Coefficient of determination: %.2f' % test_r2)

# Held-out points in black, model predictions as a blue line
plt.scatter(diabetes_X_test, diabetes_y_test, color='black')
plt.plot(diabetes_X_test, diabetes_y_pred, color='blue', linewidth=3)

# Empty tuples remove the tick marks from both axes
plt.xticks(())
plt.yticks(())

plt.show()

# Logistic regression

In [None]:
from numpy import poly1d
import numpy as np
from scipy import linalg, sparse, misc
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import neighbors, datasets, preprocessing
# Preprocessing
from sklearn.model_selection import train_test_split
# Metrics
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score
# Model
from sklearn import linear_model

# Fit a logistic-regression classifier on two iris features and draw the
# regions it assigns to each class.
iris = datasets.load_iris()
X = iris.data[:, :2]  # first two features only (labelled sepal length/width below)
Y = iris.target

# C is the inverse of the regularization strength and must be a positive
# float; smaller values mean stronger regularization, so 1e5 is very weak.
logreg = linear_model.LogisticRegression(C=1e5)
logreg.fit(X, Y)

# Build a mesh over [x_min, x_max] x [y_min, y_max]: the range of each
# feature padded by 0.5 on both sides, sampled every h units.
h = .02  # step size in the mesh
x_min, y_min = X.min(axis=0) - .5
x_max, y_max = X.max(axis=0) + .5
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

# Classify every mesh point, then fold the predictions back to the mesh shape
Z = logreg.predict(np.column_stack((xx.ravel(), yy.ravel()))).reshape(xx.shape)

# Colour each mesh point by its predicted class
plt.figure(1, figsize=(4, 3))
plt.pcolormesh(xx, yy, Z, cmap=plt.cm.Paired)

# Overlay the training points, coloured by their true class
plt.scatter(X[:, 0], X[:, 1], c=Y, edgecolors='k', cmap=plt.cm.Paired)
plt.xlabel('Sepal length')
plt.ylabel('Sepal width')

# Clamp the axes to the mesh extent and hide the tick marks
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.xticks(())
plt.yticks(())

plt.show()

# Naive Bayes

In [None]:
from numpy import poly1d
import numpy as np
from scipy import linalg, sparse, misc
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import neighbors, datasets, preprocessing
# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
# Metrics
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score
# Model
from sklearn import linear_model
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# loading the training data
twenty_train = datasets.fetch_20newsgroups(subset='train', shuffle=True)

# Results below are printed explicitly: a bare expression is only echoed when
# it is the last statement of a notebook cell.

# prints all the categories
print(twenty_train.target_names)

# prints the first three lines of the first document
print("\n".join(twenty_train.data[0].split("\n")[:3]))

# using bag of words model
# segment each text file into words
# (for English splitting by space),
# and count # of times each word occurs in each document
# and finally assign each word an integer id.
# Each unique word in our dictionary will correspond to a feature (descriptive feature).

# create feature vectors
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)
print(X_train_counts.shape)

# re-weight the raw counts with TF-IDF
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
print(X_train_tfidf.shape)

# Model fitted directly on the precomputed TF-IDF matrix
clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)

# Pipeline: the same three steps chained into one estimator
# (this definition is replaced by the stop-word variant right below)
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

# Stop words: same pipeline, but the vectorizer drops English stop words
text_clf = Pipeline([
    ('vect', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

# With MultinomialNB(fit_prior=False), a uniform prior will be used.


# Train
text_clf = text_clf.fit(twenty_train.data, twenty_train.target)

# Test: fraction of test documents whose predicted label matches the target
twenty_test = datasets.fetch_20newsgroups(subset='test', shuffle=True)
predicted = text_clf.predict(twenty_test.data)
print(np.mean(predicted == twenty_test.target))

# Polynomial fitting

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

def train_and_predict(train_input_features, train_outputs, prediction_features,
                      *, degree=3):
    """
    Fit a scaled polynomial regression on the training data and predict new rows.

    The features are standardized, expanded with PolynomialFeatures up to
    ``degree``, and passed to a LinearRegression, all chained in one Pipeline.

    :param train_input_features: (numpy.array) A two-dimensional NumPy array where each element
                        is an array that contains two numerical features
    :param train_outputs: (numpy.array) A one-dimensional NumPy array where each element
                        is the value associated with the same row of train_input_features
    :param prediction_features: (numpy.array) A two-dimensional NumPy array where each element
                        is an array that contains two numerical features
    :param degree: (int) Keyword-only. Degree passed to PolynomialFeatures;
                        defaults to 3, the value that was previously hard-coded
    :returns: (numpy.ndarray) The predictions, one for each item in prediction_features
    """
    # The degree is given by the caller (default 3). To search for it instead,
    # wrap this pipeline in GridSearchCV with a grid over 'poly__degree'.
    pipeline = Pipeline([
        ('scale', StandardScaler()),
        ('poly', PolynomialFeatures(degree=degree)),
        ('reg', LinearRegression()),
    ])

    pipeline.fit(train_input_features, train_outputs)
    return pipeline.predict(prediction_features)

# Example case: synthetic data whose target is cubic in the first feature and
# linear in the second; fit on 70% of the rows and report the test-set MSE.
np.random.seed(1)  # fixed seed so the generated data is reproducible
data = np.random.normal(size=(200, 2))
first_feature, second_feature = data[:, 0], data[:, 1]
result = 2 * first_feature ** 3 + 4 * second_feature

X_train, X_test, y_train, y_test = train_test_split(
    data, result, test_size=0.3, random_state=0)

y_pred = train_and_predict(X_train, y_train, X_test)
if y_pred is not None:
    test_mse = metrics.mean_squared_error(y_test, y_pred)
    print(test_mse)

# SVM: Support Vector Machine

In [None]:
from numpy import poly1d
import numpy as np
from scipy import linalg, sparse, misc
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import neighbors, datasets, preprocessing
# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
# Metrics
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score
# Model
from sklearn import linear_model
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

from sklearn.linear_model import SGDClassifier

# Load both splits here so the cell does not depend on a `twenty_test`
# variable created by another cell.
twenty_train = datasets.fetch_20newsgroups(subset='train', shuffle=True)
twenty_test = datasets.fetch_20newsgroups(subset='test', shuffle=True)

# Text pipeline: token counts -> TF-IDF weights -> SGD-trained classifier
# with hinge loss and an L2 penalty.
text_clf_svm = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf-svm', SGDClassifier(loss='hinge',
                              penalty='l2',
                              alpha=1e-3,
                              max_iter=10,
                              random_state=42)),
])

_ = text_clf_svm.fit(twenty_train.data, twenty_train.target)

predicted_svm = text_clf_svm.predict(twenty_test.data)

# Fraction of test documents whose predicted label matches the target
print(np.mean(predicted_svm == twenty_test.target))

# Question
You want to create a machine learning algorithm that finds the top 100 people who have shared photographs of themselves on social media. What is the best machine learning method to use?
* [ ] K-nearest neighbor
* [ ] binary classification
* [ ] unsupervised learning
* [ ] reinforcement learning


# Question

Which pattern does the following figure of the fit line and data exhibit?
![alt](figure-1.png)

* [ ] high bias, low variance
* [ ] high bias, high variance
* [ ] low bias, low variance
* [ ] low bias, high variance


# Question

You want to create a machine learning algorithm to identify food recipes on the web. To do this, you create an algorithm that looks at different conditional probabilities. So if the post includes the word *flour*, it has a slightly stronger probability of being a recipe. If it contains both *flour* and *sugar*, it is even more likely to be a recipe. What type of algorithm are you using?
* [ ] K-nearest neighbor
* [ ] naive Bayes classifier
* [ ] multiclass classification
* [ ] decision tree


# Question
In statistics, what is the name for the probability that a hypothesis test detects an effect when there is an effect to be found?
* [ ] significance
* [ ] alpha
* [ ] power
* [ ] confidence


# Question
You create a decision tree to show whether someone decides to go to the beach. There are three factors in this decision: rainy, overcast and sunny. What are these three factors called?
* [ ] predictors
* [ ] tree nodes
* [ ] deciders
* [ ] root nodes


# Question
You are part of a data science team that is working for a national fast-food chain. You create a simple report that shows a trend: Customers who visit the store more often and buy smaller meals spend more than customers who visit less frequently and buy larger meals. What is the most likely diagram that your team created?

* [ ] multiclass classification diagram
* [ ] linear regression and scatter plots
* [ ] K-means cluster diagram
* [ ] pivot table
