In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('seaborn')

## 1: Face Recognition, but not evil this time

Using the faces dataset in:

```
from sklearn.datasets import fetch_lfw_people
faces = fetch_lfw_people(min_faces_per_person=60)
```

If you use the `faces.target` and `faces.target_names` attributes, you can build a facial recognition algorithm.

Use sklearn **gridsearch** (or an equivalent, like random search) to optimize the model for accuracy. Try both a SVM-based classifier and a logistic regression based classifier (with a feature pipeline of your choice) to get the best model. You should have at least 80% accuracy.

In [2]:
from sklearn.datasets import fetch_lfw_people
faces = fetch_lfw_people(min_faces_per_person=60)
x = faces.data
y = faces.target

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn import preprocessing

In [5]:
# REFRESH = False
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state=0)

if REFRESH:
    
    pipe = Pipeline([
    ('reg', LogisticRegression())
    ])

    param_grid = [
    {'reg__penalty': ['l1', 'l2', 'elasticnet', 'none'],
    'reg__fit_intercept': [True, False],
    'reg__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']}]

    grid = GridSearchCV(pipe, param_grid, cv=7)
    grid.fit(X_train, y_train)
print(grid.best_params_)

logreg = grid.best_estimator_
model_log = logreg.fit(X_train, y_train)
trial_run = logreg.predict(X_test)

print(confusion_matrix(trial_run, y_test))
print(accuracy_score(trial_run, y_test))

#This takes forever to run, but at the time I ran it it retruned an accuracy score of 0.817283950617284
# THe best parameters were {'reg__fit_intercept': True, 'reg__penalty': 'l2', 'reg__solver': 'newton-cg'}

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
Traceback (most recent call last):
  File "C:\Users\David\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\David\anaconda3\lib\site-packages\sklearn\pipeline.py", line 335, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "C:\Users\David\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1304, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\David\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 442, in

In [26]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state=0)
# REFRESH = False
if REFRESH:
    pipe = Pipeline([
    ('scaler', preprocessing.StandardScaler()),
    ('reg', SVC())
    ])

    param_grid = [
    {'reg__kernel': ['linear', 'poly', 'rbf'],
    'reg__probability': [True, False]
    }]

    grid = GridSearchCV(pipe, param_grid, cv=7)
    grid.fit(X_train, y_train)
print(grid.best_params_)


svc = grid.best_estimator_
model_svc = svc.fit(X_train, y_train)
trial_run = svc.predict(X_test)

print(confusion_matrix(trial_run, y_test))
print(accuracy_score(trial_run, y_test))

#This also takes forever to run, but at the time I ran it it retruned an accuracy score of 0.8049382716049382
# THe best parameters were {'reg__kernel': 'linear', 'reg__probability': True}

{'reg__kernel': 'linear', 'reg__probability': True}
[[ 13   5   0   0   1   0   0   3]
 [  2  57   1   5   0   0   0   0]
 [  3   1  34   8   1   0   0   1]
 [  0   8   3 139   3   2   4   2]
 [  2   0   1   3  24   6   0   2]
 [  0   1   0   1   0  11   0   0]
 [  0   0   0   0   2   0  16   0]
 [  0   2   0   5   1   0   0  32]]
0.8049382716049382


# 2: Bag of Words, Bag of Popcorn

By this point, you are ready for the [Bag of Words, Bag of Popcorn](https://www.kaggle.com/c/word2vec-nlp-tutorial/data) competition. 

Use NLP feature pre-processing (using, SKLearn, Gensim, Spacy or Hugginface) to build the best classifier you can. Use a  feature pipeline, and gridsearch for your final model.

A succesful project should get 90% or more on a **holdout** dataset you kept for yourself.

In [34]:
#Let's build a few methods to clean the data we have
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import Normalizer
from bs4 import BeautifulSoup
import string
import re

def clean_me(text):
    text = BeautifulSoup(text).get_text()
    text = re.sub("[^a-zA-Z]", " ", text)  
    stop_words = stopwords.words('english')
    stop_words += list(string.punctuation)
    token = word_tokenize
    tokens = list(token(text))
    stopwords_removed = [token.lower() for token in tokens if not token.lower() in stop_words]

    lemmatizer = WordNetLemmatizer()

    return " ".join([lemmatizer.lemmatize(w) for w in stopwords_removed])

In [35]:
#I downloaded the data and saved it. It can now be loaded with this method.
df = pd.read_csv('Kaggle_Data')
#Let's clean it right away
df.review = df.review.apply(clean_me)

In [36]:
X_train, X_test, y_train, y_test = train_test_split(df.review, df.sentiment, test_size = 0.1, random_state=0)

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MaxAbsScaler

In [49]:
pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('norma', MaxAbsScaler()),
    ('reg', LogisticRegression())
])

param_grid = [{
    'tfidf__analyzer' : ['word', 'char'],
    'reg__class_weight': ['balanced', None],
    'reg__fit_intercept': [True, False]
    }]

grid = GridSearchCV(pipe, param_grid, cv=7)
grid.fit(X_train, y_train)

pred = grid.predict(X_test)

print(grid.best_params_)
print(confusion_matrix(pred, y_test))
print(accuracy_score(pred, y_test))

#Once again, takes a minute to run. WHen I did it, it came back with an 0.898 accuracy score.

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt