# 3.3 Machine Learning section.

>We will proceed like we did with labels `relevance` but with `positivity`.

>We only have 1,420 datapoints, so we will drop the missing values (inputting to the mean introduces bias, as we described before).

>`positivity` ranges from `2` to `9`, and this time we will use `sklearn.multiclass.OneVsRestClassifier`.

> This strategy consists in fitting one classifier per class. For each classifier, the class is fitted against all the other classes.

In [1]:
reset -fs

In [2]:
import matplotlib.pyplot as plt
%matplotlib inline
import string
import re
import numpy as np
import pandas as pd
import seaborn as sns
import nltk
import nlp_ml_functions
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, RandomForestClassifier
from sklearn.preprocessing import label_binarize, MultiLabelBinarizer, binarize, FunctionTransformer
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score, roc_curve, mean_squared_error, r2_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
sns.set_style('white')

## Using `positivity` as labels.

### Loading dataset into a pandas dataframe.

In [3]:
economic_df = pd.read_csv('Full-Economic-News-DFE-839861.csv', encoding='utf-8')
new_column_names = ['unit_id', 'golden', 'unit_state', 'trusted_judgments', 'last_judgment_at','positivity', 'positivity_confidence', 'relevance', 'relevance_confidence', 'article_id', 'article_date', 'article_headline', 'positivity_gold', 'relevance_gold', 'article_text']
economic_df.columns = new_column_names
economic_df = economic_df[np.isfinite(economic_df['positivity'])]

In [4]:
economic_df.positivity.value_counts()

3.0    343
7.0    295
4.0    255
6.0    214
5.0    205
8.0     71
2.0     35
9.0      2
Name: positivity, dtype: int64

#### Check if the labels are balanced.

In [5]:
economic_df.positivity.value_counts()*100/len(economic_df.positivity)

3.0    24.154930
7.0    20.774648
4.0    17.957746
6.0    15.070423
5.0    14.436620
8.0     5.000000
2.0     2.464789
9.0     0.140845
Name: positivity, dtype: float64

#### Converting variables `unit_state` and `golden` to numerical values. Dropping columns `positivity_gold` and `relevance_gold` are empty, we can drop them.

In [6]:
economic_df['unit_state'] = economic_df['unit_state'].apply(lambda x: 1 if x == 'finalized' else 0)
economic_df['golden'] = economic_df['golden'].apply(lambda x: 0 if x == False else 1)
del economic_df['positivity_gold']
del economic_df['relevance_gold']

### Clean up.

In [7]:
economic_df['article_text'] = economic_df['article_text'].apply(nlp_ml_functions.clean_up_article)

## 2- Creating models.

In [8]:
X_article = economic_df['article_text']
y_article = economic_df['positivity']

In [9]:
X_train_article, X_test_article, y_train_article, y_test_article = train_test_split(X_article, y_article, test_size=0.3, random_state=42)

### Logistic Regression.

In [10]:
pipeline = Pipeline([('bow', CountVectorizer(analyzer=nlp_ml_functions.process_dataframe_text)),('classifier', LogisticRegression())])

In [11]:
OneVsRestClassifier(pipeline.fit(X_train_article, y_train_article))

OneVsRestClassifier(estimator=Pipeline(steps=[('bow', CountVectorizer(analyzer=<function process_dataframe_text at 0x11462e9d8>,
        binary=False, decode_error='strict', dtype=<class 'numpy.int64'>,
        encoding='utf-8', input='content', lowercase=True, max_df=1.0,
        max_features=None, min_df=1, ngram_range=(1, 1), preproce...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
          n_jobs=1)

In [12]:
predictions = pipeline.predict(X_test_article)

#### Model evaluation.

In [13]:
print("Mean Accuracy:\n {:.2%}".format(pipeline.score(X_test_article, y_test_article)))

Mean Accuracy:
 24.41%


In [14]:
print("Confusion matrix:\n\n", confusion_matrix(y_test_article, predictions))

Confusion matrix:

 [[ 0  3  1  1  3  1  0  0]
 [ 0 41 15  7 11 22  2  0]
 [ 0 24 18 11 10 12  2  0]
 [ 0 31 10  9 10 14  1  0]
 [ 0 18 10  7  5 16  0  0]
 [ 0 18 16  8 15 31  0  0]
 [ 0  6  4  4  2  5  0  0]
 [ 0  0  0  0  0  2  0  0]]


In [15]:
print("Classification report:\n\n", classification_report(y_test_article, predictions))

Classification report:

              precision    recall  f1-score   support

        2.0       0.00      0.00      0.00         9
        3.0       0.29      0.42      0.34        98
        4.0       0.24      0.23      0.24        77
        5.0       0.19      0.12      0.15        75
        6.0       0.09      0.09      0.09        56
        7.0       0.30      0.35      0.32        88
        8.0       0.00      0.00      0.00        21
        9.0       0.00      0.00      0.00         2

avg / total       0.22      0.24      0.23       426



  'precision', 'predicted', average, warn_for)


### k-nearest neighbors classifier.

In [16]:
pipeline = Pipeline([('bow', CountVectorizer(analyzer=nlp_ml_functions.process_dataframe_text)),('classifier', KNeighborsClassifier())])

In [17]:
OneVsRestClassifier(pipeline.fit(X_train_article, y_train_article))

OneVsRestClassifier(estimator=Pipeline(steps=[('bow', CountVectorizer(analyzer=<function process_dataframe_text at 0x11462e9d8>,
        binary=False, decode_error='strict', dtype=<class 'numpy.int64'>,
        encoding='utf-8', input='content', lowercase=True, max_df=1.0,
        max_features=None, min_df=1, ngram_range=(1, 1), preproce...owski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'))]),
          n_jobs=1)

In [18]:
predictions = pipeline.predict(X_test_article)

#### Model evaluation.

In [19]:
print("Mean Accuracy:\n {:.2%}".format(pipeline.score(X_test_article, y_test_article)))

Mean Accuracy:
 16.20%


In [20]:
print("Confusion matrix:\n\n", confusion_matrix(y_test_article, predictions))

Confusion matrix:

 [[ 0  3  0  2  1  2  1  0]
 [ 0 26  5 37  7 23  0  0]
 [ 1 29  3 13  6 25  0  0]
 [ 1 27  4 20  5 17  1  0]
 [ 0 10  1 21  1 23  0  0]
 [ 1 27  7 27  5 19  2  0]
 [ 0  5  0  8  1  7  0  0]
 [ 0  0  1  0  0  1  0  0]]


In [21]:
print("Classification report:\n\n", classification_report(y_test_article, predictions))

Classification report:

              precision    recall  f1-score   support

        2.0       0.00      0.00      0.00         9
        3.0       0.20      0.27      0.23        98
        4.0       0.14      0.04      0.06        77
        5.0       0.16      0.27      0.20        75
        6.0       0.04      0.02      0.02        56
        7.0       0.16      0.22      0.19        88
        8.0       0.00      0.00      0.00        21
        9.0       0.00      0.00      0.00         2

avg / total       0.14      0.16      0.14       426



  'precision', 'predicted', average, warn_for)


### Naive-Bayes Multinomial.

In [22]:
pipeline = Pipeline([('bow', CountVectorizer(analyzer=nlp_ml_functions.process_dataframe_text)),('classifier', MultinomialNB())])

In [23]:
OneVsRestClassifier(pipeline.fit(X_train_article, y_train_article))

OneVsRestClassifier(estimator=Pipeline(steps=[('bow', CountVectorizer(analyzer=<function process_dataframe_text at 0x11462e9d8>,
        binary=False, decode_error='strict', dtype=<class 'numpy.int64'>,
        encoding='utf-8', input='content', lowercase=True, max_df=1.0,
        max_features=None, min_df=1, ngram_range=(1, 1), preproce...None, vocabulary=None)), ('classifier', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
          n_jobs=1)

In [24]:
predictions = pipeline.predict(X_test_article)

#### Model evaluation.

In [25]:
print("Mean Accuracy:\n {:.2%}".format(pipeline.score(X_test_article, y_test_article)))

Mean Accuracy:
 25.35%


In [26]:
print("Confusion matrix:\n\n", confusion_matrix(y_test_article, predictions))

Confusion matrix:

 [[ 0  8  0  0  1  0  0  0]
 [ 0 71  9  0  4 14  0  0]
 [ 0 59  8  0  2  8  0  0]
 [ 0 47  9  0  4 15  0  0]
 [ 0 39  5  0  2 10  0  0]
 [ 0 46  5  1  9 27  0  0]
 [ 0 12  2  0  1  6  0  0]
 [ 0  2  0  0  0  0  0  0]]


In [27]:
print("Classification report:\n\n", classification_report(y_test_article, predictions))

Classification report:

              precision    recall  f1-score   support

        2.0       0.00      0.00      0.00         9
        3.0       0.25      0.72      0.37        98
        4.0       0.21      0.10      0.14        77
        5.0       0.00      0.00      0.00        75
        6.0       0.09      0.04      0.05        56
        7.0       0.34      0.31      0.32        88
        8.0       0.00      0.00      0.00        21
        9.0       0.00      0.00      0.00         2

avg / total       0.18      0.25      0.18       426



  'precision', 'predicted', average, warn_for)


### Naive-Bayes Bernoulli.

In [28]:
pipeline = Pipeline([('bow', CountVectorizer(analyzer=nlp_ml_functions.process_dataframe_text)),('classifier', BernoulliNB())])

In [29]:
OneVsRestClassifier(pipeline.fit(X_train_article, y_train_article))

OneVsRestClassifier(estimator=Pipeline(steps=[('bow', CountVectorizer(analyzer=<function process_dataframe_text at 0x11462e9d8>,
        binary=False, decode_error='strict', dtype=<class 'numpy.int64'>,
        encoding='utf-8', input='content', lowercase=True, max_df=1.0,
        max_features=None, min_df=1, ngram_range=(1, 1), preproce...lary=None)), ('classifier', BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True))]),
          n_jobs=1)

In [30]:
predictions = pipeline.predict(X_test_article)

#### Model evaluation.

In [31]:
print("Mean Accuracy:\n {:.2%}".format(pipeline.score(X_test_article, y_test_article)))

Mean Accuracy:
 24.41%


In [32]:
print("Confusion matrix:\n\n", confusion_matrix(y_test_article, predictions))

Confusion matrix:

 [[ 0  9  0  0  0  0  0  0]
 [ 0 91  0  0  0  7  0  0]
 [ 0 70  1  0  0  6  0  0]
 [ 0 66  0  0  0  9  0  0]
 [ 0 48  0  0  0  8  0  0]
 [ 0 75  0  0  1 12  0  0]
 [ 0 18  0  0  0  3  0  0]
 [ 0  2  0  0  0  0  0  0]]


In [33]:
print("Classification report:\n\n", classification_report(y_test_article, predictions))

Classification report:

              precision    recall  f1-score   support

        2.0       0.00      0.00      0.00         9
        3.0       0.24      0.93      0.38        98
        4.0       1.00      0.01      0.03        77
        5.0       0.00      0.00      0.00        75
        6.0       0.00      0.00      0.00        56
        7.0       0.27      0.14      0.18        88
        8.0       0.00      0.00      0.00        21
        9.0       0.00      0.00      0.00         2

avg / total       0.29      0.24      0.13       426



  'precision', 'predicted', average, warn_for)


### RandomForestClassifier.

In [34]:
pipeline = Pipeline([('bow', CountVectorizer(analyzer=nlp_ml_functions.process_dataframe_text)),('classifier', RandomForestClassifier())])

In [35]:
OneVsRestClassifier(pipeline.fit(X_train_article, y_train_article))

OneVsRestClassifier(estimator=Pipeline(steps=[('bow', CountVectorizer(analyzer=<function process_dataframe_text at 0x11462e9d8>,
        binary=False, decode_error='strict', dtype=<class 'numpy.int64'>,
        encoding='utf-8', input='content', lowercase=True, max_df=1.0,
        max_features=None, min_df=1, ngram_range=(1, 1), preproce...imators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False))]),
          n_jobs=1)

In [36]:
predictions = pipeline.predict(X_test_article)

#### Model evaluation.

In [37]:
print("Mean Accuracy:\n {:.2%}".format(pipeline.score(X_test_article, y_test_article)))

Mean Accuracy:
 22.07%


In [38]:
print("Confusion matrix:\n\n", confusion_matrix(y_test_article, predictions))

Confusion matrix:

 [[ 0  4  1  1  2  1  0  0]
 [ 0 58 13  5  4 17  1  0]
 [ 0 49  6  7  4 11  0  0]
 [ 0 36  8  2 10 19  0  0]
 [ 0 25 10  1  8 12  0  0]
 [ 0 39 16  6  7 20  0  0]
 [ 0  6  2  1  1 11  0  0]
 [ 0  1  0  0  1  0  0  0]]


In [39]:
print("Classification report:\n\n", classification_report(y_test_article, predictions))

Classification report:

              precision    recall  f1-score   support

        2.0       0.00      0.00      0.00         9
        3.0       0.27      0.59      0.37        98
        4.0       0.11      0.08      0.09        77
        5.0       0.09      0.03      0.04        75
        6.0       0.22      0.14      0.17        56
        7.0       0.22      0.23      0.22        88
        8.0       0.00      0.00      0.00        21
        9.0       0.00      0.00      0.00         2

avg / total       0.17      0.22      0.18       426



  'precision', 'predicted', average, warn_for)


### AdaBoostClassifier.

In [40]:
pipeline = Pipeline([('bow', CountVectorizer(analyzer=nlp_ml_functions.process_dataframe_text)),('classifier', AdaBoostClassifier())])

In [41]:
OneVsRestClassifier(pipeline.fit(X_train_article, y_train_article))

OneVsRestClassifier(estimator=Pipeline(steps=[('bow', CountVectorizer(analyzer=<function process_dataframe_text at 0x11462e9d8>,
        binary=False, decode_error='strict', dtype=<class 'numpy.int64'>,
        encoding='utf-8', input='content', lowercase=True, max_df=1.0,
        max_features=None, min_df=1, ngram_range=(1, 1), preproce...m='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None))]),
          n_jobs=1)

In [42]:
predictions = pipeline.predict(X_test_article)

#### Model evaluation.

In [43]:
print("Mean Accuracy:\n {:.2%}".format(pipeline.score(X_test_article, y_test_article)))

Mean Accuracy:
 21.83%


In [44]:
print("Confusion matrix:\n\n", confusion_matrix(y_test_article, predictions))

Confusion matrix:

 [[ 0  5  0  1  0  1  2  0]
 [ 0 57  5  3  9 22  2  0]
 [ 0 52  5  2  2 15  1  0]
 [ 1 47  6  5  4 11  1  0]
 [ 0 28  3  1  4 17  3  0]
 [ 0 47  8  4  6 22  1  0]
 [ 0 14  2  2  0  3  0  0]
 [ 0  1  1  0  0  0  0  0]]


In [45]:
print("Classification report:\n\n", classification_report(y_test_article, predictions))

Classification report:

              precision    recall  f1-score   support

        2.0       0.00      0.00      0.00         9
        3.0       0.23      0.58      0.33        98
        4.0       0.17      0.06      0.09        77
        5.0       0.28      0.07      0.11        75
        6.0       0.16      0.07      0.10        56
        7.0       0.24      0.25      0.25        88
        8.0       0.00      0.00      0.00        21
        9.0       0.00      0.00      0.00         2

avg / total       0.20      0.22      0.17       426



  'precision', 'predicted', average, warn_for)


### GradientBoostingClassifier.

TypeError: A sparse matrix was passed, but dense data is required. Use X.toarray() to convert to a dense numpy array.

http://stackoverflow.com/questions/28384680/scikit-learns-pipeline-a-sparse-matrix-was-passed-but-dense-data-is-required

In [46]:
pipeline = Pipeline([('bow', CountVectorizer(analyzer=nlp_ml_functions.process_dataframe_text)),('to_dense',FunctionTransformer(lambda x: x.todense(), accept_sparse=True)),('classifier', GradientBoostingClassifier())])

In [47]:
OneVsRestClassifier(pipeline.fit(X_train_article, y_train_article))

OneVsRestClassifier(estimator=Pipeline(steps=[('bow', CountVectorizer(analyzer=<function process_dataframe_text at 0x11462e9d8>,
        binary=False, decode_error='strict', dtype=<class 'numpy.int64'>,
        encoding='utf-8', input='content', lowercase=True, max_df=1.0,
        max_features=None, min_df=1, ngram_range=(1, 1), preproce...=100, presort='auto', random_state=None,
              subsample=1.0, verbose=0, warm_start=False))]),
          n_jobs=1)

In [48]:
predictions = pipeline.predict(X_test_article)

#### Model evaluation.

In [49]:
print("Mean Accuracy:\n {:.2%}".format(pipeline.score(X_test_article, y_test_article)))

Mean Accuracy:
 22.77%


In [50]:
print("Confusion matrix:\n\n", confusion_matrix(y_test_article, predictions))

Confusion matrix:

 [[ 0  6  0  1  1  1  0  0]
 [ 0 49 13  3  4 28  1  0]
 [ 0 40  6  6  3 22  0  0]
 [ 0 29 12  6  3 23  2  0]
 [ 0 22  6  2  4 22  0  0]
 [ 1 29  9  7 10 32  0  0]
 [ 0  6  2  1  2 10  0  0]
 [ 0  1  1  0  0  0  0  0]]


In [51]:
print("Classification report:\n\n", classification_report(y_test_article, predictions))

Classification report:

              precision    recall  f1-score   support

        2.0       0.00      0.00      0.00         9
        3.0       0.27      0.50      0.35        98
        4.0       0.12      0.08      0.10        77
        5.0       0.23      0.08      0.12        75
        6.0       0.15      0.07      0.10        56
        7.0       0.23      0.36      0.28        88
        8.0       0.00      0.00      0.00        21
        9.0       0.00      0.00      0.00         2

avg / total       0.19      0.23      0.19       426



  'precision', 'predicted', average, warn_for)


### SVM.

In [52]:
pipeline = Pipeline([('bow', CountVectorizer(analyzer=nlp_ml_functions.process_dataframe_text)),('classifier', SVC())])

In [53]:
OneVsRestClassifier(pipeline.fit(X_train_article, y_train_article))

OneVsRestClassifier(estimator=Pipeline(steps=[('bow', CountVectorizer(analyzer=<function process_dataframe_text at 0x11462e9d8>,
        binary=False, decode_error='strict', dtype=<class 'numpy.int64'>,
        encoding='utf-8', input='content', lowercase=True, max_df=1.0,
        max_features=None, min_df=1, ngram_range=(1, 1), preproce...,
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))]),
          n_jobs=1)

In [54]:
predictions = pipeline.predict(X_test_article)

#### Model evaluation.

In [55]:
print("Mean Accuracy:\n {:.2%}".format(pipeline.score(X_test_article, y_test_article)))

Mean Accuracy:
 23.00%


In [56]:
print("Confusion matrix:\n\n", confusion_matrix(y_test_article, predictions))

Confusion matrix:

 [[ 0  9  0  0  0  0  0  0]
 [ 0 98  0  0  0  0  0  0]
 [ 0 77  0  0  0  0  0  0]
 [ 0 75  0  0  0  0  0  0]
 [ 0 56  0  0  0  0  0  0]
 [ 0 88  0  0  0  0  0  0]
 [ 0 21  0  0  0  0  0  0]
 [ 0  2  0  0  0  0  0  0]]


In [57]:
print("Classification report:\n\n", classification_report(y_test_article, predictions))

Classification report:

              precision    recall  f1-score   support

        2.0       0.00      0.00      0.00         9
        3.0       0.23      1.00      0.37        98
        4.0       0.00      0.00      0.00        77
        5.0       0.00      0.00      0.00        75
        6.0       0.00      0.00      0.00        56
        7.0       0.00      0.00      0.00        88
        8.0       0.00      0.00      0.00        21
        9.0       0.00      0.00      0.00         2

avg / total       0.05      0.23      0.09       426



  'precision', 'predicted', average, warn_for)
