# Classify articles using labeled data

In [1]:
import pandas as pd
import numpy as np

## Data source: 291 downloaded articles from the training data

In [2]:
# Load the article table and replace missing values with empty strings so the
# text columns can be passed to a vectorizer without NaN handling.
df = (
  pd.read_csv('https://s3-us-west-1.amazonaws.com/simon.bedford/d4d/article_contents.csv')
  .fillna('')
)

In [3]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,country,url,tag,title,meta_description,content
0,Afghanistan,http://www.independent.co.uk/news/world/asia/1...,Disasters,160 killed and hundreds left stranded by flood...,Flash flooding across Afghanistan and Pakistan...,Flash flooding across Afghanistan and Pakistan...
1,Afghanistan,http://floodlist.com/asia/afghanistan-flash-fl...,Disasters,Afghanistan – Flash Floods in Faryab and Baghl...,,"Afghanistan state news agency, Bakhtar News Ag..."
2,Afghanistan,http://floodlist.com/asia/afghanistan-6-dead-f...,Disasters,Afghanistan - 6 Dead as Flash Floods Hit Badak...,,Flash floods have struck once again in the Bad...
3,Afghanistan,http://reliefweb.int/report/afghanistan/afghan...,Disasters,Afghanistan Earthquake: Overview of Assessed N...,Afghanistan Earthquake: OCHA Situation Report ...,UN Office for the Coordination of Humanitarian...
4,Albania,http://www.euronews.com/2014/11/19/albania-flo...,Disasters,Albania floods kill at least 3 people | Euronews,Flooding in Albania has killed at least three ...,Flooding in Albania has killed at least three ...


In [4]:
# Row counts per tag, to show how the two classes are balanced.
df.groupby("tag").count()

Unnamed: 0_level_0,country,url,title,meta_description,content
tag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Conflict and violence,31,31,31,31,31
Disasters,260,260,260,260,260


## Define custom Transformers for use in pipelines

In [5]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer

class FSTransformer(BaseEstimator, TransformerMixin):
  """
  Select a fixed set of DataFrame columns and return their values as an array.

  Parameters
  ----------
  features : column label or list of column labels
    Passed directly to ``df[...]`` in ``transform``.
  """
  def __init__(self, features):
    self.features = features

  def fit(self, X, y=None):
    """
    Nothing is learned; returns ``self`` unchanged.

    ``y`` defaults to None so the transformer can be fitted without labels
    (for example through ``fit_transform(df)``).
    """
    return self

  def transform(self, df):
    """
    Return the values of the selected columns of ``df`` as a NumPy array.
    """
    # `.values` replaces `DataFrame.as_matrix()`, which was removed in pandas 1.0.
    return df[self.features].values

  
class CountVecTransformer(BaseEstimator, TransformerMixin):
  """
  Apply a CountVectorizer (non-binary counts) to a single DataFrame column.

  Parameters
  ----------
  col : column label
    The column of the input DataFrame whose text is vectorized.
  """
  def __init__(self, col):
    self.col = col
    self.vectorizer = CountVectorizer(binary=False)

  def _texts(self, df):
    # The column of `df` that the wrapped vectorizer operates on.
    return df[self.col]

  def fit(self, df, y=None):
    """Fit the wrapped vectorizer on ``df[col]``; ``y`` is ignored."""
    self.vectorizer.fit(self._texts(df))
    return self

  def transform(self, df):
    """Return the vectorizer's output for ``df[col]``."""
    texts = self._texts(df)
    return self.vectorizer.transform(texts)

# Build the feature matrix X from token counts of the "url" column.
# NOTE(review): only the URL text is vectorized here, not the title or
# content columns — confirm this is intended.
cvt = CountVecTransformer("url")
X = cvt.fit_transform(df)

### Convert string label to integer

In [6]:
from sklearn.preprocessing import LabelEncoder
# Encode the string tags as integer class labels; `le` is kept so the
# integer codes can be mapped back to tag names later.
le = LabelEncoder()
y = le.fit_transform(df.tag)

### Predict class based only on the frequency of the labels

In [7]:
from sklearn.dummy import DummyClassifier
from sklearn.feature_extraction.text import CountVectorizer

# Baseline: a classifier that ignores the features and samples predictions
# from the label distribution. Repeated with five seeds to show the spread.
for i in range(5):
  clf_dummy = DummyClassifier(strategy="stratified", random_state=i)
  clf_dummy.fit(X, y)
  print(clf_dummy.score(X, y))

0.810996563574
0.790378006873
0.824742268041
0.810996563574
0.776632302405


In [8]:
from sklearn.linear_model import RidgeClassifier

# Fit a ridge classifier on all of the data and score it on that same data.
clf_ridge = RidgeClassifier()
clf_ridge.fit(X, y)
print(clf_ridge.score(X, y))

1.0


### A training score of 1.0 is not a reliable estimate: cross-validate on held-out data

In [9]:
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import f1_score

def get_model_scores(model, X, y, n_splits=5, test_size=0.3, random_state=42):
  """
  Fit and score `model` on repeated stratified train/test splits.

  One line is printed per split with the training score, the test score and
  the F1 score of the test predictions. Nothing is returned.

  Parameters
  ----------
  model : estimator with `fit`, `score` and `predict`
    Refit in place on every split, so after the call it holds the fit from
    the last training subset.
  X : feature matrix that supports `X[rows, :]` indexing
  y : array of labels aligned with the rows of X
  n_splits : number of shuffle splits (default 5)
  test_size : fraction of rows held out in each split (default 0.3)
  random_state : seed for the splitter, for reproducible splits (default 42)

  Notes
  -----
  The "R2_train" / "R2_test" labels show whatever `model.score` returns. For
  scikit-learn classifiers that is mean accuracy, not R-squared. The labels
  are kept unchanged so printed results stay comparable with earlier runs.
  `f1_score` is called with its default arguments.
  """
  sss = StratifiedShuffleSplit(n_splits=n_splits, test_size=test_size,
                               random_state=random_state)

  for train_index, test_index in sss.split(X, y):
    X_train, X_test = X[train_index, :], X[test_index, :]
    y_train, y_test = y[train_index], y[test_index]

    model.fit(X_train, y_train)
    r2_train = model.score(X_train, y_train)
    r2_test = model.score(X_test, y_test)
    y_pred = model.predict(X_test)
    f1 = f1_score(y_test, y_pred)
    print("R2_train: {0} R2_test: {1} f1: {2}".format(r2_train, r2_test, f1))

# Score the ridge classifier on held-out stratified splits.
# Note: this refits clf_ridge in place on each training subset.
get_model_scores(clf_ridge, X, y)

R2_train: 1.0 R2_test: 0.9204545454545454 f1: 0.9575757575757576
R2_train: 1.0 R2_test: 0.9090909090909091 f1: 0.9506172839506173
R2_train: 1.0 R2_test: 0.9659090909090909 f1: 0.9811320754716981
R2_train: 1.0 R2_test: 0.9318181818181818 f1: 0.9634146341463414
R2_train: 1.0 R2_test: 0.9204545454545454 f1: 0.9565217391304348


## TODO: Decide if we need high precision, high recall, or a balance