## Read in Data

In [1]:
# Load the article dataset and prepare it in a single chain
import pandas as pd

df = (
    pd.read_csv('/Users/forsythd/Desktop/article_contents.csv')
    # Binary target: 0 = 'Disasters', 1 = 'Conflict and violence'
    # (any other tag value is mapped to NaN by Series.map)
    .assign(tag=lambda frame: frame['tag'].map({'Disasters': 0, 'Conflict and violence': 1}))
    # Keep only these columns, with the target last
    [['country', 'url', 'title', 'meta_description', 'content', 'tag']]
)

# Preview the first rows
df.head()

Unnamed: 0,country,url,title,meta_description,content,tag
0,Afghanistan,http://www.independent.co.uk/news/world/asia/1...,160 killed and hundreds left stranded by flood...,Flash flooding across Afghanistan and Pakistan...,Flash flooding across Afghanistan and Pakistan...,0
1,Afghanistan,http://floodlist.com/asia/afghanistan-flash-fl...,Afghanistan – Flash Floods in Faryab and Baghl...,,"Afghanistan state news agency, Bakhtar News Ag...",0
2,Afghanistan,http://floodlist.com/asia/afghanistan-6-dead-f...,Afghanistan - 6 Dead as Flash Floods Hit Badak...,,Flash floods have struck once again in the Bad...,0
3,Afghanistan,http://reliefweb.int/report/afghanistan/afghan...,Afghanistan Earthquake: Overview of Assessed N...,Afghanistan Earthquake: OCHA Situation Report ...,UN Office for the Coordination of Humanitarian...,0
4,Albania,http://www.euronews.com/2014/11/19/albania-flo...,Albania floods kill at least 3 people | Euronews,Flooding in Albania has killed at least three ...,Flooding in Albania has killed at least three ...,0


In [2]:
# Class balance: number of articles per encoded tag
df.tag.value_counts()

0    260
1     31
Name: tag, dtype: int64

In [3]:
# Replace missing values in the title, meta description and content columns
# with empty strings, so the columns can be concatenated without yielding NaN.
# The result is assigned back to df instead of calling fillna(inplace=True) on
# a column selection: that chained in-place form triggers a FutureWarning in
# pandas 2.x and does not modify df when Copy-on-Write is enabled.
df = df.fillna({'meta_description': '', 'content': '', 'title': ''})

## Split Data into a CV Set [70%] and an Evaluation Set [30%]

In [4]:
# Hold out 30% of the articles as an evaluation set; the remaining 70% is the
# CV set. The split is stratified on the tag and uses a fixed seed.
from sklearn.model_selection import train_test_split

X_CV, X_Eval, y_CV, y_Eval = train_test_split(
    df,
    df.tag,
    test_size=0.3,
    stratify=df.tag,
    random_state=0,
)

In [5]:
# Size and class balance of the CV set
print('Length', len(X_CV), end='\n\n')
print('Value Counts', X_CV['tag'].value_counts(), sep='\n')

Length 203

Value Counts
0    181
1     22
Name: tag, dtype: int64


In [6]:
# Size and class balance of the evaluation set
print('Length', len(X_Eval), end='\n\n')
print('Value Counts', X_Eval['tag'].value_counts(), sep='\n')

Length 88

Value Counts
0    79
1     9
Name: tag, dtype: int64


## Grid Search CV on CV set to find optimal Model Hyperparameters

In [7]:
# One document per article: title, meta description and content joined by spaces
combined_text = (
    X_CV['title']
    + ' ' + X_CV['meta_description']
    + ' ' + X_CV['content']
)

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Count 1- to 4-grams, dropping English stop words
count_vect = CountVectorizer(stop_words='english', ngram_range=(1, 4))
X_CV_counts = count_vect.fit_transform(combined_text)

# Re-weight the raw counts with TF-IDF
tfidf_transformer = TfidfTransformer()
X_CV_tfidf = tfidf_transformer.fit_transform(X_CV_counts)

# NOTE(review): the vocabulary and IDF weights are fitted on the whole CV set,
# so any later cross-validation on X_CV_tfidf uses features that have already
# seen its validation folds -- consider fitting them inside a Pipeline.
X_CV_tfidf

<203x127424 sparse matrix of type '<class 'numpy.float64'>'
	with 152508 stored elements in Compressed Sparse Row format>

In [8]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC

# Search space for the RBF-kernel SVM. The class_weight options range from
# equal weights up to a 10x weight on label 1; C is the reciprocal of each
# listed value.
tuned_parameters = [
    {
        'kernel': ['rbf'],
        'gamma': [.01, .03, 0.1, 0.3, 1.0, 3.0],
        'class_weight': [{0: 1, 1: 1}, {0: 1, 1: 4}, {0: 1, 1: 5}, {0: 1, 1: 10}],
        'C': [1 / inverse for inverse in [0.1, 0.3, 1.0, 3.0, 10.0]],
    }
]

scores = ['precision', 'recall']

# One independent 5-fold grid search per metric (macro-averaged). `clf` is
# rebound on every iteration, so after the loop it holds the search for the
# last metric in `scores`.
for score in scores:
    print("# Tuning hyper-parameters for %s" % score, end="\n\n")

    clf = GridSearchCV(SVC(C=1), tuned_parameters, cv=5,
                       scoring='%s_macro' % score).fit(X_CV_tfidf, y_CV)

    print("Best parameters set found on development set:", end="\n\n")
    print(clf.best_params_, end="\n\n")
    print("Grid scores on development set:", end="\n\n")

    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    # Mean fold score with a +/- two-standard-deviation band for each candidate
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r" % (mean, 2 * std, params))
    print()

# Tuning hyper-parameters for precision



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

Best parameters set found on development set:

{'class_weight': {0: 1, 1: 4}, 'kernel': 'rbf', 'gamma': 0.03, 'C': 10.0}

Grid scores on development set:

0.446 (+/-0.010) for {'class_weight': {0: 1, 1: 1}, 'kernel': 'rbf', 'gamma': 0.01, 'C': 10.0}
0.446 (+/-0.010) for {'class_weight': {0: 1, 1: 1}, 'kernel': 'rbf', 'gamma': 0.03, 'C': 10.0}
0.652 (+/-0.504) for {'class_weight': {0: 1, 1: 1}, 'kernel': 'rbf', 'gamma': 0.1, 'C': 10.0}
0.446 (+/-0.010) for {'class_weight': {0: 1, 1: 1}, 'kernel': 'rbf', 'gamma': 0.3, 'C': 10.0}
0.446 (+/-0.010) for {'class_weight': {0: 1, 1: 1}, 'kernel': 'rbf', 'gamma': 1.0, 'C': 10.0}
0.446 (+/-0.010) for {'class_weight': {0: 1, 1: 1}, 'kernel': 'rbf', 'gamma': 3.0, 'C': 10.0}
0.446 (+/-0.010) for {'class_weight': {0: 1, 1: 4}, 'kernel': 'rbf', 'gamma': 0.01, 'C': 10.0}
0.753 (+/-0.507) for {'class_weight': {0: 1, 1: 4}, 'kernel': 'rbf', 'gamma': 0.03, 'C': 10.0}
0.652 (+/-0.504) for {'class_weight': {0: 1, 1: 4}, 'kernel': 'rbf', 'gamma': 0.1, 'C': 1

In [9]:
# Best hyper-parameters from the most recent grid search. The tuning loop
# rebinds `clf` on every iteration, so this is the search run for the last
# entry in `scores` (recall).
clf.best_params_


{'C': 10.0, 'class_weight': {0: 1, 1: 4}, 'gamma': 0.03, 'kernel': 'rbf'}

## Train model with best parameters and test on evaluation set 

In [10]:
# Build the evaluation documents the same way as the CV documents:
# title, meta description and content joined by single spaces.
combined_text_eval = (
    X_Eval['title']
    + ' ' + X_Eval['meta_description']
    + ' ' + X_Eval['content']
)

# Only transform here: the vocabulary and IDF weights fitted on the CV set are reused
X_Eval_counts = count_vect.transform(combined_text_eval)
X_Eval_tfidf = tfidf_transformer.transform(X_Eval_counts)


In [11]:
from sklearn import svm

# Final classifier with hard-coded hyper-parameters, fitted on the full CV set
clf = svm.SVC(
    C=10.0,
    kernel='rbf',
    gamma=.03,
    class_weight={0: 1, 1: 4},
)
clf.fit(X_CV_tfidf, y_CV)

# Mean accuracy on the held-out evaluation set
clf.score(X_Eval_tfidf, y_Eval)

0.97727272727272729

In [12]:
from sklearn.metrics import classification_report

# Display names for the encoded classes, in label order (0, 1)
target_names = ['Disasters', 'Conflict and violence']

# Per-class precision / recall / F1 on the evaluation set
y_pred = clf.predict(X_Eval_tfidf)
print(classification_report(y_true=y_Eval, y_pred=y_pred, target_names=target_names))

                       precision    recall  f1-score   support

            Disasters       0.98      1.00      0.99        79
Conflict and violence       1.00      0.78      0.88         9

          avg / total       0.98      0.98      0.98        88



In [19]:
# Confusion matrix on the evaluation set (rows: actual label, columns: predicted label)
from sklearn.metrics import confusion_matrix

confusion_matrix(y_true=y_Eval, y_pred=y_pred)

array([[79,  0],
       [ 2,  7]])

In [13]:
# Show up to 150 characters per cell so the article text is readable
pd.set_option('display.max_colwidth', 150)

# Evaluation text next to its actual and predicted label, in a fixed column order
pred_df = pd.DataFrame(
    {
        'text': combined_text_eval,
        'actual': y_Eval,
        'pred': y_pred,
    },
    columns=['text', 'actual', 'pred'],
)
pred_df.head()

Unnamed: 0,text,actual,pred
54,SW China landslide death toll rises to 6 - Xinhua | English.news.cn SW China landslide death toll rises to 6---At least six people were killed and...,0,0
1,"Afghanistan – Flash Floods in Faryab and Baghlan Leave 8 Dead Afghanistan state news agency, Bakhtar News Agency (BNA) report that at least 7 peo...",0,0
217,"NDRRMC Update: SitRep No.18 re Preparedness Measures for Tropical Storm ""Lando"" (I.N. Koppu) (Extract) II. EFFECTS A. AFFECTED POPULATION (TAB A) ...",0,0
258,"Northeastern provinces slammed by summer storm KALASIN, 20 March 2014 (NNT) – Several provinces in the Northeast of Thailand, including Kalasin, U...",0,0
29,Environment and Climate Change Canada - Weather and Meteorology - Canada's Top Ten Weather Stories for 2014 Floods were the big newsmakers in Cana...,0,0


In [18]:
# Disaster articles (actual 0) that were also predicted as Disaster (pred 0)
pred_df.loc[pred_df['actual'].eq(0) & pred_df['pred'].eq(0)]

Unnamed: 0,text,actual,pred
54,SW China landslide death toll rises to 6 - Xinhua | English.news.cn SW China landslide death toll rises to 6---At least six people were killed and...,0,0
1,"Afghanistan – Flash Floods in Faryab and Baghlan Leave 8 Dead Afghanistan state news agency, Bakhtar News Agency (BNA) report that at least 7 peo...",0,0
217,"NDRRMC Update: SitRep No.18 re Preparedness Measures for Tropical Storm ""Lando"" (I.N. Koppu) (Extract) II. EFFECTS A. AFFECTED POPULATION (TAB A) ...",0,0
258,"Northeastern provinces slammed by summer storm KALASIN, 20 March 2014 (NNT) – Several provinces in the Northeast of Thailand, including Kalasin, U...",0,0
29,Environment and Climate Change Canada - Weather and Meteorology - Canada's Top Ten Weather Stories for 2014 Floods were the big newsmakers in Cana...,0,0
160,GLIDE Record Record-breaking winds on Okinawa's Yonaguni Island have destroyed at least 10 houses and damaged more than 200 others. Gusts of near...,0,0
22,"Brunei, Flood and Landslide in Tutong | ADInet Continuous downpour has generated heavy flooding and landslide in Tutong district. About 4 subdist...",0,0
273,"Uganda Floods Destroy Crops Floods sweeping across eastern Uganda have destroyed thousands of hectares of crops, aid officials said, warning that ...",0,0
59,"Typhoon Chan-Hom weakens, leaves east China regions - Xinhua | English.news.cn The rapid decline of the strength of Typhoon Chan-Hom has given a b...",0,0
32,Disaster Management Information System (DMIS) | Login The Disaster Management Information System (DMIS) is a web-based working tool made accessib...,0,0


In [14]:
# Conflict and violence articles (actual 1) that were also predicted as such (pred 1)
pred_df.loc[pred_df['actual'].eq(1) & pred_df['pred'].eq(1)]

Unnamed: 0,text,actual,pred
272,IDMC » Turkey IDP Figures Analysis The figure is based on a 2006 study commissioned by the government and carried out by Hacettepe University. Th...,1,1
62,"IDMC » Congo IDP Figures Analysis IDMC bases its estimates on the Republic of Congo’s own government estimate, which was published in its Displac...",1,1
11,"IDMC » Armenia IDP Figures Analysis A profiling exercise led by NRC in 2005 found that 65,000 families were displaced during the 1988-1994 confli...",1,1
67,"IDMC » Cyprus IDP Figures Analysis The figure is the number of people registered as IDPs by the government of the Republic of Cyprus (GoC, 2014) ...",1,1
109,"Rajnath to meet Tripura, Mizoram CMs on Bru rehab - Times of India Union home minister Rajnath Singh will meet the chief ministers of Tripura and ...",1,1
89,"IDMC » Georgia IDP Figures Analysis The estimate is a composite of figures reported by the government of Georgia, UNHCR, and the UN Inter-Agency ...",1,1
76,Egypt has complied with international law in Sinai: Cabinet - Politics - Egypt - Ahram Online Egypt has complied with international human rights ...,1,1


In [16]:
# Conflict and violence articles (actual 1) that were wrongly predicted as Disaster (pred 0)
pred_df.loc[pred_df['actual'].eq(1) & pred_df['pred'].eq(0)]

Unnamed: 0,text,actual,pred
112,Kashmir: Civilians flee as border fighting continues Thousands of villagers flee their homes in Indian-administered Kashmir as Indian and Pakistan...,1,0
194,Niger: Humanitarian Dashboard (December 2015) Niger is facing major humanitarian challenges exacerbated by the consequences of the conflicts in Ma...,1,0


In [17]:
# Disaster articles (actual 0) that were wrongly predicted as Conflict and violence (pred 1)
pred_df.loc[pred_df['actual'].eq(0) & pred_df['pred'].eq(1)]

Unnamed: 0,text,actual,pred
