In [None]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import itertools

In [None]:
# Key into `data_key` selecting which dataset this run analyses.
ds = "radicalism_causes"
# Fraction of the usable rows sampled into the training set.
train_percent = 0.5

In [None]:
# Registry of the available datasets, keyed by the value given to `ds`.
# Each entry records:
#   'file'  - path of the CSV to read (None for the combined poverty set,
#             which has no file of its own),
#   'text'  - name of the column holding the free-text response,
#   'class' - name of the column holding the category label,
#   'name'  - human-readable name used in plot titles.
data_key = {
    'poverty_causes': {
        'file': 'data/poverty_causes.csv',
        'text': 'Response',
        'class': 'pcGEDO1',
        'name': 'Causes of Poverty',
    },
    'poverty_solutions': {
        'file': 'data/poverty_solutions.csv',
        'text': 'Response',
        'class': 'PSGEDO 1',
        'name': 'Solutions to Poverty',
    },
    'poverty_combined': {
        'file': None,
        'text': 'Response',
        'class': 'pcGEDO1',
        'name': 'Causes and Solutions',
    },
    # Both radicalism datasets come from the same CSV, using different columns.
    'radicalism_solutions': {
        'file': 'data/radicalism_causes_solutions.csv',
        'text': 'radsolution',
        'class': 'GEDO 1.1',
        'name': 'Solutions to Radicalism',
    },
    'radicalism_causes': {
        'file': 'data/radicalism_causes_solutions.csv',
        'text': 'radcause',
        'class': 'GEDO 1',
        'name': 'Causes of Radicalism',
    },
}

In [None]:

# Load the dataset selected by `ds` into `df`.
print('Setting up for', ds)
if ds == 'poverty_combined':
    # The combined set has no file of its own: it is the poverty-solutions
    # rows stacked on top of the poverty-causes rows.
    solutions_key = data_key['poverty_solutions']
    causes_key = data_key['poverty_causes']
    df1 = pd.read_csv(solutions_key['file'])
    df2 = pd.read_csv(causes_key['file'])
    # Rename the solutions label column to the causes label column so both
    # frames share one label column after concatenation.
    df1 = df1.rename(columns={solutions_key['class']: causes_key['class']})

    c1 = df1[[solutions_key['text'], causes_key['class']]]
    c2 = df2[[causes_key['text'], causes_key['class']]]
    # ignore_index gives every stacked row a unique label; the train/test
    # split further down selects test rows by index label, so duplicated
    # labels would silently drop rows from the test set.
    df = pd.concat([c1, c2], ignore_index=True)
else:
    df = pd.read_csv(data_key[ds]['file'])

# Last expression of the cell, so the preview is actually displayed
# (an expression inside the if/else branches produces no output).
df.head()


In [None]:
# Build `df_not_null`: rows with both a text and a label, restricted to
# categories that have enough examples to be worth training on.
MIN_CATEGORY_COUNT = 5

# Remove null
df_not_null = df.dropna(subset=[data_key[ds]['text'], data_key[ds]['class']])

# Remove categories that have fewer than MIN_CATEGORY_COUNT rows.
counts = df_not_null[data_key[ds]['class']].value_counts()
remove = counts[counts < MIN_CATEGORY_COUNT]
print('Removing categories with fewer than', MIN_CATEGORY_COUNT, 'rows:', list(remove.index))
# Filter with a boolean mask instead of overwriting labels with NaN: writing
# NaN into the column turns an integer label column into floats and assigns
# into a slice of `df`.
df_not_null = df_not_null[~df_not_null[data_key[ds]['class']].isin(remove.index)]

df_not_null.shape

In [None]:
# Bar chart of the number of rows per category in df_not_null.
category_counts = df_not_null[data_key[ds]['class']].value_counts()
category_counts.plot(kind='bar', title='Category counts')

In [None]:
from sklearn import preprocessing
# Integer-encode the category labels; the encoded array is the cell's
# displayed output.
category_labels = list(df_not_null[data_key[ds]['class']])
le = preprocessing.LabelEncoder()
le.fit(category_labels).transform(category_labels)


In [None]:
# Define the testing and training data
# A fixed random_state keeps the split reproducible across runs.
train = df_not_null.sample(frac=train_percent, random_state=42)
# Every row not sampled into `train` goes to `test` (selected by index label).
test = df_not_null.drop(train.index)
# The label-based selection above is only a true partition when index labels
# are unique; fail loudly instead of silently losing rows.
assert len(train) + len(test) == len(df_not_null), 'index labels are not unique; train/test do not partition the data'
print('Number of values in training: ', train.shape[0], 'Number of values in testing: ', test.shape[0])

# Side-by-side category distributions of the two splits.
fig, (ax1, ax2) = plt.subplots(1, 2)
train[data_key[ds]['class']].value_counts().plot(kind='bar', title='Training Category Counts', ax=ax1)
test[data_key[ds]['class']].value_counts().plot(kind='bar', title='Testing Category Counts', ax=ax2)
plt.show()

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import GridSearchCV

# Stand-alone baseline pipeline. The grid search uses the pipelines stored in
# `clf_proc` below, not this one.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                          random_state=42, max_iter=5, tol=None)),
])

# Feature-extraction steps shared by every candidate pipeline:
# token counts followed by TF-IDF weighting.
text_steps = [
    ('vect', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer()),
]

# n-gram ranges explored by the grid search.
ngram_list = [(1, 1), (1, 2), (1, 3)]

# Vectorizer part of the parameter grid, identical for all classifiers.
vect_grid = {
    'vect__ngram_range': ngram_list,
    'vect__analyzer': ['word', 'char'],
    'vect__stop_words': ['english'],
}

# clf_proc maps a classifier name to its pipeline and its GridSearchCV grid.
clf_proc = {}

p = text_steps + [
    ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                          random_state=42, max_iter=100, tol=None)),
]
clf_proc['SGDClassifier'] = {
    'pipeline': Pipeline(p),
    'params': dict(
        vect_grid,
        clf__alpha=(1e-2, 1e-3, 1e-4, 1e-5),
        # NOTE(review): newer scikit-learn releases renamed the 'log' loss to
        # 'log_loss' -- confirm against the installed version.
        clf__loss=('log', 'hinge', 'modified_huber', 'perceptron', 'huber'),
    ),
}

p = text_steps + [('clf', MultinomialNB())]
clf_proc['NaiveBayes'] = {
    'pipeline': Pipeline(p),
    'params': dict(
        vect_grid,
        clf__alpha=(1e-2, 1e-3, 1e-4, 1e-5),
    ),
}

p = text_steps + [('clf', LinearSVC())]
clf_proc['LinearSVC'] = {
    'pipeline': Pipeline(p),
    'params': dict(
        vect_grid,
        clf__loss=('hinge', 'squared_hinge'),
        # NOTE(review): 1e3 sits between 1e-2 and 1e-4; possibly meant 1e-3 --
        # confirm before changing.
        clf__C=(1, 1e-2, 1e3, 1e-4),
    ),
}

# Classification and Scores

In [None]:
from sklearn import metrics
# Grid-search each candidate pipeline on the training split, then evaluate the
# best configuration on the held-out test split.
X_train = train[data_key[ds]['text']]
y_train = list(train[data_key[ds]['class']])
X_test = test[data_key[ds]['text']]
y_test = test[data_key[ds]['class']]

results = {}      # classifier name -> confusion matrix on the test split
data_frames = {}  # classifier name -> table of text / true / predicted category
for clf_name, spec in clf_proc.items():
    print('Trying: ', clf_name)
    gs_clf = GridSearchCV(spec['pipeline'], spec['params'], n_jobs=-1)
    gs_clf.fit(X_train, y_train)
    print('best params:', gs_clf.best_params_)
    print('best score:', gs_clf.best_score_)
    predicted = gs_clf.predict(X_test)
    # The mean of (predicted == truth) is the fraction of correct predictions,
    # i.e. the accuracy -- it was previously mislabelled as an error.
    print('mean accuracy: ', np.mean(predicted == y_test))

    print(metrics.classification_report(np.array(y_test), predicted))
    results[clf_name] = metrics.confusion_matrix(y_test, predicted)
    data_frames[clf_name] = pd.DataFrame(data={'Text': X_test, 'True Category': y_test, 'Predicted Category': predicted})
    print()

In [None]:
# Write each classifier's test-set predictions to
# <classifier>_<dataset>_results.csv in the working directory.
for clf_name, frame in data_frames.items():
    frame.to_csv('{}_{}_results.csv'.format(clf_name, ds), index=False)

In [None]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Draw a confusion matrix as an annotated heat map on the current figure.

    Parameters
    ----------
    cm : 2-D numpy array
        Confusion matrix; rows are labelled as true classes and columns as
        predicted classes.
    classes : sequence
        Tick labels, one per row/column, in the same order as the matrix.
    normalize : bool
        If True, every row is divided by its row sum so cells show the
        fraction of each true class. Rows whose sum is zero (a class with no
        true samples) are shown as zeros instead of NaN.
    title : str
        Title of the plot.
    cmap : matplotlib colormap
        Colormap used for the heat map.

    Prints a one-line note stating whether normalization was applied.
    """
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # Divide only where the row total is non-zero; a 0/0 would yield NaN,
        # which also turns the colour threshold below into NaN.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    # Cells darker than half the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Lay out after the axis labels exist so they are included in the layout.
    plt.tight_layout()
# Classifier names in the order used by the index-based cells below.
keys = ['SGDClassifier', 'NaiveBayes', 'LinearSVC']

# Confusion Matrices

In [None]:
idx = 0
plt.figure()
# confusion_matrix orders its rows/columns by the sorted union of true and
# predicted labels, so the tick labels must use the same union; taking only
# the test labels misaligns the ticks when a class is predicted but never
# occurs in the test split.
classes = sorted(set(test[data_key[ds]['class']]) | set(data_frames[keys[idx]]['Predicted Category']))
plot_confusion_matrix(results[keys[idx]], classes=classes, title=keys[idx]+' Confusion Matrix of ' + data_key[ds]['name'], normalize=True)
plt.tight_layout()
plt.show()

In [None]:
idx = 1
plt.figure()
# confusion_matrix orders its rows/columns by the sorted union of true and
# predicted labels, so the tick labels must use the same union; taking only
# the test labels misaligns the ticks when a class is predicted but never
# occurs in the test split.
classes = sorted(set(test[data_key[ds]['class']]) | set(data_frames[keys[idx]]['Predicted Category']))
plot_confusion_matrix(results[keys[idx]], classes=classes, title=keys[idx]+' Confusion Matrix of ' + data_key[ds]['name'], normalize=True)
plt.tight_layout()
plt.show()

In [None]:
idx = 2
plt.figure()
# confusion_matrix orders its rows/columns by the sorted union of true and
# predicted labels, so the tick labels must use the same union; taking only
# the test labels misaligns the ticks when a class is predicted but never
# occurs in the test split.
classes = sorted(set(test[data_key[ds]['class']]) | set(data_frames[keys[idx]]['Predicted Category']))
plot_confusion_matrix(results[keys[idx]], classes=classes, title=keys[idx]+' Confusion Matrix of ' + data_key[ds]['name'], normalize=True)
plt.tight_layout()
plt.show()

# Examples of misclassification 

In [None]:
def print_misclassified(idx, samples):
    """
    Print example test rows that classifier `keys[idx]` got wrong.

    For each true category that has at least one wrong prediction, prints up
    to `samples` rows showing the predicted category, the true category and
    the text, in the order the rows appear in the results table.

    idx     -- position of the classifier name in the module-level `keys` list
    samples -- maximum number of examples printed per true category
    """
    print('Errors for ', keys[idx])
    table = data_frames[keys[idx]]
    wrong = table[table['True Category'] != table['Predicted Category']]
    for category in wrong['True Category'].unique():
        examples = wrong[wrong['True Category'] == category]
        for shown, (_, row) in enumerate(examples.iterrows()):
            # Stop once `samples` rows have been printed for this category.
            if samples - 1 < shown:
                break
            print('Predicted:', row['Predicted Category'], 'Truth:', row['True Category'], 'Text: \'', row['Text'], '\'')


    

    

In [None]:
# One misclassified example per true category for the first classifier in `keys`.
print_misclassified(idx=0, samples=1)

In [None]:
# One misclassified example per true category for the second classifier in `keys`.
print_misclassified(idx=1, samples=1)

In [None]:
# One misclassified example per true category for the third classifier in `keys`.
print_misclassified(idx=2, samples=1)