In [1]:
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
import pandas as pd
import warnings
warnings.simplefilter("ignore")
import glob
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
import sklearn
import tensorflow as tf
from tensorflow.contrib.tensor_forest.python import tensor_forest
from tensorflow.python.ops import resources
import os

os.environ["CUDA_VISIBLE_DEVICES"] = ""
from sklearn.model_selection import train_test_split

In [2]:
def get_articles(name, root='../data/news'):
    """Load every ``<id>.txt`` article of one dataset split into a dict.

    Parameters
    ----------
    name : str
        Sub-directory of ``root`` that holds the split (e.g. ``'training'``).
    root : str, optional
        Directory containing the split folders. Defaults to the location the
        notebook has always read from, ``'../data/news'``.

    Returns
    -------
    dict
        Maps the integer article id (the file name without its ``.txt``
        extension) to the full text of that file. Empty when no ``.txt``
        file matches.

    Raises
    ------
    ValueError
        If a matching file name is not an integer id.
    """
    dictio = {}
    for file_name in glob.glob(os.path.join(root, name, '*.txt')):
        # Take the id from the file name itself instead of a fixed position
        # in the path, so the result does not depend on how deep the data
        # directory is nor on the OS path separator.
        id_article = os.path.splitext(os.path.basename(file_name))[0]
        # The context manager closes the handle; opening without it leaked
        # one descriptor per article.
        with open(file_name, 'r') as handle:
            dictio[int(id_article)] = handle.read()
    return dictio


def get_words(content):
    """Return the distinct non-stop-word tokens found in one article.

    The text is split on ``'.'`` and each piece is handed to a
    ``TfidfVectorizer`` as a separate document; only the vocabulary the
    vectorizer learns is used, the TF-IDF matrix itself is discarded.
    Case is preserved (``lowercase=False``) so callers can inspect
    capitalisation of the returned tokens.

    Parameters
    ----------
    content : str
        Raw article text.

    Returns
    -------
    list of str
        The vectorizer's feature names, or ``[]`` when the text yields no
        token at all (empty text, or nothing but English stop words).
    """
    vectorizer = TfidfVectorizer(min_df=1,lowercase=False,stop_words='english')
    try:
        vectorizer.fit_transform(content.split('.'))
    except ValueError as exc:
        # scikit-learn signals "no usable token" with this specific message.
        # One such article should count as having zero words instead of
        # aborting feature extraction for the whole corpus; any other
        # ValueError is still propagated.
        if 'empty vocabulary' not in str(exc):
            raise
        return []
    return vectorizer.get_feature_names()

def get_features(df,vocab=None):
    """Build the per-article feature matrix from raw article text.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the article text in a ``'content'`` column. As a side
        effect the columns ``len_content``, ``len_title`` and
        ``nb_relevant`` are added to ``df`` itself.
    vocab : optional
        Forwarded to ``CountVectorizer(vocabulary=...)``. ``None`` lets the
        vectorizer learn its own terms from ``df['content']``.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``df``. Contains one count column per term of the count
        vectorizer, followed by ``len_content``, ``nb_relevant``,
        ``len_title``, ``nb_words``, ``nb_uppercase_words`` and
        ``nb_entity`` (in that order).
    """
    df['len_content'] = df['content'].apply(len)
    # The first line of the text is treated as the title.
    df['len_title'] = df['content'].apply(lambda text: len(text.split('\n')[0]))

    # Corpus-level vocabulary: terms present in at least 8 articles and in at
    # most 75% of them. Only the vocabulary is used, not the TF-IDF matrix.
    global_vectorizer = TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',encoding='utf-8', input='content',
                                        lowercase=True, max_df=0.75, max_features=None, min_df=8,
                                        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
                                        stop_words='english', strip_accents=None, sublinear_tf=False,
                                        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
                                        vocabulary=None)
    global_vectorizer.fit_transform(df['content'])
    # Build the set once; it is the same for every article.
    pertinent_vocab = set(global_vectorizer.get_feature_names())

    # Case-preserving token list of each article, in row order.
    # NOTE(review): the corpus vocabulary above is lower-cased while these
    # tokens are not, so capitalised tokens never count as "relevant" --
    # confirm this is intended.
    article_vocab = [get_words(content) for content in df['content']]
    df['nb_relevant'] = [len(set(words) & pertinent_vocab) for words in article_vocab]

    count_vectorizer = CountVectorizer(min_df=6,max_df=0.75,stop_words='english',max_features=400,vocabulary=vocab)
    counts = count_vectorizer.fit_transform(df['content'])
    df_words = pd.DataFrame(counts.toarray(),index=df.index,columns=count_vectorizer.get_feature_names())
    df_words['len_content'] = df['len_content']
    df_words['nb_relevant'] = df['nb_relevant']
    df_words['len_title'] = df['len_title']

    df_words['nb_words'] = [len(words) for words in article_vocab]
    # Tokens written entirely in upper case.
    df_words['nb_uppercase_words'] = [
        sum(1 for word in words if word.isupper()) for words in article_vocab
    ]
    # Tokens containing at least one upper-case character. Computed in one
    # pass and assigned as a whole column: incrementing through
    # ``df_words['nb_entity'].loc[index]`` is chained assignment, which is
    # slow and is not guaranteed to write back into ``df_words``.
    df_words['nb_entity'] = [
        sum(1 for word in words if any(char.isupper() for char in word))
        for words in article_vocab
    ]
    return df_words

def get_y_train(path='../data/labels_training.txt'):
    """Read the training labels into a one-column DataFrame.

    The file is expected to have one header line followed by
    ``<article id>,<label>`` lines.

    Parameters
    ----------
    path : str, optional
        Location of the label file. Defaults to the path the notebook has
        always used.

    Returns
    -------
    pandas.DataFrame
        Column ``fake_news`` holding the integer label, indexed by the
        integer article id, in file order.

    Raises
    ------
    ValueError
        If an id or label field is not an integer.
    """
    id_articles, labels = [], []
    with open(path, 'r') as handle:
        next(handle, None)  # skip the header line
        for line in handle:
            # strip() removes the line ending whatever it is; slicing off the
            # last character instead would eat the label digit of a final
            # line that has no trailing newline.
            line = line.strip()
            if not line:
                continue  # tolerate blank lines
            fields = line.split(',')
            id_articles.append(int(fields[0]))
            labels.append(int(fields[1]))
    return pd.DataFrame(labels,columns=['fake_news'],index=id_articles)


In [3]:
rough_data = get_articles('training')

# Files are discovered in whatever order glob returns them, so order the
# feature rows by article id to make the row order deterministic.
features = get_features(pd.DataFrame.from_dict(rough_data,orient='index',columns=['content'])).sort_index()

# train_test_split pairs features and targets by position, not by index, so
# the labels must be in exactly the same id order as the feature rows.
all_labels = get_y_train()
missing_labels = features.index.difference(all_labels.index)
assert missing_labels.empty, "articles without a label: %s" % list(missing_labels)
targets = all_labels.loc[features.index]
assert features.index.equals(targets.index), "features and targets are not aligned"

In [4]:
tf.reset_default_graph()

# Positional split: features and targets must already be in the same row order.
X_train, X_test, y_train, y_test = train_test_split(features, targets, test_size = 0.25, random_state = 0)

# Parameters
num_steps = 100 # Total steps to train
display_step = 50 # Report loss/accuracy every this many steps (and on step 1)
num_classes = 2
num_features = features.shape[1]
num_trees = 10
max_nodes = 1000

# Input and Target placeholders
X = tf.placeholder(tf.float32, shape=[None,X_train.shape[1]])
# Labels are fed as a one-column frame, hence the [batch, 1] shape.
Y = tf.placeholder(tf.int64, shape=[None,1])

# Random Forest Parameters
hparams = tensor_forest.ForestHParams(num_classes=num_classes, num_features=num_features, num_trees=num_trees, max_nodes=max_nodes).fill()

# Build the Random Forest
forest_graph = tensor_forest.RandomForestGraphs(hparams)

# Get training graph and loss
train_op = forest_graph.training_graph(X, Y)
loss_op = forest_graph.training_loss(X, Y)

# Measure the accuracy
infer_op, _, _ = forest_graph.inference_graph(X)
# argmax yields shape [batch] while Y is [batch, 1]. Comparing them directly
# broadcasts to a [batch, batch] matrix, and the mean of that matrix is not
# the per-sample accuracy. Flatten the labels so the comparison is
# element-wise.
labels_flat = tf.reshape(Y, [-1])
correct_prediction = tf.equal(tf.argmax(infer_op, 1), labels_flat)
accuracy_op = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

# Initialize the variables (i.e. assign their default value) and forest resources
init_vars = tf.group(tf.global_variables_initializer(), resources.initialize_resources(resources.shared_resources()))

# Start TensorFlow session
# NOTE(review): the session is left open on purpose in case later cells reuse
# `sess`; call sess.close() once the notebook no longer needs it.
sess = tf.Session()

# Run the initializer
sess.run(init_vars)

# Training
for i in range(1, num_steps + 1):
    _, l = sess.run([train_op, loss_op], feed_dict={X: X_train, Y: y_train})
    if i % display_step == 0 or i == 1:
        acc = sess.run(accuracy_op, feed_dict={X: X_train, Y: y_train})
        print('Step %i, Loss: %f, Acc: %f' % (i, l, acc))

# Test Model
print("Test Accuracy:", sess.run(accuracy_op, feed_dict={X: X_test, Y: y_test}))

INFO:tensorflow:Constructing forest with params = 
INFO:tensorflow:{'num_trees': 10, 'max_nodes': 1000, 'bagging_fraction': 1.0, 'feature_bagging_fraction': 1.0, 'num_splits_to_consider': 20, 'max_fertile_nodes': 0, 'split_after_samples': 250, 'valid_leaf_threshold': 1, 'dominate_method': 'bootstrap', 'dominate_fraction': 0.99, 'model_name': 'all_dense', 'split_finish_name': 'basic', 'split_pruning_name': 'none', 'collate_examples': False, 'checkpoint_stats': False, 'use_running_stats_method': False, 'initialize_average_splits': False, 'inference_tree_paths': False, 'param_file': None, 'split_name': 'less_or_equal', 'early_finish_check_every_samples': 0, 'prune_every_samples': 0, 'num_classes': 2, 'num_features': 406, 'bagged_num_features': 406, 'bagged_features': None, 'regression': False, 'num_outputs': 1, 'num_output_columns': 3, 'base_random_seed': 0, 'leaf_model_type': 0, 'stats_model_type': 0, 'finish_type': 0, 'pruning_type': 0, 'split_type': 0}
Instructions for updating:
Coloca