In [None]:
# Base libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import datetime
import warnings
import seaborn as sns
from tqdm import tqdm_notebook as tqdm
warnings.simplefilter("ignore")

# Content extraction
import glob

# Prediction libraries
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import  RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict

# Network part
import networkx as nx

In [None]:
def get_articles(name):
    """Load every article of one dataset split into a dictionary.

    Reads all ``*.txt`` files found in ``../data/news/<name>/``. The file
    stem (file name without its extension) is parsed as the integer article
    id.

    Parameters
    ----------
    name : str
        Sub-directory of ``../data/news/`` to read (the notebook uses
        ``'training'`` and ``'test'``).

    Returns
    -------
    dict
        Maps the ``int`` article id to the full text of the file. Empty when
        no file matches.

    Raises
    ------
    ValueError
        If a file stem cannot be parsed as an integer.
    """
    # Local import: ``os`` is not part of the notebook's import cell.
    import os

    articles = {}
    for file_name in glob.glob('../data/news/' + name + '/*.txt'):
        # Take the id from the file name itself rather than from a fixed
        # position in the path, so neither the path separator nor the depth
        # of ``name`` matters.
        stem = os.path.splitext(os.path.basename(file_name))[0]
        # The context manager closes each file as soon as it has been read.
        with open(file_name, 'r') as handle:
            articles[int(stem)] = handle.read()
    return articles

def get_words(content):
    """Return the distinct non-stop-word tokens of one article.

    The text is split on ``'.'`` and every piece is handed to a
    ``TfidfVectorizer`` as a separate document. Only the vocabulary learned
    by the vectorizer is used. Case is preserved (``lowercase=False``) and
    English stop words are dropped.

    Parameters
    ----------
    content : str
        Raw text of one article.

    Returns
    -------
    list of str
        The vocabulary terms, in the vectorizer's feature order.
    """
    vectorizer = TfidfVectorizer(min_df=1, lowercase=False, stop_words='english')
    # Only the vocabulary is needed, so fitting is enough; the matrix that
    # ``fit_transform`` would also build was never used.
    vectorizer.fit(content.split('.'))
    # ``get_feature_names`` was removed in scikit-learn 1.2; prefer its
    # replacement and fall back to the old accessor on older releases.
    names = getattr(vectorizer, 'get_feature_names_out', None)
    if names is None:
        names = vectorizer.get_feature_names
    # Always return a plain list, whichever accessor was used.
    return list(names())

def _feature_names(vectorizer):
    """Return the fitted vocabulary of ``vectorizer`` as a plain list."""
    # ``get_feature_names`` was removed in scikit-learn 1.2; prefer its
    # replacement and fall back to the old accessor on older releases.
    getter = getattr(vectorizer, 'get_feature_names_out', None)
    if getter is None:
        getter = vectorizer.get_feature_names
    return list(getter())


def get_features(df, vocab=None):
    """Build the numeric feature table used by the classifiers.

    For every article in ``df['content']`` the result holds:

    * one bag-of-words count column per vocabulary term,
    * ``len_content``: number of characters of the article,
    * ``nb_relevant``: number of the article's distinct tokens that also
      belong to the corpus-level TF-IDF vocabulary,
    * ``len_title``: number of characters of the first line,
    * ``nb_words``: number of distinct tokens returned by ``get_words``,
    * ``nb_uppercase_words``: how many of those tokens satisfy
      ``str.isupper()``,
    * ``nb_entity``: how many of those tokens contain at least one
      upper-case character.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a ``content`` column of strings. Side effect: the columns
        ``len_content``, ``len_title`` and ``nb_relevant`` are also written
        onto ``df`` itself.
    vocab : iterable of str, optional
        Fixed vocabulary for the bag-of-words columns. When ``None`` the
        vocabulary is learned from ``df``.

    Returns
    -------
    tuple
        ``(features, feature_names)``: the feature ``DataFrame`` (indexed
        like ``df``) and the list of bag-of-words terms, suitable to be
        passed back as ``vocab`` for another data set.
    """
    contents = df['content']

    df['len_content'] = contents.str.len()
    # The title is taken to be the first line of the article.
    df['len_title'] = contents.apply(lambda text: len(text.split('\n', 1)[0]))

    # Corpus-level vocabulary. Only the non-default options are spelled out;
    # every other TfidfVectorizer option is left at its library default.
    corpus_vectorizer = TfidfVectorizer(max_df=0.75, min_df=8, stop_words='english')
    corpus_vectorizer.fit(contents)
    # Build the set once instead of once per article.
    pertinent_vocab = set(_feature_names(corpus_vectorizer))

    article_vocab = [get_words(content) for content in contents]
    # NOTE(review): article tokens keep their case while the corpus
    # vocabulary is lower-cased, so capitalised tokens never count as
    # relevant; confirm this is intended.
    df['nb_relevant'] = [len(set(words) & pertinent_vocab) for words in article_vocab]

    # With a fixed ``vocab`` the columns are exactly that vocabulary, which
    # keeps training and test tables aligned.
    count_vectorizer = CountVectorizer(min_df=6, max_df=0.75, stop_words='english',
                                       max_features=400, vocabulary=vocab)
    counts = count_vectorizer.fit_transform(contents)
    feature_names = _feature_names(count_vectorizer)
    df_words = pd.DataFrame(counts.toarray(), index=df.index, columns=feature_names)

    for column in ('len_content', 'nb_relevant', 'len_title'):
        df_words[column] = df[column]

    df_words['nb_words'] = [len(words) for words in article_vocab]
    df_words['nb_uppercase_words'] = [
        sum(1 for word in words if word.isupper()) for words in article_vocab
    ]
    # Computed as a whole column: the previous per-word
    # ``df_words['nb_entity'].loc[index] += 1`` was a chained assignment,
    # which is slow and does not write through under pandas copy-on-write.
    # A token with any upper-case character counts (this already covers
    # fully upper-case tokens).
    df_words['nb_entity'] = [
        sum(1 for word in words if any(char.isupper() for char in word))
        for words in article_vocab
    ]

    return df_words, feature_names
    

def get_Y_train():
    """Load the training labels from ``../data/labels_training.txt``.

    The file is read as comma-separated text. The first line is skipped as
    a header, the first field of each remaining line is the article id and
    the second field is the label.

    Returns
    -------
    pandas.DataFrame
        One integer column ``fake_news`` indexed by the integer article id,
        in file order.

    Raises
    ------
    ValueError
        If an id or a label cannot be parsed as an integer.
    """
    # The context manager closes the file once it has been read.
    with open('../data/labels_training.txt', 'r') as handle:
        # Skip the header, and ignore blank lines instead of failing on them.
        rows = [line.split(',') for line in handle.readlines()[1:] if line.strip()]
    id_articles = [int(row[0]) for row in rows]
    # ``int`` ignores surrounding whitespace, so the label needs no manual
    # trimming. The previous ``[:-1]`` removed the label itself when the
    # last line had no trailing newline.
    labels = [int(row[1]) for row in rows]
    return pd.DataFrame(labels, columns=['fake_news'], index=id_articles)

def rmse(y_true, y_pred):
    """Root-mean-squared error between two equally shaped sets of values.

    Implemented with NumPy because the notebook never imports
    ``sklearn.metrics``; the previous ``metrics.mean_squared_error`` call
    raised ``NameError``.

    Parameters
    ----------
    y_true, y_pred : array-like of numbers
        Reference and predicted values. A single-column 2-D input (for
        example a one-column DataFrame) is treated like a 1-D sequence.

    Returns
    -------
    float
        Square root of the mean of the squared differences.

    Raises
    ------
    ValueError
        If the inputs are empty or their shapes differ.
    """
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    # Flatten (n, 1) column vectors so they compare against 1-D input.
    if actual.ndim == 2 and actual.shape[1] == 1:
        actual = actual.ravel()
    if predicted.ndim == 2 and predicted.shape[1] == 1:
        predicted = predicted.ravel()
    # Check explicitly: NumPy broadcasting would otherwise accept some
    # mismatched shapes silently.
    if actual.shape != predicted.shape:
        raise ValueError('y_true and y_pred must have the same shape, got %s and %s'
                         % (actual.shape, predicted.shape))
    if actual.size == 0:
        raise ValueError('rmse is undefined for empty input')
    return float(np.sqrt(np.mean((actual - predicted) ** 2)))

### Get articles for training and testing

In [None]:
# Read both splits from disk, training first, then test.
# Each result maps an article id to the raw text of that article.
rough_train, rough_test = map(get_articles, ('training', 'test'))

### Transform to dataframe

In [None]:
# Turn each {article id: text} mapping into a one-column frame ('content')
# whose index is the article id.
train, test = (
    pd.DataFrame.from_dict(raw_articles, orient='index', columns=['content'])
    for raw_articles in (rough_train, rough_test)
)

### Feature engineering

In [None]:
# The training call learns the bag-of-words vocabulary.
df_train, vocab = get_features(df=train)
# The test call is given that vocabulary instead of learning its own.
df_test, vocab = get_features(df=test, vocab=vocab)

In [None]:
# Preview the last five rows of the training feature table.
df_train.tail(n=5)

### Some graphs

In [None]:
def plot_feature(frame, column, figure_id, title, y_label):
    """Plot one column of ``frame`` against its index as dots, in its own figure.

    Parameters
    ----------
    frame : pandas.DataFrame
        Feature table; its index is used as the x axis.
    column : str
        Name of the column to plot.
    figure_id : int
        Matplotlib figure number to draw into.
    title, y_label : str
        Figure title and y-axis label.
    """
    plt.figure(figure_id)
    plt.plot(frame[column], '.')
    plt.title(title)
    plt.xlabel('Article id')
    plt.ylabel(y_label)


# One call per feature instead of two copy-pasted plotting blocks.
plot_feature(df_train, 'len_content', 0,
             'Number of caracters per article', 'Nb caracters')
plot_feature(df_train, 'nb_relevant', 1,
             'Number of relevant words per article', 'Nb relevants words')
# Render explicitly so the cell does not end on the repr of a Text object.
plt.show()

### Best parameters (pipeline)

In [None]:
""" TfIdfVectorizer parameter"""
# Search space for the TfidfVectorizer step.
max_dfs = [0.75]
min_dfs = [7]
ngrams_range = [(1, 1), (1, 2)]


# Search space for the random forest step.
# Number of trees in the forest.
n_estimators = [int(x) for x in np.linspace(start=400, stop=1600, num=4)]
# Number of features to consider at every split. 'auto' was dropped: for a
# classifier it is the same setting as 'sqrt', so it only doubled the search,
# and it is no longer accepted by recent scikit-learn releases.
max_features = ['sqrt']
# Maximum number of levels in a tree; None removes the depth limit.
max_depth = [int(x) for x in np.linspace(25, 100, num=4)] + [None]
# Minimum number of samples required to split a node.
min_samples_split = [2]
# Minimum number of samples required at each leaf node.
min_samples_leaf = [2]


pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english')),
    # Fixed seed so that re-running the search gives the same ranking.
    ('clf', RandomForestClassifier(random_state=42)),
])
parameters = {
    'tfidf__max_df': max_dfs,
    'tfidf__min_df': min_dfs,
    'tfidf__ngram_range': ngrams_range,
    'clf__n_estimators': n_estimators,
    'clf__max_features': max_features,
    'clf__max_depth': max_depth,
    'clf__min_samples_split': min_samples_split,
    'clf__min_samples_leaf': min_samples_leaf
}

# Texts and labels are both ordered by article id so that row i of one
# corresponds to row i of the other.
search_texts = train.sort_index()['content']
# Select the label column so that y is 1-D rather than a one-column frame.
search_labels = get_Y_train().sort_index()['fake_news']

grid_search_tune = GridSearchCV(pipeline, parameters, cv=5, n_jobs=2, verbose=3)
grid_search_tune.fit(search_texts, search_labels)

print(grid_search_tune.best_estimator_.steps)

### It's prediction time!

In [None]:
# Order features and labels by article id so that row i of X_train
# corresponds to row i of Y_train.
X_train = df_train.sort_index()
Y_train = get_Y_train().sort_index()
# Fail fast when the two tables do not cover exactly the same articles:
# with equal lengths but different ids the model would be trained on
# mismatched labels without any error.
assert X_train.index.equals(Y_train.index), \
    'training features and labels do not cover the same article ids'
X_test = df_test

In [None]:
# Distribution of the real training labels.
# The labels are integer classes, so one bar per class is drawn.
# sns.distplot is deprecated and removed from recent seaborn releases.
ax = sns.countplot(x='fake_news', data=Y_train)
ax.set_title('Training label distribution')
ax.set_ylabel('Number of articles')
plt.show()

### Cross validation

In [None]:
# Mean 10-fold cross-validation score of a random forest with a fixed
# configuration.
# NOTE(review): the original comment calls these the "best parameters", but
# several values (min_samples_leaf=1, max_depth=70) are not part of the grid
# searched in this notebook; confirm where they come from.
# max_features='sqrt' replaces 'auto': for a classifier both mean the same
# thing, and 'auto' is no longer accepted by recent scikit-learn releases.
# random_state makes the score reproducible across runs.
clf = RandomForestClassifier(n_estimators=1600, min_samples_split=2, min_samples_leaf=1,
                             max_features='sqrt', max_depth=70, bootstrap=False,
                             random_state=42)
# np.ravel flattens the one-column label frame into the 1-D target expected
# for y.
np.mean(cross_val_score(clf, X_train, np.ravel(Y_train), cv=10))

### Real prediction

In [None]:
# Final model: fitted on the whole training set, then applied to the test
# features.
# NOTE(review): these hyper-parameters differ from the configuration scored
# in the cross-validation cell, so that score does not describe this model;
# confirm which configuration is intended.
# random_state makes the submitted predictions reproducible.
clf = RandomForestClassifier(n_estimators=1000, min_samples_split=5, min_samples_leaf=2,
                             max_features='sqrt', max_depth=100, bootstrap=False,
                             random_state=42)
# np.ravel flattens the one-column label frame into the 1-D target expected
# for y.
clf.fit(X_train, np.ravel(Y_train))
Y_test = clf.predict(X_test)

### Create a submission file

In [None]:
# Timestamp without spaces or colons: the previous 'YYYY-MM-DD HH:MM:SS'
# form is not a valid file name on every platform.
now = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
submission_path = '../data/submissions/bogota_mates_submission_' + now + '.txt'
with open(submission_path, 'w') as f:
    f.write('doc,class\n')
    # Y_test follows the row order of X_test, so zipping pairs every
    # prediction with the id of its article.
    f.writelines('{},{}\n'.format(doc, value) for value, doc in zip(Y_test, X_test.index))

## NETWORK

In [None]:
def read_whitespace_table(path, columns):
    """Read a whitespace-separated text file into a DataFrame.

    Parameters
    ----------
    path : str
        File to read.
    columns : list of str
        Column names, one per field of a line.

    Returns
    -------
    pandas.DataFrame
        One row per non-blank line; every value is kept as a string.
    """
    # The context manager closes the file once it has been read.
    with open(path, 'r') as handle:
        # Blank lines carry no fields and are skipped.
        rows = [line.split() for line in handle if line.strip()]
    return pd.DataFrame(rows, columns=columns)


nb_news = read_whitespace_table('../data/newsUser.txt', ['id_article', 'id_user', 'weight'])
follows = read_whitespace_table('../data/UserUser.txt', ['follower', 'followed'])

In [None]:
# Peek at the article id stored in the first row (index label 0).
nb_news.loc[0, 'id_article']

In [None]:
# Undirected graph with one edge per (article, user) row of nb_news.
# NOTE(review): article ids and user ids are used as node keys as they are.
# If the two id spaces overlap, an article and a user sharing an id become
# one node; confirm the ids are disjoint.
g = nx.Graph()
# Each edge carries the weight of its own row, converted to a number. The
# previous loop attached the whole 'weight' column to every edge.
g.add_weighted_edges_from(
    (article, user, float(weight))
    for article, user, weight in zip(nb_news['id_article'], nb_news['id_user'], nb_news['weight'])
)

In [None]:
# Degree of every node of g, in the graph's node order.
degree = pd.Series([node_degree for _node, node_degree in g.degree])
# Plain count histogram with 20 bins. sns.distplot is deprecated and
# removed from recent seaborn releases.
ax = degree.plot.hist(bins=20)
ax.set(title='Node degree distribution', xlabel='Degree', ylabel='Number of nodes')
plt.show()

In [None]:
# Degree centrality of the nodes of g, as computed by networkx.
degree_centrality = nx.degree_centrality(g)

In [None]:
# Show only the ten most central nodes instead of echoing the whole
# dictionary, which has one entry per node of the graph.
pd.Series(degree_centrality, name='degree_centrality').sort_values(ascending=False).head(10)