In [1]:
import json
import requests
import pandas as pd
import numpy as np
import networkx as nx
import nxpd
import matplotlib.pyplot as plt
import matplotlib
import pyspark
import pyspark.sql.functions as F
from pyspark.sql.types import *
from pyspark.sql import dataframe
import graphframes


from src.data_reader import DataReader, RetractionFinder
from src.to_gexf import to_gexf
from src.get_redacted import get_paper, load_redacted, get_doi, gen_retracted
import src.load_data as load_data

In [None]:
# Load the raw article records from the on-disk dataset directory.
# limit=-1 is handed to the project loader as-is (presumably "no row cap" — confirm in src.load_data).
df_full = load_data.load_dataframe(source='data/retracted_articles', limit=-1)
# Derive the modelling frame from the raw one; later cells read df['retracted'] as the target column.
df = load_data.format_dataframe(df_full)

# EDA

In [None]:
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import log_loss, roc_curve
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

In [None]:
# Features are every column except the target. The text columns stay in X
# because later cells read X0['paperAbstract'] and X0['title'] from this split.
# NOTE(review): the tree models are fit on X0 as-is; if df carries text columns
# they must be restricted to the numeric features first — confirm against
# load_data.format_dataframe.
X, y = df.drop('retracted', axis=1), df['retracted']

# Seed the shuffle (same seed the models use) so the hold-out set, and every
# metric computed on it, is reproducible across kernel restarts.
X0, Xt, y0, yt = train_test_split(X, y, random_state=5476)

In [None]:
# Two shallow 1000-tree ensembles sharing one seed. fit() returns the estimator
# itself, so the forest is constructed and trained in a single expression.
rf = RandomForestClassifier(
    n_estimators=1000,
    max_depth=2,
    max_features='sqrt',
    criterion='entropy',
    n_jobs=-1,
    random_state=5476,
).fit(X0, y0)

gbc = GradientBoostingClassifier(
    n_estimators=1000,
    max_depth=2,
    max_features='sqrt',
    random_state=5476,
)
# Kept as the final expression so the fitted booster is echoed as the cell output.
gbc.fit(X0, y0)

In [None]:
# Hold-out log loss for each ensemble, forest first, then the booster.
for fitted_model in (rf, gbc):
    positive_proba = fitted_model.predict_proba(Xt)[:, 1]
    print(log_loss(y_true=yt, y_pred=positive_proba))

In [None]:
# Positive-class scores of the boosted model on the hold-out set drive the ROC curve.
gbc_scores = gbc.predict_proba(Xt)[:, 1]
fpr, tpr, thr = roc_curve(y_true=yt, y_score=gbc_scores)

fig, axes = plt.subplots(1, 2, figsize=(15, 7))
ax0, ax1 = axes
# Left panel: ROC curve. Right panel: one bar per column of X, sized by the
# booster's feature importances.
ax0.plot(fpr, tpr)
ax1.bar(x=X.columns, height=gbc.feature_importances_)

In [None]:
def plot_proportions(X, y, lbl):
    """Return the distinct values of X[lbl] and, for each, sum(y)/count(y).

    Despite the name nothing is drawn: the result is a pair of parallel lists
    ``(values, ratios)`` meant to be unpacked into a plotting call. ``values``
    are the sorted unique entries of ``X[lbl]``; ``ratios[i]`` is the sum of
    the ``y`` entries whose row has ``X[lbl] == values[i]`` divided by the
    number of such rows.
    """
    column = X[lbl]
    levels = list(np.unique(column))

    def share(level):
        # Rows of y that belong to this level of the feature.
        subset = y[column == level]
        return subset.sum() / len(subset)

    return levels, [share(level) for level in levels]

In [None]:
# One panel per numeric feature: the 0/1 outcome with a little vertical jitter
# as a faint scatter, and the per-value ratio from plot_proportions drawn on top.
# The second entry caps the x-axis; None keeps Matplotlib's automatic limits.
panels = [
    ('year', None),
    ('numAuthors', 50),
    ('numEntities', 50),
    ('numInCitations', 50),
    ('pageLength', 50),
]

# Build the grid once with integer dimensions. String position specs such as
# '231' are rejected by current Matplotlib, and creating the axes here avoids
# leaving a stray full-figure axes behind the panels.
fig, axes = plt.subplots(2, 3, figsize=(20, 6))
flat_axes = axes.ravel()

for ax, (feature, x_max) in zip(flat_axes, panels):
    jitter = np.random.normal(loc=0, scale=0.05, size=len(y))
    ax.scatter(X[feature], y + jitter, alpha=0.02)
    ax.plot(*plot_proportions(X, y, feature), c='orange')
    if x_max is not None:
        ax.set_xlim(0, x_max)
    ax.set_title(feature)

# The 2x3 grid has one more slot than there are features; hide the spare axes.
for spare in flat_axes[len(panels):]:
    spare.set_visible(False)


In [None]:
# Boolean flag per article: does the abstract mention a p-value in any of its
# usual spellings (case-insensitive substring match)?
# Non-string entries (e.g. a missing abstract stored as None/NaN) are flagged
# False instead of raising on .lower().
has_p_value = df_full['paperAbstract'].apply(
    lambda x: isinstance(x, str)
    and any(p in x.lower() for p in ('pvalue', 'p-value', 'p value'))
)

In [None]:
# Retracted articles whose abstract mentions a p-value: count next to the
# number of retracted articles, then their ratio, then the standard deviation
# of the combined indicator.
retracted_with_p = has_p_value & df['retracted']
n_retracted_hits = len(df_full['paperAbstract'][retracted_with_p])
n_retracted = sum(df['retracted'])

print(n_retracted_hits, n_retracted)
print(n_retracted_hits/n_retracted)
print(np.std([retracted_with_p]))

In [None]:
# Same three figures for the articles that were NOT retracted.
not_retracted = ~df['retracted']
kept_with_p = has_p_value & not_retracted
n_kept_hits = len(df_full['paperAbstract'][kept_with_p])
n_kept = sum(not_retracted)

print(n_kept_hits, n_kept)
print(n_kept_hits/n_kept)
print(np.std([kept_with_p]))

In [None]:
# Show the column index of the raw frame as the cell output.
df_full.columns

In [None]:
# Text-only baseline on abstracts: the TF-IDF vocabulary is learned on the
# training split only, then both splits are projected into that space.
abstract_tfidf = TfidfVectorizer().fit(X0['paperAbstract'])
abstract_train, abstract_test = (
    abstract_tfidf.transform(split['paperAbstract']) for split in (X0, Xt)
)

# Multinomial naive Bayes on the TF-IDF features; the hold-out log loss of the
# positive-class probability is the cell's displayed value.
abstract_bayes = MultinomialNB().fit(abstract_train, y0)
abstract_test_proba = abstract_bayes.predict_proba(abstract_test)[:, 1]
log_loss(y_true=yt, y_pred=abstract_test_proba)

In [None]:
# Text-only baseline on titles: the TF-IDF vocabulary is learned on the
# training split only, then both splits are projected into that space.
title_tfidf = TfidfVectorizer().fit(X0['title'])
title_train, title_test = (
    title_tfidf.transform(split['title']) for split in (X0, Xt)
)

# Multinomial naive Bayes on the TF-IDF features; the hold-out log loss of the
# positive-class probability is the cell's displayed value.
title_bayes = MultinomialNB().fit(title_train, y0)
title_test_proba = title_bayes.predict_proba(title_test)[:, 1]
log_loss(y_true=yt, y_pred=title_test_proba)

# Export to Graph File

In [2]:
# Point the project's DataReader at the same dataset directory the model frame was loaded from.
reader = DataReader('data/retracted_articles')
# Run the reader's write step with lim=10000 and dynamic=True
# (presumably a 10,000-record cap and time-aware output — confirm in src.data_reader).
reader.write(lim=10000, dynamic=True)

In [3]:
# `spark` is only predefined by PySpark-flavoured kernels. Fetch the active
# session, or start one, so this cell also runs after Restart & Run All on a
# plain Python kernel. getOrCreate() returns the existing session when present.
# NOTE(review): GraphFrames presumably needs its JVM package configured on the
# session — confirm the launch configuration if a new session is created here.
spark = pyspark.sql.SparkSession.builder.getOrCreate()

# Edge list from the reader: source id, destination id and year, all read as
# strings, plus a generated unique `id` column per edge.
edges = spark.createDataFrame(
    reader.read_edges(),
    schema=StructType([
        StructField(name='src', dataType=StringType()),
        StructField(name='dst', dataType=StringType()),
        StructField(name='year', dataType=StringType()),
    ]),
).withColumn('id', F.monotonically_increasing_id())

# Vertex list from the reader: id, year and authors, all read as strings.
nodes = spark.createDataFrame(
    reader.read_nodes(),
    schema=StructType([
        StructField(name='id', dataType=StringType()),
        StructField(name='year', dataType=StringType()),
        StructField(name='authors', dataType=StringType()),
    ]),
)

In [4]:
# Assemble a GraphFrame from the Spark edge frame (e) and vertex frame (v).
g = graphframes.GraphFrame(e=edges, v=nodes)
# Hand the graph to the project's GEXF exporter, targeting data/graph.gexf with dynamic=True.
to_gexf(g, 'data/graph.gexf', dynamic=True)

In [37]:
# Read one gzip-compressed, newline-delimited JSON shard of the corpus.
# pandas decompresses the archive itself, so the cell no longer depends on a
# file-opening helper that is never imported in this notebook. lines=True is
# the documented boolean switch for one-JSON-object-per-line input.
gz = pd.read_json('data/zipped/s2-corpus-00.gz', lines=True, compression='gzip')

In [53]:
# Debugging aid: display every name currently bound in the notebook's global namespace.
globals().keys()

