In [None]:
import com_func
import pandas as pd

import warnings

# Name of the dataset; used to build the input paths under ../Data/ and the
# output path under ../result/.
Dataset = "pubmed"

# Silence FutureWarning messages for the rest of the session.
warnings.simplefilter(action='ignore', category=FutureWarning)

# parameters
# Authors with fewer than `threshold` labeled papers in a name group are
# dropped before the one-vs-rest classification.
threshold = 30
# Passed as `min_df` to the text vectorizers (minimum document frequency of a term).
cutoff = 3

# Textual embeddings to evaluate; one full experiment run (and one result
# file) is produced per entry.
pp_textual_emb_type = ["tf_idf","lsa"]

In [None]:
# load text information
# NOTE: `Dataset` comes from the configuration cell; it is deliberately not
# re-assigned here so that changing it in one place switches the whole notebook.
raw_filepath = "../Data/"+Dataset+"/id_textual_combined.txt"
all_text_content = []
with open(raw_filepath, 'r', encoding = 'utf8') as f:
    # tab-separated columns:
    # items[0] paper ID, items[1] title, items[2] keywords, items[3] mesh, items[4] abstract
    for line_number, line in enumerate(f, start=1):
        # drop the line terminator so it does not end up glued to the last field
        items = line.rstrip("\n").split("\t")
        if len(items) < 5:
            # malformed record: report it instead of crashing with an IndexError
            print("Skipping malformed line ", line_number, ": found ", len(items), " fields")
            continue
        paperID = items[0]
        # lower case all character
        title = items[1].lower()
        keywords = items[2].lower()
        mesh = items[3].lower()
        abstract = items[4].lower()
        # textual information can be defined as all feature combined
        content = title+" "+keywords+" "+mesh+" "+abstract
        all_text_content.append({"paperID": paperID, "combine_textual": content})
print("Total ", len(all_text_content), " paper have text information")
# convert to dataframe so it's easy to process; the explicit column list keeps
# the schema stable even when no record was read
all_text_content = pd.DataFrame(all_text_content, columns=["paperID", "combine_textual"])

In [None]:
def dummy(doc):
    """Return ``doc`` unchanged.

    Passed as both ``tokenizer`` and ``preprocessor`` to the scikit-learn
    vectorizers in this notebook so that they perform no text processing of
    their own on the documents they are given.
    """
    return doc
def read_labeled_file(infile):
    """Read one labeled, tab-separated file into a DataFrame.

    Only lines with exactly 12 or 13 tab-separated fields are kept; for every
    other line the number of fields found is printed and the line is skipped.

    Parameters
    ----------
    infile : str
        Path of the labeled file (read as UTF-8).

    Returns
    -------
    pandas.DataFrame
        One row per accepted line with the string columns ``paperID``
        (field 0), ``authorID`` (field 1), ``co-author`` (field 5) and
        ``venue_id`` (field 7). The columns are present even when no line
        was accepted, so callers can always select or merge on them.
    """
    columns = ["paperID", "authorID", "co-author", "venue_id"]
    LabeledRecords_original = []
    # the context manager closes the file; no explicit close() is needed
    with open(infile, 'r', encoding = 'utf8') as f:
        for line in f:
            read_data = line.split("\t")
            # get rid of badly formatted lines
            if len(read_data) in (12, 13):
                paper_detail = {"paperID": read_data[0], "authorID": read_data[1],
                                "co-author": read_data[5], "venue_id": read_data[7]}
                LabeledRecords_original.append(paper_detail)
            else:
                print(len(read_data))
    return pd.DataFrame(LabeledRecords_original, columns=columns)

In [None]:
# remove author(positive sample) from other(negative sample)
import random
def extractNegativeSample(positiveSample, allSample):
    """Return the items of ``allSample`` that do not occur in ``positiveSample``.

    The order (and any duplicates) of ``allSample`` is preserved.

    Parameters
    ----------
    positiveSample : iterable
        Items to exclude.
    allSample : iterable
        Candidate items.

    Returns
    -------
    list
        Items of ``allSample`` not contained in ``positiveSample``.
    """
    try:
        # constant-time membership tests instead of scanning a list per item
        positive_lookup = set(positiveSample)
    except TypeError:
        # unhashable items cannot go into a set; fall back to a linear scan
        positive_lookup = list(positiveSample)
    negativeSample = [x for x in allSample if x not in positive_lookup]
    return negativeSample

In [None]:
def LSA(cleaned_token, dim=100):
    """Embed documents with TF-IDF followed by truncated SVD.

    Parameters
    ----------
    cleaned_token : iterable
        Documents to embed. They are handed to ``TfidfVectorizer`` with the
        identity function ``dummy`` as tokenizer and preprocessor, so each
        document is presumably already a sequence of tokens.
    dim : int, default 100
        Requested number of SVD components. When the TF-IDF vocabulary has
        fewer than ``dim`` terms, the number of components is reduced to
        ``vocabulary size - 1`` (but never below 1).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_documents, n_components)``.
    """
    # Tf-idf Transformation (terms occurring in fewer than `cutoff` documents are dropped)
    tfidf_vectorizer = TfidfVectorizer(use_idf=True, smooth_idf=True, sublinear_tf=True, tokenizer=dummy,
                                       preprocessor=dummy, stop_words = None, min_df=cutoff)
    tfidfMatrix = tfidf_vectorizer.fit_transform(cleaned_token).toarray()
    n_features = tfidfMatrix.shape[1]
    if n_features < dim:
        # a one-term vocabulary would otherwise request 0 components, which
        # TruncatedSVD cannot compute
        dim = max(1, n_features - 1)
    # tf-idf + svd
    svd = TruncatedSVD(n_components=dim)
    final_lsa_Matrix = svd.fit_transform(tfidfMatrix)
    return final_lsa_Matrix

In [None]:
from sklearn.preprocessing import normalize
# document relation wrt textual content
# convert raw text to numerical feature vectors
# bow(Bags of words) are used with uni-gram setting
def raw_text_to_vector(raw_textual_content, emb_type="off", stopword=True):
    """Turn raw text into numerical feature vectors.

    The text is first cleaned and tokenised with
    ``com_func.clean_batch_of_raw``; the smallest and largest token counts
    are printed.

    Parameters
    ----------
    raw_textual_content : iterable
        Raw documents, forwarded to ``com_func.clean_batch_of_raw``.
    emb_type : str, default "off"
        ``"tf_idf"`` (dense TF-IDF matrix), ``"tf"`` (normalised term
        counts), ``"lsa"`` (``LSA`` with 100 dimensions) or ``"off"`` (empty
        DataFrame). Any other value prints a notice and behaves like ``"off"``.
    stopword : bool, default True
        Forwarded to ``com_func.clean_batch_of_raw``.

    Returns
    -------
    tuple
        ``(result_vector, average_token_size)`` where ``average_token_size``
        is the mean of the per-document token counts.
    """
    tokens, token_counts = com_func.clean_batch_of_raw(raw_textual_content, stopword=stopword)
    mean_token_count = sum(token_counts)/len(token_counts)
    print("Minimal token size: ", min(token_counts))
    print("maximal token size: ", max(token_counts))

    # unknown embedding names fall back to the default ("off")
    if emb_type not in ("tf_idf", "tf", "lsa", "off"):
        print("Embedding type not available, selecting default setting")
        emb_type = "off"

    if emb_type == "tf_idf":
        # using tf-idf
        vectorizer = TfidfVectorizer(use_idf=True, smooth_idf=True, sublinear_tf=True,
                                     tokenizer=dummy, preprocessor=dummy,
                                     stop_words=None, min_df=cutoff)
        print(vectorizer)
        result_vector = vectorizer.fit_transform(tokens).toarray()
    elif emb_type == "tf":
        # Document-Term frequency matrix, normalised per document
        vectorizer = CountVectorizer(tokenizer=dummy, preprocessor=dummy, min_df=cutoff)
        term_counts = vectorizer.fit_transform(tokens).toarray()
        result_vector = normalize(term_counts)
    elif emb_type == "lsa":
        # use lsa
        result_vector = LSA(tokens, dim=100)
    else:
        # "off": no textual embedding
        result_vector = pd.DataFrame()
    return result_vector, mean_token_count

In [None]:
from sklearn.model_selection import KFold
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score,f1_score,accuracy_score
# cross validation
def k_fold_cv(data, label, clf, k=10):
    """Evaluate ``clf`` with unshuffled k-fold cross validation.

    For every fold the classifier is fitted on the training part and used to
    predict the held-out part; the predictions of all folds are pooled and
    scored once. A classification report and the flattened confusion matrix
    of the pooled predictions are printed.

    Parameters
    ----------
    data : array-like
        Feature matrix, one row per sample.
    label : array-like
        Target labels, one per row of ``data``.
    clf : estimator
        Object providing ``fit(X, y)`` and ``predict(X)``.
    k : int, default 10
        Number of folds.

    Returns
    -------
    tuple
        ``(accuracy, macro_f1)`` computed over the pooled predictions.
    """
    # KFold yields positional indices, but [] on a pandas Series is label based
    # and on a DataFrame selects columns; work on the underlying arrays instead
    if hasattr(data, "iloc"):
        data = data.values
    if hasattr(label, "iloc"):
        label = label.values

    kf = KFold(n_splits=k, shuffle=False)
    allTrueLabel = []
    allPredLabel = []
    for train_index, test_index in kf.split(data):
        # split train and test
        data_train, data_test = data[train_index], data[test_index]
        label_train, label_test = label[train_index], label[test_index]
        # fit data to clf
        clf.fit(data_train, label_train)
        # get predicted label
        label_pred = clf.predict(data_test)
        allTrueLabel.extend(label_test)
        allPredLabel.extend(label_pred)

    accuracy = accuracy_score(allTrueLabel, allPredLabel)
    f1 = f1_score(allTrueLabel, allPredLabel,average='macro')

    print(metrics.classification_report(allTrueLabel, allPredLabel))
    print(metrics.confusion_matrix(allTrueLabel, allPredLabel).ravel())

    return accuracy, f1

In [None]:
# load the file
import sys
import io
import os
import collections
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

# fix random seed for reproducibility
np.random.seed(1)

fileDir = "../Data/"+Dataset+"/canopies_labeled/"
# os.listdir() returns entries in arbitrary order; sorting keeps the sequence
# of seeded shuffles and the row order of the result file stable between runs
listfiles = sorted(os.listdir(fileDir))

# One complete experiment per embedding type: for every name group (one file),
# every author with at least `threshold` papers is classified one-vs-rest
# against the remaining papers of the same name group.
for embedding in pp_textual_emb_type:
    # collect statistic to output
    allname = []
    average_token_size = []
    positive_sample_size = []
    negative_sample_size = []

    all_LR_accuracy = []
    all_LR_f1 = []
    all_svcLinear_accuracy = []
    all_svcLinear_f1 = []

    # read all file in labeled group
    for file in listfiles:
        # group name
        name_parts = file.split("_")
        name = name_parts[1]+"_"+name_parts[-1]
        print("For name: ",name)
        # read needed content in labeled file
        labeled_data = read_labeled_file(fileDir+file)
        if labeled_data.empty:
            # no usable record in this file, nothing to classify
            print(name," pass")
            continue
        # merge textual from all raw data to labeled dataset
        labeled_data = pd.merge(left=labeled_data,right=all_text_content, how='left', left_on='paperID', right_on='paperID')
        # collect all labeled sample
        all_labeled_sample = labeled_data["paperID"].tolist()
        print("total sample size before apply threshold: ",len(labeled_data))
        # count number of paper each author write based on author ID
        paperCounter = collections.Counter(labeled_data["authorID"])
        print(paperCounter)
        # remove authors that wrote fewer than `threshold` papers
        for k in list(paperCounter):
            if paperCounter[k] < threshold:
                del paperCounter[k]
        # a list (not a set) so that authors are visited in a deterministic order
        author_list = list(paperCounter.keys())
        print(author_list)
        # if only have one class or no class pass the threshold, not applicable
        if len(author_list) < 2:
            print(name," pass")
            continue

        counter = 0
        # loop through each author have label, one vs rest
        for author in author_list:
            author_name = name+'_'+str(counter)
            print(author_name)
            positive_sample_pid = labeled_data.loc[labeled_data["authorID"] == author, "paperID"].tolist()
            negative_sample_pid = extractNegativeSample(positive_sample_pid, all_labeled_sample)
            # form positive and negative (negative class come from similar name group);
            # label 0 = papers of this author, label 1 = every other paper
            appended_data = [pd.DataFrame({"paperID": sample_pid}).assign(label=sample_label)
                             for sample_label, sample_pid in enumerate([positive_sample_pid, negative_sample_pid])]
            processed_data = pd.concat(appended_data, axis=0, ignore_index=True)
            # shuffle the data
            processed_data = processed_data.sample(frac=1).reset_index(drop=True)

            # alignment: attach the text to the shuffled samples. The result is kept
            # in its own variable so that `labeled_data` stays intact for the next
            # author of this name group.
            aligned_data = pd.merge(processed_data, labeled_data, on="paperID")
            # take labels and ids from the aligned frame so that they always match
            # the feature rows one-to-one
            label = aligned_data["label"]
            pid = aligned_data["paperID"]

            # select feature wanted to fit to clustering/classification algorithm
            data_textual, data_token_size = raw_text_to_vector(aligned_data["combine_textual"], emb_type=embedding)
            print(data_textual.shape)
            # list of different data field
            part_collection = [data_textual]
            # merge different part of data data together by concatenate it all together
            # remove empty emb (when emb set off)
            part_collection = [part for part in part_collection if len(part)!=0]
            print(len(part_collection))
            if len(part_collection)>1:
                combinedata = np.concatenate(part_collection,axis=1)
            elif len(part_collection)==1:
                if isinstance(part_collection[0], pd.DataFrame):
                    combinedata = part_collection[0].values
                else:
                    combinedata = part_collection[0]
            else:
                # nothing has been recorded for this author yet, so leaving the
                # loop here keeps all statistic lists the same length
                print("No data available")
                break
            print(combinedata.shape)
            # using converted feature vector to train classifier
            # using logistic regression
            clf = LogisticRegression(class_weight="balanced")
            LRaccuracy, LRmarcof1 = k_fold_cv(combinedata, label, clf, k=10)
            print("LR Accuracy: ",LRaccuracy)
            print("LR F1: ", LRmarcof1)
            # using SVM with linear kernal
            clf = SVC(kernel='linear',class_weight="balanced")
            svcaccuracy, svcmarcof1 = k_fold_cv(combinedata, label, clf, k=10)
            print("svc Accuracy: ",svcaccuracy)
            print("svc F1: ", svcmarcof1)

            # append to statistic collection only once the author was fully
            # evaluated, so every output column has one entry per author
            allname.append(author_name)
            average_token_size.append(data_token_size)
            positive_sample_size.append(len(positive_sample_pid))
            negative_sample_size.append(len(negative_sample_pid))
            all_LR_accuracy.append(LRaccuracy)
            all_LR_f1.append(LRmarcof1)
            all_svcLinear_accuracy.append(svcaccuracy)
            all_svcLinear_f1.append(svcmarcof1)
            counter+=1
    # write evaluation result to csv
    output = pd.DataFrame({'Author Name':allname, "sample average token":average_token_size,
                           "positive sample size":positive_sample_size,"negative sample size":negative_sample_size, 
                           "svc(linear) accuracy":all_svcLinear_accuracy, "svc(linear) f1": all_svcLinear_f1, 
                           "logistic regression accuracy":all_LR_accuracy, "logistic regression f1": all_LR_f1})

    savePath = "../result/"+Dataset+"/binary_local_emb/"
    os.makedirs(savePath, exist_ok=True)
    filename = "textual="+embedding+"_threshold="+str(threshold)+".csv"
    output.to_csv(savePath+filename, encoding='utf-8',index=False)

Minimal token size:  22
maximal token size:  302
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function dummy at 0x7f5865d60e18>, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function dummy at 0x7f5865d60e18>, use_idf=True,
        vocabulary=None)
(148, 1295)
1
(148, 1295)
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        40
           1       1.00      1.00      1.00       108

   micro avg       1.00      1.00      1.00       148
   macro avg       1.00      1.00      1.00       148
weighted avg       1.00      1.00      1.00       148

[ 40   0   0 108]
LR Accuracy:  1.0
LR F1:  1.0
              precision    r

              precision    recall  f1-score   support

           0       0.96      0.96      0.96        53
           1       0.99      0.99      0.99       150

   micro avg       0.98      0.98      0.98       203
   macro avg       0.97      0.97      0.97       203
weighted avg       0.98      0.98      0.98       203

[ 51   2   2 148]
svc Accuracy:  0.9802955665024631
svc F1:  0.9744654088050315
For name:  j_sampaio
total sample size before apply threshold:  117
Counter({'0000-0003-2335-9991': 61, '0000-0001-8145-5274': 48, '0000-0003-4359-493X': 5, '0000-0002-0460-3664': 3})
['0000-0003-2335-9991', '0000-0001-8145-5274']
j_sampaio_0
Minimal token size:  7
maximal token size:  233
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function dummy at 0x7f5865d60e18

              precision    recall  f1-score   support

           0       1.00      0.90      0.95        30
           1       0.98      1.00      0.99       120

   micro avg       0.98      0.98      0.98       150
   macro avg       0.99      0.95      0.97       150
weighted avg       0.98      0.98      0.98       150

[ 27   3   0 120]
svc Accuracy:  0.98
svc F1:  0.967511371020143
For name:  j_alexander
total sample size before apply threshold:  31
Counter({'0000-0001-6783-4382': 11, '0000-0003-2226-7913': 10, '0000-0002-2258-5738': 5, '0000-0001-9797-6322': 2, '0000-0002-6492-1621': 2, '0000-0001-7734-9428': 1})
[]
j_alexander  pass
For name:  j_schneider
total sample size before apply threshold:  40
Counter({'0000-0001-8016-8687': 13, '0000-0002-6028-9956': 7, '0000-0003-1114-618X': 5, '0000-0001-7169-3973': 5, '0000-0003-1176-8309': 3, '0000-0001-5187-6756': 3, '0000-0002-5863-7747': 1, '0000-0001-6093-5404': 1, '0000-0001-5556-0919': 1, '0000-0001-9610-6501': 1})
[]
j_schne

              precision    recall  f1-score   support

           0       0.97      0.86      0.91        35
           1       0.97      0.99      0.98       142

   micro avg       0.97      0.97      0.97       177
   macro avg       0.97      0.93      0.94       177
weighted avg       0.97      0.97      0.97       177

[ 30   5   1 141]
LR Accuracy:  0.9661016949152542
LR F1:  0.9441287878787878
              precision    recall  f1-score   support

           0       1.00      0.86      0.92        35
           1       0.97      1.00      0.98       142

   micro avg       0.97      0.97      0.97       177
   macro avg       0.98      0.93      0.95       177
weighted avg       0.97      0.97      0.97       177

[ 30   5   0 142]
svc Accuracy:  0.9717514124293786
svc F1:  0.9528879425073196
m_lewis_2
Minimal token size:  11
maximal token size:  345
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', in

              precision    recall  f1-score   support

           0       0.96      0.95      0.96        57
           1       0.97      0.98      0.97        85

   micro avg       0.96      0.96      0.96       142
   macro avg       0.96      0.96      0.96       142
weighted avg       0.96      0.96      0.96       142

[54  3  2 83]
svc Accuracy:  0.9647887323943662
svc F1:  0.9632562231537545
For name:  a_sinclair
total sample size before apply threshold:  109
Counter({'0000-0003-2741-7992': 64, '0000-0001-8510-8691': 31, '0000-0002-2628-1686': 9, '0000-0002-5602-5958': 5})
['0000-0003-2741-7992', '0000-0001-8510-8691']
a_sinclair_0
Minimal token size:  10
maximal token size:  301
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function dummy at 0x7f5865d60e18>

              precision    recall  f1-score   support

           0       0.95      0.97      0.96        39
           1       1.00      0.99      0.99       223

   micro avg       0.99      0.99      0.99       262
   macro avg       0.97      0.98      0.98       262
weighted avg       0.99      0.99      0.99       262

[ 38   1   2 221]
LR Accuracy:  0.9885496183206107
LR F1:  0.9776418717109943
              precision    recall  f1-score   support

           0       0.97      0.92      0.95        39
           1       0.99      1.00      0.99       223

   micro avg       0.98      0.98      0.98       262
   macro avg       0.98      0.96      0.97       262
weighted avg       0.98      0.98      0.98       262

[ 36   3   1 222]
svc Accuracy:  0.9847328244274809
svc F1:  0.9692199248120301
t_o'brien_1
Minimal token size:  9
maximal token size:  297
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', i

              precision    recall  f1-score   support

           0       1.00      0.95      0.98        64
           1       0.97      1.00      0.98        90

   micro avg       0.98      0.98      0.98       154
   macro avg       0.98      0.98      0.98       154
weighted avg       0.98      0.98      0.98       154

[61  3  0 90]
svc Accuracy:  0.9805194805194806
svc F1:  0.9798032786885246
a_james_1
Minimal token size:  12
maximal token size:  339
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function dummy at 0x7f5865d60e18>, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function dummy at 0x7f5865d60e18>, use_idf=True,
        vocabulary=None)
(154, 1465)
1
(1

Minimal token size:  7
maximal token size:  398
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function dummy at 0x7f5865d60e18>, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function dummy at 0x7f5865d60e18>, use_idf=True,
        vocabulary=None)
(384, 2955)
1
(384, 2955)
              precision    recall  f1-score   support

           0       1.00      0.95      0.98        44
           1       0.99      1.00      1.00       340

   micro avg       0.99      0.99      0.99       384
   macro avg       1.00      0.98      0.99       384
weighted avg       0.99      0.99      0.99       384

[ 42   2   0 340]
LR Accuracy:  0.9947916666666666
LR F1:  0.9869058173634317

              precision    recall  f1-score   support

           0       1.00      0.98      0.99       129
           1       0.98      1.00      0.99        87

   micro avg       0.99      0.99      0.99       216
   macro avg       0.99      0.99      0.99       216
weighted avg       0.99      0.99      0.99       216

[127   2   0  87]
svc Accuracy:  0.9907407407407407
svc F1:  0.9904119318181819
For name:  p_antunes
total sample size before apply threshold:  41
Counter({'0000-0002-3553-2678': 25, '0000-0003-3324-4151': 10, '0000-0001-9129-3539': 5, '0000-0003-1969-1860': 1})
[]
p_antunes  pass
For name:  x_yuan
total sample size before apply threshold:  71
Counter({'0000-0002-1632-8460': 38, '0000-0002-8063-9431': 13, '0000-0001-5395-9109': 11, '0000-0002-2891-1354': 5, '0000-0002-6900-6983': 2, '0000-0001-6983-7368': 1, '0000-0001-7280-7207': 1})
['0000-0002-1632-8460']
x_yuan  pass
For name:  t_kim
total sample size before apply threshold:  568
Counter({'0000-0003-4982-4441':

              precision    recall  f1-score   support

           0       0.89      0.93      0.91       109
           1       0.98      0.97      0.98       459

   micro avg       0.96      0.96      0.96       568
   macro avg       0.94      0.95      0.94       568
weighted avg       0.97      0.96      0.97       568

[101   8  12 447]
svc Accuracy:  0.9647887323943662
svc F1:  0.9440140359177558
For name:  a_cruz
total sample size before apply threshold:  80
Counter({'0000-0002-0465-4111': 38, '0000-0002-8251-8422': 13, '0000-0002-1662-3072': 10, '0000-0003-0368-9731': 9, '0000-0003-4537-1318': 7, '0000-0002-4591-4362': 3})
['0000-0002-0465-4111']
a_cruz  pass
For name:  a_mora
total sample size before apply threshold:  84
Counter({'0000-0002-0785-5795': 54, '0000-0002-6397-4836': 20, '0000-0003-1344-1131': 5, '0000-0003-1354-4739': 3, '0000-0002-9132-5622': 2})
['0000-0002-0785-5795']
a_mora  pass
For name:  j_walker
total sample size before apply threshold:  253
Counter({'000

              precision    recall  f1-score   support

           0       0.98      0.80      0.88        56
           1       0.89      0.99      0.94        90

   micro avg       0.92      0.92      0.92       146
   macro avg       0.93      0.90      0.91       146
weighted avg       0.92      0.92      0.92       146

[45 11  1 89]
LR Accuracy:  0.9178082191780822
LR F1:  0.9095975232198142
              precision    recall  f1-score   support

           0       0.98      0.91      0.94        56
           1       0.95      0.99      0.97        90

   micro avg       0.96      0.96      0.96       146
   macro avg       0.96      0.95      0.96       146
weighted avg       0.96      0.96      0.96       146

[51  5  1 89]
svc Accuracy:  0.958904109589041
svc F1:  0.9559178743961353
For name:  y_tang
total sample size before apply threshold:  66
Counter({'0000-0003-4888-6771': 34, '0000-0003-2718-544X': 17, '0000-0001-9312-1378': 6, '0000-0002-2649-5270': 5, '0000-0002-8807-92

Minimal token size:  6
maximal token size:  299
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function dummy at 0x7f5865d60e18>, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function dummy at 0x7f5865d60e18>, use_idf=True,
        vocabulary=None)
(789, 4116)
1
(789, 4116)
              precision    recall  f1-score   support

           0       0.99      0.97      0.98       587
           1       0.93      0.99      0.95       202

   micro avg       0.98      0.98      0.98       789
   macro avg       0.96      0.98      0.97       789
weighted avg       0.98      0.98      0.98       789

[571  16   3 199]
LR Accuracy:  0.9759188846641318
LR F1:  0.9690356242138073

In [None]:
# accuracy
from statistics import mean 
# keep only the float entries of each accuracy list
cleaned_svcLinear_accuracy = [score for score in all_svcLinear_accuracy if isinstance(score, float)]
cleaned_lr_accuracy = [score for score in all_LR_accuracy if isinstance(score, float)]
# number of scores per classifier (SVC first, then logistic regression) ...
for _scores in (cleaned_svcLinear_accuracy, cleaned_lr_accuracy):
    print(len(_scores))
# ... followed by the mean accuracy per classifier, in the same order
for _scores in (cleaned_svcLinear_accuracy, cleaned_lr_accuracy):
    print(mean(_scores))

In [None]:
# f1
from statistics import mean 
# keep only the float entries of each f1 list (drops any non-float entry)
cleaned_svcLinear_f1 = [score for score in all_svcLinear_f1 if isinstance(score, float)]
cleaned_lr_f1 = [score for score in all_LR_f1 if isinstance(score, float)]
# number of scores per classifier (SVC first, then logistic regression) ...
for _scores in (cleaned_svcLinear_f1, cleaned_lr_f1):
    print(len(_scores))
# ... followed by the mean macro-f1 per classifier, in the same order
for _scores in (cleaned_svcLinear_f1, cleaned_lr_f1):
    print(mean(_scores))

In [None]:
# number of entries found in the labeled-canopies directory (`fileDir`)
print(len(listfiles))

In [None]:
%reset

In [None]:
%who