# Data Preprocessing

In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
  from numpy.core.umath_tests import inner1d


In [2]:
stop_words = {'a', 'ourselves', 'hers', 'between', 'yourself', 
              'but', 'again', 'there', 'about', 'once', 'during', 
              'out', 'very', 'having', 'with', 'they', 'own', 'an', 
              'be', 'some', 'for', 'do', 'its', 'yours', 'such', 'into', 
              'of', 'most', 'itself', 'other', 'off', 'is', 's', 'am', 'or', 
              'who', 'as', 'from', 'him', 'each', 'the', 'themselves', 'until', 
              'below', 'are', 'we', 'these', 'your', 'his', 'through', 'don', 
              'nor', 'me', 'were', 'her', 'more', 'himself', 'this', 'down', 
              'should', 'our', 'their', 'while', 'above', 'both', 'up', 'to', 
              'ours', 'had', 'she', 'all', 'no', 'when', 'at', 'any', 'before', 
              'them', 'same', 'and', 'been', 'have', 'in', 'will', 'on', 'does', 
              'yourselves', 'then', 'that', 'because', 'what', 'over', 'why', 'so', 
              'can', 'did', 'not', 'now', 'under', 'he', 'you', 'herself', 'has', 
              'just', 'where', 'too', 'only', 'myself', 'which', 'those', 'i', 
              'after', 'few', 'whom', 't', 'being', 'if', 'theirs', 'my', 'against', 
              'a', 'by', 'doing', 'it', 'how', 'further', 'was', 'here', 'than', 's', 'i', 't', 'To'
             'u', '3', '2', '6', '5', '60', 'u'}

# Read data
politifact_real = pd.read_csv('./FakeNewsNet-master/dataset/politifact_real.csv')
politifact_fake = pd.read_csv('./FakeNewsNet-master/dataset/politifact_fake.csv')
gossipcop_real = pd.read_csv('./FakeNewsNet-master/dataset/gossipcop_real.csv')
gossipcop_fake = pd.read_csv('./FakeNewsNet-master/dataset/gossipcop_fake.csv')
len(politifact_real), len(politifact_fake), len(gossipcop_real), len(gossipcop_fake)

politifact_real = politifact_real.sample(432)
gossipcop_real = gossipcop_real.sample(5323)

politifact_real['label'] = 1
politifact_fake['label'] = 0
gossipcop_real['label'] = 1
gossipcop_fake['label'] = 0

politifact = pd.concat((politifact_fake, politifact_real), axis=0)
gossipcop = pd.concat((gossipcop_real, gossipcop_fake), axis=0)

len(politifact), len(gossipcop)

politifact = politifact[['title', 'label']]
gossipcop = gossipcop[['title', 'label']]

# Get rid of the stop words 
def clean_stopwords(sentences):
    """
    input: array of sentences
    """
    word_list = re.findall(r'\w+', sentences)
    ans = ''
    for word in word_list:
        if word.lower() in stop_words:
            continue
        ans += word + ' '
    return ans

politifact['title'] = politifact['title'].map(lambda x: clean_stopwords(x))
gossipcop['title'] = gossipcop['title'].map(lambda x: clean_stopwords(x))

In [3]:
# Balancing the data
print(politifact['label'].value_counts())
print(gossipcop['label'].value_counts())

1    432
0    432
Name: label, dtype: int64
1    5323
0    5323
Name: label, dtype: int64


# Convert the text data to binary encodings

In [84]:
# def train_test_processing(df):
#     """
#     input: dataframe with label
#     """
#     X, y = df.iloc[:, :-1], df.iloc[:, -1]
    
#     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 42)
    
#     tokenizer = Tokenizer(num_words=100, lower=False)
    
#     tokenizer.fit_on_texts(X_train['title'])
    
#     X_train = tokenizer.texts_to_sequences(X_train['title'])
    
#     X_test = tokenizer.texts_to_sequences(X_test['title'])
  
#     vocab_size = len(tokenizer.word_index)+1
    
#     X_train = pad_sequences(X_train, padding='post', maxlen=100) 
    
#     X_test = pad_sequences(X_test, padding='post', maxlen=100) 
    
#     return X_train, X_test, y_train, y_test

def train_test_processing(df, feature_prc, tok=None, cold_start=True):
    """
    input: dataframe with label
    """
    X, y = df.iloc[:, :-1], df.iloc[:, -1]
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 42)
    
    if cold_start == True:
        tokenizer = Tokenizer(num_words=int(2813*feature_prc), lower=False)
        tokenizer.fit_on_texts(X_train['title'])
    else:
        if not tok:
            print("Please give a tokenizer for hot start")
        else:
            tokenizer = tok
    
    X_train = tokenizer.texts_to_matrix(X_train['title'])
    
    X_test = tokenizer.texts_to_matrix(X_test['title'])

    vocab_size = len(tokenizer.word_index)+1
    
    return X_train, X_test, y_train, y_test, tokenizer

In [85]:
X_train, X_test, y_train, y_test, tok = train_test_processing(politifact, 0.1)

# Model Training

In [93]:
def evaluate(X, feature_prc, cold_start=True):
    X_train, X_test, y_train, y_test, tok = train_test_processing(politifact, feature_prc)
    clf = RandomForestClassifier()
    clf.fit(X_train, y_train)
    if cold_start:
        train_predict = clf.predict(X_train)
        test_predict = clf.predict(X_test)
        f1_score_train = f1_score(train_predict, y_train)
        f1_score_test = f1_score(test_predict, y_test)
    else:
        X_train, X_test, y_train, y_test, _ = train_test_processing(X, feature_prc, tok, cold_start = False)
        train_predict = clf.predict(X_train)
        test_predict = clf.predict(X_test)
        f1_score_train = f1_score(train_predict, y_train)
        f1_score_test = f1_score(test_predict, y_test)
    print(f'Training F1: {f1_score_train}')
    print(f'Testing F1: {f1_score_test}')
    
    return f1_score_test

In [87]:
svm_clf_ = SVC(C=1.0, random_state=42)
forest_clf_ = RandomForestClassifier(random_state=42)

In [94]:
evaluate(politifact, feature_prc=0.1)

Training F1: 0.9198813056379822
Testing F1: 0.8200836820083682


0.8200836820083682

In [105]:
evaluate(gossipcop, 0.1, cold_start=False)

Training F1: 0.5840236686390533
Testing F1: 0.576080521018354


0.576080521018354

In [132]:
features = []
for idx, obj in enumerate(forest_clf_.feature_importances_):
    features.append([idx, obj])
features = sorted(features, key=lambda x: -x[1])

In [133]:
features

[[1, 0.06910053298703385],
 [3, 0.060498645223970006],
 [8, 0.020239350348587388],
 [85, 0.012847389898968692],
 [24, 0.011978845175372914],
 [20, 0.010035106981443387],
 [10, 0.009441438603585279],
 [91, 0.009022697505539289],
 [297, 0.00890214655967943],
 [78, 0.007791757543039767],
 [17, 0.0077620585885567655],
 [2, 0.00759782097894701],
 [150, 0.007489999586561195],
 [179, 0.007419724975146397],
 [138, 0.0073189301555248425],
 [42, 0.007147714479788543],
 [173, 0.006686402278087815],
 [34, 0.006664348047799695],
 [300, 0.006637702447623216],
 [25, 0.0065188573095377985],
 [133, 0.00645618893807832],
 [56, 0.006213302649572091],
 [9, 0.00616558867595505],
 [13, 0.0059445069115229236],
 [11, 0.005586932386072079],
 [50, 0.00552743598533852],
 [301, 0.005459524818670966],
 [67, 0.005438390017796526],
 [35, 0.005307736627027283],
 [75, 0.005302827351354776],
 [53, 0.005293162877893991],
 [223, 0.0052844416790775684],
 [119, 0.005251735266156794],
 [77, 0.005132565214868911],
 [4, 0.005

In [134]:
tok.word_index

{'Trump': 1,
 'Obama': 2,
 'BREAKING': 3,
 'Clinton': 4,
 'President': 5,
 'News': 6,
 'New': 7,
 'Transcript': 8,
 'McCain': 9,
 'Donald': 10,
 'Barack': 11,
 'Hillary': 12,
 'says': 13,
 'House': 14,
 'Office': 15,
 'US': 16,
 'Week': 17,
 'United': 18,
 'America': 19,
 'Senator': 20,
 'Paul': 21,
 'Breaking': 22,
 'National': 23,
 'Action': 24,
 'Arrested': 25,
 'Twitter': 26,
 'States': 27,
 'Senate': 28,
 'TV': 29,
 '10': 30,
 'Health': 31,
 'Police': 32,
 'White': 33,
 'Congress': 34,
 'Tax': 35,
 'Says': 36,
 'Outlook': 37,
 'First': 38,
 'Found': 39,
 'State': 40,
 'Remarks': 41,
 'Political': 42,
 'Million': 43,
 'School': 44,
 'Bill': 45,
 'Palin': 46,
 'Pelosi': 47,
 'Democrats': 48,
 'debate': 49,
 'Democratic': 50,
 'Federal': 51,
 'John': 52,
 'R': 53,
 'Ad': 54,
 'Statement': 55,
 'com': 56,
 'Sen': 57,
 'Data': 58,
 'CNN': 59,
 'York': 60,
 'Skype': 61,
 'Bing': 62,
 'Latest': 63,
 'Videos': 64,
 'Republican': 65,
 'Security': 66,
 'People': 67,
 'Fox': 68,
 'Act': 69,


# Propensity Score Matching

### 1. Calculate Propensity Score

For every word, we need to build a logistic regression with other words features.



In [3]:
from models.propensity_score import propensity_score

p_score = propensity_score()
p_score.fit(politifact)

HBox(children=(IntProgress(value=0, max=2990), HTML(value='')))

36 1
['trump', 33.108108108108105]
86 0
['obama', 86.0]
24 0
['breaking', 24.0]
52 0
['president', 52.0]
42 0
['news', 42.0]
26 0
['clinton', 26.0]
66 0
['transcript', 66.0]
32 0
['says', 32.0]
24 0
['new', 24.0]
6 1
['donald', 3.5714285714285716]
48 0
['debate', 48.0]
39 1
['mccain', 36.1]
28 0
['senate', 28.0]
14 0
['hillary', 14.0]
26 0
['bill', 26.0]
26 1
['house', 23.14814814814815]
6 0
['white', 6.0]
26 0
['office', 26.0]
36 0
['remarks', 36.0]
15 0
['first', 15.0]
30 0
['health', 30.0]
8 0
['us', 8.0]
30 0
['presidential', 30.0]
30 0
['care', 30.0]
21 0
['barack', 21.0]
19 0
['national', 19.0]
29 0
['act', 29.0]
19 0
['john', 19.0]
14 0
['gun', 14.0]
18 0
['united', 18.0]
8 0
['state', 8.0]
26 0
['week', 26.0]
26 0
['congress', 26.0]
1 1
['million', 0.0]
6 0
['school', 6.0]
0 1
['arrested', 1.0]
13 0
['america', 13.0]
16 0
['states', 16.0]
14 1
['tax', 11.266666666666667]
25 1
['ad', 22.153846153846153]
22 0
['democratic', 22.0]
24 0
['latest', 24.0]
6 1
['one', 3.57142857142857

0 1
['seth', 1.0]
0 1
['rich', 1.0]
0 0
['source', 0]
0 1
['supporter', 1.0]
0 1
['make', 1.0]
0 1
['think', 1.0]
2 1
['looking', 0.3333333333333333]
2 1
['floor', 0.3333333333333333]
3 1
['four', 1.0]
0 1
['month', 1.0]
1 0
['check', 1.0]
1 1
['immigrants', 0.0]
1 0
['attorney', 1.0]
3 1
['questions', 1.0]
0 1
['marijuana', 1.0]
4 1
['results', 1.8]
0 1
['body', 1.0]
0 1
['north', 1.0]
0 1
['weed', 1.0]
0 1
['next', 1.0]
3 0
['mexico', 3.0]
0 1
['massive', 1.0]
0 0
['smith', 0]
0 0
['great', 0]
3 0
['8', 3.0]
0 0
['hospital', 0]
1 1
['love', 0.0]
0 1
['emma', 1.0]
0 1
['went', 1.0]
1 1
['viral', 0.0]
0 1
['getting', 1.0]
1 0
['picks', 1.0]
4 0
['david', 4.0]
0 1
['affair', 1.0]
2 1
['7', 0.3333333333333333]
2 0
['money', 2.0]
0 1
['star', 1.0]
0 0
['confirmed', 0]
4 0
['jersey', 4.0]
1 1
['reveals', 0.0]
0 0
['liberals', 0]
0 1
['plane', 1.0]
1 1
['dc', 0.0]
0 1
['service', 1.0]
3 1
['praises', 1.0]
0 0
['paid', 0]
1 1
['end', 0.0]
0 0
['ever', 0]
0 0
['smoking', 0]
0 1
['crash', 1.0]

2 0
['officer', 2.0]
2 0
['street', 2.0]
2 0
['dems', 2.0]
0 0
['planning', 0]
2 0
['violence', 2.0]
2 0
['reed', 2.0]
0 0
['begin', 0]
2 0
['gang', 2.0]
2 0
['chicago', 2.0]
2 0
['immigrant', 2.0]
2 0
['let', 2.0]
2 0
['schneiderman', 2.0]
0 0
['lay', 0]
0 0
['barron', 0]
2 0
['finds', 2.0]
2 0
['youtube', 2.0]
2 0
['article', 2.0]
0 0
['sold', 0]
2 0
['lance', 2.0]
2 0
['hannity', 2.0]
2 0
['give', 2.0]
0 0
['courts', 0]
0 0
['australia', 0]
2 0
['accounts', 2.0]
2 0
['legislature', 2.0]
0 0
['travel', 0]
2 0
['enemy', 2.0]
0 0
['meat', 0]
0 0
['accidentally', 0]
2 0
['laura', 2.0]
2 0
['ingraham', 2.0]
2 0
['director', 2.0]
2 0
['filed', 2.0]
2 0
['frequently', 2.0]
2 0
['asked', 2.0]
2 0
['african', 2.0]
2 0
['concerns', 2.0]
2 0
['might', 2.0]
2 0
['begins', 2.0]
2 0
['45', 2.0]
0 0
['raids', 0]
2 0
['late', 2.0]
2 0
['also', 2.0]
2 0
['sets', 2.0]
2 0
['changes', 2.0]
2 0
['radical', 2.0]
2 0
['union', 2.0]
2 0
['protest', 2.0]
2 0
['rally', 2.0]
2 0
['already', 2.0]
2 0
['meant'

0 1
['rico', 1.0]
0 1
['3m', 1.0]
0 1
['dying', 1.0]
0 1
['78', 1.0]
0 1
['marilyn', 1.0]
0 1
['monroe', 1.0]
0 1
['uncovers', 1.0]
0 1
['62', 1.0]
0 1
['agents', 1.0]
0 1
['toothpaste', 1.0]
0 1
['throw', 1.0]
0 1
['away', 1.0]
0 1
['nov', 1.0]
0 1
['28', 1.0]
0 1
['dividing', 1.0]
0 1
['nationality', 1.0]
0 1
['fully', 1.0]
0 0
['blamed', 0]
0 0
['sabotaging', 0]
0 0
['prank', 0]
0 0
['trick', 0]
0 1
['knew', 1.0]
0 1
['murdered', 1.0]
0 1
['staffer', 1.0]
0 1
['covered', 1.0]
0 1
['jennifer', 1.0]
0 1
['aniston', 1.0]
0 1
['decide', 1.0]
0 1
['fight', 1.0]
0 1
['ne', 1.0]
0 0
['officers', 0]
0 0
['firefighters', 0]
0 1
['seal', 1.0]
0 1
['widow', 1.0]
0 1
['attention', 1.0]
0 1
['placed', 1.0]
0 1
['customer', 1.0]
0 1
['step', 1.0]
0 0
['enrollment', 0]
0 1
['g20', 1.0]
0 1
['summit', 1.0]
0 0
['hired', 0]
0 0
['exorcist', 0]
0 0
['cleanse', 0]
0 0
['demons', 0]
0 0
['urgent', 0]
0 0
['charge', 0]
0 0
['fall', 0]
0 0
['river', 0]
0 0
['built', 0]
0 1
['clint', 1.0]
0 1
['eastwood',

0 0
['dad', 0]
0 1
['worked', 1.0]
0 1
['actively', 1.0]
0 1
['campaigning', 1.0]
0 1
['opponent', 1.0]
0 1
['activist', 1.0]
0 1
['mommy', 1.0]
0 1
['erupts', 1.0]
0 1
['sweden', 1.0]
0 1
['irate', 1.0]
0 1
['swedes', 1.0]
0 1
['burn', 1.0]
0 1
['nine', 1.0]
0 0
['mental', 0]
0 0
['images', 0]
0 1
['kurt', 1.0]
0 1
['russel', 1.0]
0 1
['showed', 1.0]
0 1
['overpaid', 1.0]
0 1
['shut', 1.0]
0 1
['hurts', 1.0]
0 1
['case', 1.0]
0 1
['tied', 1.0]
0 1
['era', 1.0]
0 1
['uranium', 1.0]
0 1
['scientist', 1.0]
0 1
['pick', 1.0]
0 1
['dumb', 1.0]
0 1
['regular', 1.0]
0 1
['dinosaurs', 1.0]
0 1
['existed', 1.0]
0 1
['warmer', 1.0]
0 1
['boo', 1.0]
0 1
['groan', 1.0]
0 1
['mention', 1.0]
0 1
['ms', 1.0]
0 1
['612', 1.0]
0 1
['anthony', 1.0]
0 1
['bourdain', 1.0]
0 1
['friendly', 1.0]
0 1
['attacker', 1.0]
0 1
['antitrump', 1.0]
0 1
['avoided', 1.0]
0 1
['orwellian', 1.0]
0 1
['oprah', 1.0]
0 1
['winfrey', 1.0]
0 1
['butler', 1.0]
0 1
['want', 1.0]
0 1
['gvt', 1.0]
0 1
['microchipped', 1.0]
0 1


0 0
['audit', 0]
0 0
['millions', 0]
0 0
['owned', 0]
0 1
['koreans', 1.0]
0 1
['calling', 1.0]
0 1
['assassin', 1.0]
0 1
['visa', 1.0]
0 1
['outlines', 1.0]
0 1
['depopulate', 1.0]
0 1
['planet', 1.0]
0 0
['legendary', 0]
0 0
['kirk', 0]
0 0
['douglas', 0]
0 0
['101st', 0]
0 0
['birthday', 0]
0 0
['trueamericans', 0]
0 0
['nbspthis', 0]
0 0
['website', 0]
0 0
['nbsptrueamericans', 0]
0 0
['resources', 0]
0 1
['monster', 1.0]
0 1
['truck', 1.0]
0 1
['mishap', 1.0]
0 1
['rounds', 1.0]
0 1
['ammo', 1.0]
0 1
['turns', 1.0]
0 1
['hands', 1.0]
0 0
['300', 0]
0 0
['pounds', 0]
0 0
['counterfeit', 0]
0 0
['rat', 0]
0 0
['chicken', 0]
0 0
['wings', 0]
0 0
['tulsa', 0]
0 0
['closes', 0]
0 0
['brutal', 0]
0 1
['came', 1.0]
0 1
['skeleton', 1.0]
0 1
['suddenly', 1.0]
0 1
['resurfaces', 1.0]
0 0
['ни', 0]
0 0
['чего', 0]
0 0
['нового', 0]
0 0
['как', 0]
0 0
['правильно', 0]
0 1
['n', 1.0]
0 1
['crushes', 1.0]
0 1
['throat', 1.0]
0 1
['testifying', 1.0]
0 1
['riot', 1.0]
0 1
['160k', 1.0]
0 1
['tax

1 0
['armed', 1.0]
1 0
['forces', 1.0]
1 1
['pitts', 0.0]
1 0
['renewing', 1.0]
1 0
['rand', 1.0]
1 0
['addresses', 1.0]
1 0
['floridians', 1.0]
1 0
['introduces', 1.0]
1 0
['impeachment', 1.0]
1 0
['hint', 1.0]
1 0
['stirs', 1.0]
1 0
['clash', 1.0]
1 0
['orators', 1.0]
1 0
['spring', 1.0]
1 0
['roosevelt', 1.0]
1 0
['charging', 1.0]
1 0
['malice', 1.0]
1 0
['gingrich', 1.0]
1 0
['tragedy', 1.0]
1 0
['minimum', 1.0]
1 0
['wage', 1.0]
1 0
['hike', 1.0]
1 0
['forward', 1.0]
1 0
['focused', 1.0]
1 0
['creating', 1.0]
1 0
['reforming', 1.0]
1 0
['requested', 1.0]
1 0
['counterterrorism', 1.0]
1 0
['stanford', 1.0]
1 0
['affairs', 1.0]
1 0
['derry', 1.0]
1 0
['nh', 1.0]
1 0
['luncheon', 1.0]
1 0
['afp', 1.0]
1 0
['renews', 1.0]
1 0
['attacks', 1.0]
1 0
['links', 1.0]
1 1
['huey', 0.0]
1 0
['rockefeller', 1.0]
1 0
['pass', 1.0]
1 0
['means', 1.0]
1 0
['infrastructure', 1.0]
1 1
['tout', 0.0]
1 1
['anwr', 0.0]
1 0
['108th', 1.0]
1 0
['子供たちのコト', 1.0]
1 0
['私のコト', 1.0]
1 1
['20067', 0.0]
1 1
['

1 1
['reckless', 0.0]
1 1
['abandon', 0.0]
1 0
['1591', 1.0]
1 0
['104th', 1.0]
1 0
['1995', 1.0]
1 0
['prohibit', 1.0]
1 0
['purposes', 1.0]
1 0
['proposals', 1.0]
1 0
['explore', 1.0]
1 1
['demint', 0.0]
1 1
['mccaul', 0.0]
1 1
['tx', 0.0]
1 1
['appears', 0.0]
1 1
['louise', 0.0]
1 1
['slaughter', 0.0]
1 0
['colbert', 1.0]
1 0
['nafta', 1.0]
1 0
['cnbc', 1.0]
1 0
['discussion', 1.0]
1 0
['honda', 1.0]
1 0
['covering', 1.0]
1 0
['adults', 1.0]
1 0
['importance', 1.0]
1 0
['outreach', 1.0]
1 0
['expansion', 1.0]
1 0
['springfield', 1.0]
1 0
['ill', 1.0]
1 0
['includes', 1.0]
1 0
['cuts', 1.0]
1 0
['stephanie', 1.0]
1 0
['cutter', 1.0]
1 0
['impersonation', 1.0]
1 0
['credible', 1.0]
1 0
['incidents', 1.0]
1 0
['ballots', 1.0]
1 0
['cast', 1.0]
1 0
['gore', 1.0]
1 0
['turn', 1.0]
1 0
['idle', 1.0]
1 0
['activists', 1.0]
1 0
['3year', 1.0]
1 0
['feature', 1.0]
1 0
['enemies', 1.0]
1 0
['当ブログでは福岡県内にある不動産会社についてさまざまな情報をまとめています', 1.0]
1 0
['福岡市の分譲マンションやマンション経営で頼りになる企業をご紹介', 1.0]
1 0
['whethe

In [107]:
def list_to_dict(feature_list):
    """
    feature_list: 2d-array, each entrance is [string, test_stats]
    """
    res = {}
    for i, obj in enumerate(feature_list):
        res[obj[0]] = i+1
    return res

def train_test_processing_features(df, word_features):
    """
    input: dataframe with label
    """
    X, y = df.iloc[:, :-1], df.iloc[:, -1]
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 42)
    
    tokenizer = Tokenizer(num_words=len(word_features, lower=False))
    
    tokenizer.word_index = list_to_dict(word_features)
    
    X_train = tokenizer.texts_to_matrix(X_train['title'])
    
    X_test = tokenizer.texts_to_matrix(X_test['title'])

    vocab_size = len(tokenizer.word_index)+1
    
    return X_train, X_test, y_train, y_test

def evaluate_features(X, feature_prc, words, cold_start=True):
    word_features = words[:int(len(words) * feature_prc)]
    X_train, X_test, y_train, y_test = train_test_processing_features(politifact, word_features)
    clf = RandomForestClassifier()
    clf.fit(X_train, y_train)
    if cold_start:
        train_predict = clf.predict(X_train)
        test_predict = clf.predict(X_test)
        f1_score_train = f1_score(train_predict, y_train)
        f1_score_test = f1_score(test_predict, y_test)
    else:
        X_train, X_test, y_train, y_test, _ = train_test_processing_features(X, word_features)
        train_predict = clf.predict(X_train)
        test_predict = clf.predict(X_test)
        f1_score_train = f1_score(train_predict, y_train)
        f1_score_test = f1_score(test_predict, y_test)
    print(f'Training F1: {f1_score_train}')
    print(f'Testing F1: {f1_score_test}')
    return f1_score_test

In [108]:
words = sorted(p_score.features, key=lambda x: x[-1])[:500]
X_train, X_test, y_train, y_test = train_test_processing_features(politifact, words)

NameError: name 'p_score' is not defined

In [98]:
forest_clf = RandomForestClassifier(random_state=42)
svc_clf = SVC(C=1.0, random_state=42)
log_clf = LogisticRegression()

In [99]:
evaluate_features(forest_clf, politifact, words)

Training F1: 0.711453744493392
Testing F1: 0.675


In [101]:
evaluate_features(forest_clf, gossipcop, words)

Training F1: 0.6533138485343419
Testing F1: 0.6589386913961874


In [113]:
features = []
for idx, obj in enumerate(forest_clf.feature_importances_):
    features.append([idx, obj])
features = sorted(features, key=lambda x: -x[1])

In [114]:
for idx, _ in features:
    print(words[idx])

['makes', 0]
['leaves', 0]
['francis', 0]
['confirmed', 0]
['nightmare', 0]
['nrtonline', 0]
['suicide', 0]
['barron', 0]
['assange', 0]
['inside', 0]
['written', 0]
['hurricane', 0]
['church', 0]
['eric', 0]
['destroyed', 0]
['bust', 0]
['high', 0.0]
['line', 0]
['receive', 0]
['testify', 0]
['racism', 0]
['choose', 0]
['george', 0]
['shooter', 0]
['times', 0.0]
['fda', 0]
['restore', 0]
['lakes', 0]
['warrant', 0]
['celebs', 0]
['keanu', 0]
['immigrants', 0.0]
['30', 0]
['named', 0]
['largest', 0]
['declares', 0]
['holy', 0]
['meat', 0]
['officers', 0]
['trey', 0]
['terrorist', 0]
['2020', 0]
['whopping', 0]
['elton', 0]
['sum', 0]
['blacks', 0]
['shocking', 0]
['foundation', 0]
['flag', 0]
['man', 0]
['parkland', 0]
['william', 0]
['announces', 0]
['found', 0.0]
['moms', 0]
['accidentally', 0]
['confiscation', 0]
['starbucks', 0]
['caught', 0]
['russia', 0]
['ceremony', 0]
['denzel', 0]
['discover', 0]
['afford', 0]
['florida', 0]
['stay', 0]
['official', 0]
['walkout', 0]
['27ave',