In [1]:
import en_core_web_sm
import pandas as pd
import numpy as np
from langdetect import detect
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [2]:
df = pd.read_csv('../data/data.csv')
df = df.dropna(subset=['text', 'label'])
# df = df[:100]
df.head()

Unnamed: 0,title,text,source,label
0,Due to the recent outbreak for the Coronavirus...,"You just need to add water, and the drugs and ...",coronavirusmedicalkit.com,Fake
1,,Hydroxychloroquine has been shown to have a 10...,RudyGiuliani,Fake
2,,Fact: Hydroxychloroquine has been shown to hav...,CharlieKirk,Fake
3,,The Corona virus is a man made virus created i...,JoanneWrightForCongress,Fake
4,,Doesn’t @BillGates finance research at the Wuh...,JoanneWrightForCongress,Fake


In [3]:
# Get number of stop words
stop_words = set(stopwords.words('english'))
df['num_of_stopwords'] = df['text'].str.split().apply(lambda x: len(set(x) & stop_words))

In [4]:
# Get number of '@' symbol
df['num_of_@'] = df['text'].str.count('@')

In [5]:
# Get number of '/' symbol
df['num_of_/'] = df['text'].str.count('/')

In [6]:
# Get number of '#' symbol
df['num_of_#'] = df['text'].str.count('#')

In [7]:
# Get number of '?' symbol
df['num_of_?'] = df['text'].str.count('\?')

In [8]:
# Get number of lowercase symbols
df['lowercase'] = df['text'].str.findall(r'[a-z]').str.len()

In [9]:
# Get number of uppercase symbols
df['uppercase'] = df['text'].str.findall(r'[A-Z]').str.len()

In [10]:
# Get number of numeric values
df['numeric'] = df['text'].str.findall(r'[0-9]').str.len()

In [11]:
# Get text language and encode
df['language'] = df['text'].apply(detect)

In [12]:
# Get length of text
df['length'] = df['text'].str.len()

In [13]:
# Get number of words
df['word_count'] = df['text'].str.split().str.len()

In [14]:
# Get avg length of each word
df['avg_word_len'] = df["text"].apply(lambda x: np.mean([len(w) for w in x.split()]))

In [15]:
# Get Sentiment using VADER
sent = SentimentIntensityAnalyzer()
positive = [round(sent.polarity_scores(i)['pos'], 2) for i in df['text']]
negative = [round(sent.polarity_scores(i)['neg'], 2) for i in df['text']]
neutral = [round(sent.polarity_scores(i)['neu'], 2) for i in df['text']]
compound = [round(sent.polarity_scores(i)['compound'], 2) for i in df['text']]

df['pos_sent'] = positive
df['neg_sent'] = negative
df['neut_sent'] = neutral
df['comp_sent'] = compound

In [16]:
# Get Spacey named entities using web model
nlp = en_core_web_sm.load()
df['persons'] = 0
df['norps'] = 0
df['facs'] = 0
df['orgs'] = 0
df['gpes'] = 0
df['products'] = 0
df['events'] = 0
df['arts'] = 0
df['laws'] = 0
df['langs'] = 0
df['dates'] = 0
df['times'] = 0
df['percents'] = 0
df['moneys'] = 0
df['ords'] = 0
df['cards'] = 0
df['quants'] = 0
df['location'] = 0

def get_entities(row):
    tokens = nlp(''.join(str(row['text'])))
    for ent in tokens.ents:
        if ent.label_ == 'PERSON':
            row['persons'] += 1
        if ent.label_ == 'NORP':
            row['norps'] += 1
        if ent.label_ == 'FAC':
            row['facs'] += 1
        if ent.label_ == 'ORG':
            row['orgs'] += 1
        if ent.label_ == 'GPE':
            row['gpes'] += 1
        if ent.label_ == 'PRODUCT':
            row['products'] += 1
        if ent.label_ == 'EVENT':
            row['events'] += 1
        if ent.label_ == 'WORK_OF_ART':
            row['arts'] += 1
        if ent.label_ == 'LAW':
            row['laws'] += 1
        if ent.label_ == 'LANGUAGE':
            row['langs'] += 1
        if ent.label_ == 'DATE':
            row['dates'] += 1    
        if ent.label_ == 'TIME':
            row['times'] += 1    
        if ent.label_ == 'PERCENT':
            row['percents'] += 1    
        if ent.label_ == 'MONEY':
            row['moneys'] += 1    
        if ent.label_ == 'ORDINAL':
            row['ords'] += 1    
        if ent.label_ == 'CARDINAL':
            row['cards'] += 1
        if ent.label_ == 'QUANTITY':
            row['quants'] += 1  
        if ent.label_ == 'LOC':
            row['location'] += 1    
    return row

df_features = df.apply(lambda row: get_entities(row), axis=1)

In [17]:
# Fix label problem
df_features['label'] = df_features['label'].str.lower()

In [18]:
# Encode categorical variables
df_features = pd.get_dummies(df_features, columns=['language', 'source', 'label'])

In [19]:
# Drop unnecessary columns
df_features = df_features.drop(columns=['title', 'text', 'label_true', 'source_zavtra.ru'])

In [20]:
df_features.head()

Unnamed: 0,num_of_stopwords,num_of_@,num_of_/,num_of_#,num_of_?,lowercase,uppercase,numeric,length,word_count,...,source_southfront.org,source_strategic-culture.org,source_ttps://www.rand.org,source_twitter,source_utro.ru,source_vivifyholistic.ca,source_www.purevitalsilver.com,source_www.rt.com,source_www.who.int,label_fake
0,12,0,0,0,0,312,3,0,387,65,...,0,0,0,0,0,0,0,0,0,1
1,13,0,0,0,0,198,15,5,266,42,...,0,0,0,0,0,0,0,0,0,1
2,13,0,0,0,0,201,20,5,278,44,...,0,0,0,0,0,0,0,0,0,1
3,4,1,0,0,0,73,6,0,99,18,...,0,0,0,0,0,0,0,0,0,1
4,7,2,0,0,2,103,7,0,137,22,...,0,0,0,0,0,0,0,0,0,1


In [21]:
df_4 = df_features[['persons', 'norps', 'facs', 'orgs', 'gpes', 'products', 'events', 'arts', 'laws', 'langs', 'dates', 'times', 'percents', 'moneys', 'ords', 'cards', 'quants', 'location', 'label_fake']]
df_4.to_csv('../data/datasets/4.csv', index=False)

In [22]:
# df_2 = df_features
# df_2.to_csv('../data/datasets/2.csv')

In [23]:
df_label = pd.read_csv("1.csv")
label = df_label[['label_fake']]
label.head()

FileNotFoundError: [Errno 2] No such file or directory: '1.csv'

In [None]:
# df = pd.read_csv('1.csv')
# print (df[df.label_fake == 0].shape[0])