In [1]:
import os
import codecs
import pandas as pd
import string
import emoji
import re
import textstat
import numpy as np
import nltk
import time

from multiprocessing.dummy import Pool as ThreadPool
from pandas import DataFrame, read_csv
from matplotlib import pyplot
from nltk import ngrams
from nltk.parse import stanford
from sklearn.linear_model import LogisticRegression, RidgeClassifier, LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.feature_selection import RFE
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn import preprocessing
from sklearn.svm import SVC, SVR
from scipy.sparse import hstack
from textblob import TextBlob as blob_en
from textblob import Blobber
from textblob_fr import PatternTagger, PatternAnalyzer
from textblob_de import TextBlobDE as blob_de
from langdetect import detect

from nltk.tag.perceptron import PerceptronTagger
tagger = PerceptronTagger()


nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('tagsets')

  from numpy.core.umath_tests import inner1d


[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\vladc\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\vladc\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package tagsets to
[nltk_data]     C:\Users\vladc\AppData\Roaming\nltk_data...
[nltk_data]   Package tagsets is already up-to-date!


True

In [2]:
def read_file(file_name):
    tweetId = []
    tweetText = []
    userId = []
    imageId = []
    username = []
    timestamp = []
    label = []
    index = []
    
    path = os.path.join(r"D:\Python\media eval", file_name)
    f = open(path, encoding="utf8")
    
    ok = True
    for line in f.readlines():
        lines = line.strip('\n').split("\t")
        if ok:
            ok = False
            index = lines
        else:
            tweetId.append(lines[0])
            tweetText.append(lines[1])
            userId.append(lines[2])
            imageId.append(lines[3])
            username.append(lines[4])
            timestamp.append(lines[5])
            label.append(lines[6])
    f.close()
    
    df = DataFrame({
        index[0] : tweetId, 
        index[1]: tweetText, 
        index[2]: userId, 
        index[3]: imageId, 
        index[4]: username, 
        index[5]: timestamp, 
        index[6]: label})
    return df

In [3]:
train_df = read_file(r"mediaeval-2015-trainingset.txt")
test_df = read_file(r"mediaeval-2015-testset.txt")

In [4]:
# preprocessing

print(train_df["label"].unique())
train_df["label"] = np.where(train_df["label"] == "humor", "fake", train_df["label"])
print(train_df["label"].unique())

['fake' 'humor' 'real']
['fake' 'real']


In [5]:
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    stop_words='english',
    ngram_range=(1, 3),
    max_features=100)

In [6]:
class Sentiment(object): 

    def clean_tweet(self, tweet): 
        ''' 
        Utility function to clean tweet text by removing links, special characters 
        using simple regex statements. 
        '''
        return ' '.join(re.sub("(@[A-Za-zÀ-ÿ0-9]+)|([^0-9A-Za-zÀ-ÿ \t])|(\w+:\/\/\S+)", " ", tweet).split())  
    

    def get_tweet_sentiment(self, tweet):
        try:
            language = detect(tweet)
            analysis = blob_en(self.clean_tweet(tweet))
        except:
            return [0.0,0.0]
        
        if language == 'fr':
            analysis = blob_en(self.clean_tweet(tweet), pos_tagger=PatternTagger(), analyzer=PatternAnalyzer())
        elif language == 'de':
            analysis = blob_de(self.clean_tweet(tweet))

        return (analysis.sentiment[0], analysis.sentiment[1])

In [7]:
def detect_lang(df):
    lang = []
    for text in df["tweetText"]:
        try:
            lang.append(detect(text))
        except:
            lang.append('en')
    return lang

def count_char(df1, df2, l, s):
    for p in l:
        p_list = []
        for text in df2["tweetText"]:
            p_list.append(text.count(p))
        df1[s + p] = p_list
    return df1

def normalize(df):
    x = df.values
    min_max_scaler = preprocessing.MinMaxScaler()
    x_scaled = min_max_scaler.fit_transform(x)
    df = DataFrame(x_scaled, columns=[c for c in df])
    return df

def analyse(df1, df2): 
    analyser = Sentiment() 
    
    sen_list = []
    sub_list = []
    for text in df2["tweetText"]:
        sentiment = analyser.get_tweet_sentiment(text)
        sen_list.append(sentiment[0])
        sub_list.append(sentiment[1])
    df1["sentiment"] = sen_list
    df1["subjectivity"] = sub_list
    
    return df1

def encode_label(l):
    labels = []
    for text in l:
        if text == "fake":
            labels.append(0)
        else:
            labels.append(1)
    return labels

def pos_tag(df1, df2):
    pool = ThreadPool(4) 
#     results = pool.map(my_function, my_array)
    
    pos_pd = DataFrame()
    for text in df2["tweetText"]:
        tokens = nltk.word_tokenize(text)
        tokens = nltk.pos_tag(tokens)
        
        tokens = [x[1] for x in tokens]
        
        dummies = pd.get_dummies(tokens, prefix = "pos_tag")
        df1 = pd.concat([df1, dummies], axis=1)
        
    print (df1.shape)
    return df1

def helper(text):
    tagset = None
    tokens = nltk.word_tokenize(text)
    tokens = nltk.tag._pos_tag(tokens, tagset, tagger)
    tokens = [x[1] for x in tokens]
    
    return tokens
    
def work(df1, df2):
    pool = ThreadPool(8) 
    results = pool.map(helper, df2["tweetText"].values)

    rez = []
    for x in results:
        str = ""
        for tok in x:
            str = str + tok + " "
        rez.append(str)
    pos_pd = pd.DataFrame(rez, columns=["Sin"])
    
    word_vectorizer.fit(pos_pd["Sin"])
    n_grams = word_vectorizer.transform(pos_pd["Sin"])
    
    temp = DataFrame(n_grams.toarray())
    print ("Shape before conncat: ", df1.shape)
    df1 = pd.concat([df1, temp], axis = 1)
    print ("Shape after conncat: ", df1.shape)    
        
    return df1
    
# def pos_tag(df1, df2):
#     pool = ThreadPool(4) 
#     results = pool.map(my_function, my_array)
    
#     pos_pd = DataFrame()
#     for text in df2["tweetText"]:
#         tokens = nltk.word_tokenize(text)
#         tokens = nltk.pos_tag(tokens)
        
#         tokens = [x[1] for x in tokens]
        
#         dummies = pd.get_dummies(tokens, prefix = "pos_tag")
#         df1 = pd.concat([df1, dummies], axis=1)
        
#     print (df1.shape)
#     return df1
    
def reading_score(df1, df2):
    flesch = []
    automated = []
    dale = []
    difficult = []
    
    for line in df2["tweetText"]: 
        flesch.append(textstat.flesch_reading_ease(line))
        automated.append(textstat.automated_readability_index(line))
        dale.append(textstat.dale_chall_readability_score(line))
        difficult.append(textstat.difficult_words(line))
    
    df1["flesch"] = flesch
    df1["automated"] = automated
    df1["dale"]= dale
    df1["difficult"] = difficult
    
    return df1

def basic_metrics(df1, df2):
    df1["len"] = [len(x) for x in df2["tweetText"]]
    df1["word_len"] = [len(x.split()) for x in df2["tweetText"]]
    df1["upper"] = [sum(1 for c in x if c.isupper()) for x in df2["tweetText"]]
    
    return df1

In [8]:
emojis = ["😂","😳","😱","😭","😢","⚡","☔","🌀","😨","🌊","❤","🗽","🏃","👎"]
languages = ["lang_en", "lang_fr", "lang_de", "lang_es"]

def feature_selection(df):
    feature_df = DataFrame()
    
    #ngrams
    start = time.time()
    word_vectorizer.fit(df["tweetText"])
    n_grams = word_vectorizer.transform(df["tweetText"])
    
    temp = DataFrame(n_grams.toarray())
    feature_df = pd.concat([feature_df, temp])
    print ("N-grams: ", time.time() - start)
    
#     start = time.time()
#     feature_df = pos_tag(feature_df, df)
#     print ("Pos_tag:" , time.time() - start)
    
    start = time.time()
    feature_df = work(feature_df, df)
    print ("Pos_tag:" , time.time() - start)
    
    start = time.time()
    feature_df = analyse(feature_df, df)
    feature_df = reading_score(feature_df, df)
    print ("Reading Score:" , time.time() - start)
    
    start = time.time()
    feature_df = basic_metrics(feature_df, df)
    print ("Basic_Metrics:" , time.time() - start)
    
    start = time.time()
    feature_df["lang"] = detect_lang(df) 
    dummies = pd.get_dummies(feature_df["lang"], prefix = "lang")
    feature_df = pd.concat([feature_df, dummies], axis=1)
    feature_df = feature_df.drop(["lang"], axis=1)
    print ("Language:" , time.time() - start)
    
    for c in feature_df:
        try:
            if not c in languages and "lang_" in c:
                feature_df = feature_df.drop(c, axis=1)
        except:
            None
    
    start = time.time()
    feature_df = count_char(feature_df, df, string.punctuation, "char_")
    feature_df = count_char(feature_df, df, emojis, "emoji_")
    print ("Char+Emoji:" , time.time() - start)
    
    feature_df["label"] = encode_label(df["label"])
    return feature_df

In [9]:
test_feature_df = feature_selection(test_df)
print (test_feature_df.shape)

train_feature_df = feature_selection(train_df)
print (train_feature_df.shape)


N-grams:  0.2940640449523926
Shape before conncat:  (3781, 100)
Shape after conncat:  (3781, 200)
Pos_tag: 3.6535422801971436
Reading Score: 20.936655044555664
Basic_Metrics: 0.02800154685974121
Language: 18.05154538154602
Char+Emoji: 0.14294958114624023
(3781, 260)
N-grams:  1.070641279220581
Shape before conncat:  (14483, 100)
Shape after conncat:  (14483, 200)
Pos_tag: 10.957834243774414
Reading Score: 74.65419578552246
Basic_Metrics: 0.08410191535949707
Language: 68.72832870483398
Char+Emoji: 0.3479037284851074
(14483, 260)


In [10]:
train_feature_df = normalize(train_feature_df)
test_feature_df = normalize(test_feature_df)

copy_train = train_feature_df
copy_test = test_feature_df

In [14]:
selector = SelectKBest(chi2, k=30)
selector.fit(train_feature_df.drop("label", axis=1), train_feature_df["label"])

cols = selector.get_support(indices=True)

# Create new dataframes with desired columns
X_train = train_feature_df.drop(["label"], axis=1).iloc[:, cols]
X_test = test_feature_df.drop(["label"], axis=1).iloc[:, cols]

Y_train = train_feature_df["label"]
Y_test = test_feature_df["label"]

In [12]:
l = []
for c in X_train:
    l.append(c)
print("Features:\n", l)

Features:
 [2, 3, 4, 5, 7, 10, 13, 14, 21, 22, 24, 33, 37, 38, 39, 42, 44, 46, 51, 52, 53, 55, 56, 57, 58, 63, 65, 67, 70, 71, 75, 76, 78, 79, 80, 81, 82, 83, 85, 86, 87, 89, 90, 91, 93, 94, 96, 99, 13, 44, 45, 52, 62, 76, 85, 'lang_en', 'lang_es', 'char_(', 'char_-', 'char_@']


In [None]:
#Grid Search for hyperparameter classifiers

def grid_search(X_train, Y_train, cls, param_grid):
    grid_search = GridSearchCV(cls, param_grid, cv=5, scoring="accuracy")
    grid_search.fit(X_train, Y_train)
    
    return grid_search.best_estimator_

def best_cls(X_train, X_test, Y_train, Y_test): 
    #Ridge
    ridge = RidgeClassifier()
    param_grid = [{"alpha" : range(1, 101)}]
    ridge = grid_search(X_train, Y_train, ridge, param_grid)
    
    predictions_ridge = ridge.predict(X_test)
    accuracy = accuracy_score(Y_test, predictions_ridge)
    print("Ridge:", accuracy)
    
    #Logistic
    log = LogisticRegression()
    param_grid = [{"solver" : ["liblinear", "newton-cg", "lbfgs", "liblinear", "sag", "saga"]}]
    log = grid_search(X_train, Y_train, log, param_grid)
    
    predictions_log = log.predict(X_test)
    accuracy = accuracy_score(Y_test, predictions_log)
    print("Logistic:", accuracy)
    
    #Support Vector
    svc = SVC()
    param_grid = [{"C" : np.linspace(0.1, 1, 9), "gamma" : ["auto"]}]
    svc = grid_search(X_train, Y_train, svc, param_grid)
    
    predictions_svc = svc.predict(X_test)
    accuracy = accuracy_score(Y_test, predictions_ridge)
    print("SVC:", accuracy)
    
    #Randomm Forrest
    rfc = RandomForestClassifier()
    param_grid = [{"n_estimators" : [1000], "max_depth" : range(1, 10, 1), "min_samples_leaf" : range(1, 10, 1)}]
    rfc = grid_search(X_train, Y_train, rfc, param_grid)
    
    predictions_rfc = rfc.predict(X_test)
    accuracy = accuracy_score(Y_test, predictions_rfc)
    print("RFC:", accuracy)
    
    return ridge, log, svc, rfc

ridge, log, svc, rfc = best_cls(X_train, X_test, Y_train, Y_test)

Ridge: 0.8381380587146258
Logistic: 0.8688177730759058


In [13]:
#Best classifiers after runninng the grid search

def print_score(pred, label, name):
    cm = confusion_matrix(pred, label)
    cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    cm = cm.diagonal()
    
    print(name, ":")
    print("F1:", accuracy_score(label, pred), 
          "; F1 fake:", cm[0], 
          "; F1 real:", cm[1], 
          "; Roc Auc:", roc_auc_score(label, pred), 
          "\n")

def train_predict(cls, X_train, Y_train, X_test, Y_test, name):
    cls.fit(X_train, Y_train)
    predictions = cls.predict(X_test)
    
    print_score(predictions, Y_test, name)
    
    return cls


ridge = RidgeClassifier(alpha=1, class_weight=None, copy_X=True, fit_intercept=True,
        max_iter=None, normalize=False, random_state=None, solver='auto',
        tol=0.001)


log = LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)


svc = SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

# fine-tuned: max_depth=2
# grid-search: max_depth=9
rfc = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=2, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
rfc = RandomForestClassifier(max_depth=2, n_estimators=1000 )

voting = VotingClassifier(estimators=[("log", log), ("svc", svc), ("rfc", rfc)], voting="hard")

ridge = train_predict(ridge, X_train, Y_train, X_test, Y_test, "Ridge")
log = train_predict(log, X_train, Y_train, X_test, Y_test, "Logistic")
svc = train_predict(svc, X_train, Y_train, X_test, Y_test, "Support Vector Clasifier")
rfc = train_predict(rfc, X_train, Y_train, X_test, Y_test, "Random Forrest Classifier")
voting = train_predict(voting, X_train, Y_train, X_test, Y_test, "Voting")

Ridge :
F1: 0.7632901348849511 ; F1 fake: 0.828672705789681 ; F1 real: 0.6296296296296297 ; Roc Auc: 0.7315782524480929 

Logistic :
F1: 0.7553557259984132 ; F1 fake: 0.772349617813227 ; F1 real: 0.689119170984456 ; Roc Auc: 0.67176838264985 

Support Vector Clasifier :
F1: 0.678391959798995 ; F1 fake: 0.8091743119266055 ; F1 real: 0.5003123048094941 ; Roc Auc: 0.6730816808678921 

Random Forrest Classifier :
F1: 0.8799259455170589 ; F1 fake: 0.8504983388704319 ; F1 real: 0.9948119325551232 ; Roc Auc: 0.8143391142383576 

Voting :
F1: 0.8384025390108437 ; F1 fake: 0.8498745969186672 ; F1 real: 0.806060606060606 ; Roc Auc: 0.7904138844271931 



  if diff:


In [30]:
#Ensemble learning

voting = VotingClassifier(estimators=[("log", log), ("svc", svc), ("rfc", rfc)], voting="hard")
# voting.fit(X_train, Y_train)
# print_score(predictions, Y_test, "Voting")

voting = train_predict(voting, X_train, Y_train, X_test, Y_test, "Voting")

Ridge :
F1: 0.8709336154456493 ; F1 fake: 0.8642105263157894 ; F1 real: 0.8915145005370569 ; Roc Auc: 0.8213066772465475 



  if diff:
