# Train classification models for misinformation detection

## Initialize

In [None]:
######################################################
# Import libraries
######################################################

import pandas as pd
import numpy as np
import regex as re
from collections import Counter
from datetime import date
from tqdm import tqdm
import emoji
from os.path import join

import nltk
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

stemmer = WordNetLemmatizer()

from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt

import joblib
import csv

from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, cross_val_predict, train_test_split, KFold

from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

import glob

## Load & inspect data

### Define filepaths

In [None]:
from os.path import isdir  # used only in this cell; `glob`, `join` and `date` come from the import cell

# Date stamp (MMDDYY) used in the names of saved artifacts
thisday = date.today().strftime("%m%d%y")
DATA_DIR = str() # set path to check for data

# May be multiple labeled data files for focal myth(s).
# glob.glob() on a bare directory path returns only the directory itself (which never
# ends in 'labeled.csv'), so list the directory's contents when DATA_DIR is a directory.
# Anything else (e.g. a glob pattern, or the empty placeholder) is passed through unchanged.
myth_pattern = join(DATA_DIR, '*') if isdir(DATA_DIR) else DATA_DIR
myth_fps = [fp for fp in glob.glob(myth_pattern)
            if fp.endswith('labeled.csv')]

# negative cases: labeled files inside DATA_DIR/negative_cases
neg_myth_fps = [fp for fp in glob.glob(join(DATA_DIR, 'negative_cases/*'))
                if fp.endswith('labeled.csv')]

# specify shorthand for focal myth
# use different, specific naming for multiple myths (e.g., myth1, myth2)
MYTH = str()

# file the fitted vectorizer is saved to
myth_vec_fp = f'{MYTH}_VEC_{thisday}.joblib'

### Load & rename data

In [None]:
# For multiple files holding labeled data, load each one and combine them in a single
# concat at the end (concatenating inside the loop re-copies the growing DataFrame on
# every iteration and mixes in an empty seed frame).

# focal myth name defined above

# positive tweets: every labeled file for the focal myth
frames = [pd.read_csv(fp, low_memory=False) for fp in myth_fps]

# negative tweets: rows labeled for another myth, relabeled as negatives here
for fp in neg_myth_fps:
    temp = pd.read_csv(fp, low_memory=False) # read file
    temp = temp[temp.myth_score == 1.0] # keep only negative tweets with score of 1.0 for other myth
    temp = temp.assign(
        myth_score=0, myth_supports_score=0,      # change myth scores to zero
        is_myth="no", is_myth_supports="no")      # change myth labels to "no"
    frames.append(temp)

# pd.concat() raises on an empty list, so fall back to an empty DataFrame
myth_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

# Rename the myth columns for clarity.
# 'myth_score' is the score column this cell reads and writes above, so it is renamed
# alongside the other three; the original f'{MYTH}_score' key is kept in case a file
# already uses that name.
myth_df = myth_df.rename(columns={
    'is_myth': f'{MYTH}_is_myth',
    'myth_score': f'{MYTH}_myth_score',
    f'{MYTH}_score': f'{MYTH}_myth_score',
    'is_myth_supports': f'{MYTH}_is_myth_supports',
    'myth_supports_score': f'{MYTH}_myth_supports_score'})

myth_df.tail(10)

### Summarize the numerical data

In [None]:
# Print the number of rows in each class, first for the is_myth label and then
# for the is_myth_supports label, with a blank line after each table
for label_col in (f'{MYTH}_is_myth', f'{MYTH}_is_myth_supports'):
    class_sizes = myth_df.groupby(label_col).size()
    print(class_sizes)
    print()

## Preprocess tweet text

### Tweet Preprocessing

In [None]:
def process_tweets(tweet):
    '''
    Preprocesses raw text of a tweet, skipping any retweets.
    Steps: lower-casing; removing punctuation, newlines, URLs, usernames, and emojis;
    stripping whitespace, replacing hashtags, and finally, lemmatization.

    Uses names from the import cell: `re` (the `regex` package, required for the
    Unicode property classes in the punctuation pattern), `emoji`, and the
    module-level WordNetLemmatizer instance `stemmer`.

    args:
        tweet: raw text of a tweet

    returns:
        string: cleaned tweet text; '' for retweets and for non-string input
    '''

    # Skip retweets and non-strings. Retweets are recognized by a leading 'RT';
    # the check runs before lower-casing, so only upper-case 'RT' counts.
    if not isinstance(tweet, str) or re.search(r'^RT\s+', tweet):
        return ''

    # Convert to lowercase
    tweet = tweet.lower()

    # Remove punctuation (\p{P}) and symbols (\p{S}). The negative look-behind keeps
    # @ # ' / and : so that mentions, hashtags and URLs can still be matched below.
    tweet = re.sub(r"[\p{P}\p{S}](?<![@#\'\/:])", "", tweet)

    # Repair hashtags separated from their word, and replace newlines with spaces
    tweet = tweet.replace("# ", "#").replace("\n", " ")

    # Remove URLs (simple pattern: anything starting with http:// or https://;
    # more complex patterns are listed at https://mathiasbynens.be/demo/url-regex)
    tweet = re.sub(r"https?://\S*", "", tweet)
    # Remove @mentions
    tweet = re.sub(r"@\S*", "", tweet)

    # Collapse whitespace between words and strip it at start & end
    tweet = re.sub(r'\s+', ' ', tweet).strip()

    # Replace #word with word
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)

    # Remove emojis. emoji.get_emoji_regexp() no longer exists in emoji 2.x, so use
    # replace_emoji() when the installed version has it and fall back otherwise.
    if hasattr(emoji, 'replace_emoji'):
        tweet = emoji.replace_emoji(tweet, replace='')
    else:
        tweet = emoji.get_emoji_regexp().sub(u'', tweet)

    # Lemmatization (split() also absorbs any whitespace left by the steps above)
    return ' '.join(stemmer.lemmatize(word) for word in tweet.split())


# Clean every tweet and keep the result next to the raw text
myth_df['text_cleaned'] = myth_df['text'].apply(process_tweets)

### Vectorize texts

In [None]:
# Use TFIDF weighted DTM because does better overall than unweighted
myth_vectorizer = TfidfVectorizer(
    max_features=10000,
    min_df=1,
    max_df=0.8,
    stop_words=stopwords.words('english')) # TFIDF

# Cleaned tweets, one document per DataFrame row
myth_tweets = myth_df['text_cleaned'].tolist()

# creates sparse DTM X_myth
# use X_myth.toarray() to get with zero representation
X_myth = myth_vectorizer.fit_transform(myth_tweets)

# save vectorizer; passing the path lets joblib open and close the file itself
# (the previous open(...) handle was never closed)
joblib.dump(myth_vectorizer, myth_vec_fp)

# get_feature_names() was removed from scikit-learn; get_feature_names_out() is the
# replacement. tolist() turns the returned array into a plain list for printing.
vocabulary_terms = myth_vectorizer.get_feature_names_out().tolist()

print('Number of features in vectorizer (total vocabulary):', len(vocabulary_terms))
print()

print(vocabulary_terms[::100]) # get every 100th word

### Specify the data

In [None]:
# One independent two-column frame (cleaned text + label) per classification target
df_is_myth = myth_df.loc[:, ['text_cleaned', f'{MYTH}_is_myth']].copy()
df_is_myth_supports = myth_df.loc[:, ['text_cleaned', f'{MYTH}_is_myth_supports']].copy()

### Convert No/Yes to [0,1]

In [None]:
def no_yes_convert(convert_df, column_name, has_unsure = False):
    '''
    Converts a 'yes'/'no'/'unsure' label column to float scores.

    The input frame is not modified. The converted column is placed last in the
    returned frame, after all other columns in their original order.

    args
        convert_df: df containing column to convert
        column_name: column to convert from 'yes','no','unsure' to float. Scoring scheme:
            no: 0
            unsure: 0 (treated the same as 'no')
            yes: 1
            any other value: NaN
        has_unsure: boolean, indicates whether convert_df has 'unsure' in column_name.
            Currently unused; kept so existing calls keep working.

    returns
        convert_df itself if the column is already float64, otherwise a new
        DataFrame with column_name converted to float64
    '''

    # Already converted to float
    if convert_df[column_name].dtype == 'float64':
        return convert_df

    label_scores = {'no': 0.0, 'unsure': 0.0, 'yes': 1.0}

    # drop() returns a new frame, so the assignment below never writes into a view
    # of the caller's data
    new_df = convert_df.drop(columns=[column_name])

    # map() works on values and keeps the index, so it is correct for any index
    # (the previous row loop read rows by position counter as if it were a label)
    new_df[column_name] = convert_df[column_name].map(label_scores).astype('float64')

    return new_df

# Replace the yes/no labels of the is_myth frame with float scores
df_is_myth = no_yes_convert(convert_df=df_is_myth, column_name=f'{MYTH}_is_myth')

## Setup for modeling

### Balance x_train, y_train

In [None]:
def resample_data(X_train, Y_train, undersample = False, sampling_ratio = 0.5, random_state = None):
    """
    Randomly over- or undersamples the data to reach a target class ratio, and
    prints the class counts before and after resampling.

    Args:
        X_train: X training data
        Y_train: Y training data
        undersample: if True, undersample with RandomUnderSampler;
            otherwise oversample with RandomOverSampler
        sampling_ratio: ratio of minority to total, strictly between 0 and 1.
            It is converted to the minority/majority ratio passed to the sampler
            as `sampling_strategy`, in both the over- and undersampling case
            (undersampling previously passed the value through unconverted).
            NOTE(review): imbalanced-learn is expected to reject float
            strategies above 1, i.e. sampling_ratio above 0.5 - confirm.
        random_state: optional seed passed to the sampler for reproducible output

    Returns:
        X_balanced: predictors at balanced ratio
        Y_balanced: outcomes at balanced ratio

    Raises:
        ValueError: if sampling_ratio is not strictly between 0 and 1
    """

    if not 0 < sampling_ratio < 1:
        raise ValueError(
            f'sampling_ratio must be strictly between 0 and 1, got {sampling_ratio!r}')

    # minority/total = r  <=>  minority/majority = r / (1 - r)
    real_sampling_ratio = sampling_ratio / (1 - sampling_ratio)

    sampler_class = RandomUnderSampler if undersample else RandomOverSampler
    sampler = sampler_class(
        sampling_strategy=real_sampling_ratio,
        random_state=random_state)
    X_balanced, Y_balanced = sampler.fit_resample(X_train, Y_train)

    print(f'Y_train: {Counter(Y_train)}\nY_resample: {Counter(Y_balanced)}')

    return X_balanced, Y_balanced

In [None]:
def compute_predictions(text, vectorizer_model, class_model):
    '''
    Predicts the label for an input text using a given model trained to classify the texts.
    Uses vectorizer_model to restrict the vocab of the input text so it's consistent with vocab in class_model (avoids errors).

    Only the first row of the model's probability output is used, i.e. the first
    element of `text`.

    Args:
        text: preprocessed text in format of list of sentences, each a str or list of tokens
        vectorizer_model: fitted text vectorizer
        class_model: trained classification model
    Returns:
        None if the model cannot produce probabilities (predict_proba is missing
        or raises) or produces none; otherwise a tuple (label, prob_yes, prob_no):
            label: 'yes' if prob_yes is strictly greater than prob_no, else 'no' (ties give 'no')
            prob_yes: probability in the second column of predict_proba
            prob_no: probability in the first column of predict_proba
    '''

    X = vectorizer_model.transform(text) # create TF-IDF-weighted DTM from text

    try:
        probabilities = class_model.predict_proba(X)
    except Exception:
        # Best effort: a model without usable probabilities gives no prediction.
        # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit through.
        return None

    # Nothing to index into (e.g. no input rows)
    if len(probabilities) == 0:
        return None

    prob_no = probabilities[0][0]
    prob_yes = probabilities[0][1]

    # predicted label is one with greater probability
    label = 'yes' if prob_no < prob_yes else 'no'

    return label, prob_yes, prob_no

### Setup 10-fold cross validation for model evaluation

In [None]:
# Define test options for k-fold CV
num_folds = 10
seed = 3
scoring='f1_weighted' # set scoring metric (not used here)

def show_kfold_output(models,  
                      X, 
                      Y, 
                      df, 
                      text_col, 
                      vectorizer, 
                      num_folds = num_folds, 
                      random_state = seed, 
                      shuffle = True):
    '''
    Evaluates each model with k-fold cross-validated predictions and prints, per model,
    the mean/std of the predicted labels, accuracy, confusion matrix and classification report.
    Uses cross_val_predict, which unlike cross_val_score cannot define scoring option/evaluation metric.

    Each model is also fitted on all of X, Y and used to predict a label for every text in
    df[text_col]. Those predictions are written into df (columns 'prediction',
    'prediction_prob_yes', 'prediction_prob_no'), so df is modified in place and holds the
    predictions of the last model for which this step succeeded.

    Args:
        models: list of (name, model) tuples
        X: predictors
        Y: outcomes (numeric labels)
        df: DataFrame holding the texts to predict; gets the prediction columns added
        text_col: name of the column in df with the preprocessed text
        vectorizer: fitted text vectorizer used to transform the texts in df
        num_folds: Split data randomly into num_folds parts: (num_folds-1) for training, 1 for scoring
        random_state: seed for shuffling the folds (only used when shuffle is True)
        shuffle: whether to shuffle the data before splitting into folds

    Returns:
        results: list of cross-validated prediction arrays, one per model
        names: list of model names (matches results)

    Source: 
        https://stackoverflow.com/questions/40057049/using-confusion-matrix-as-scoring-metric-in-cross-validation-in-scikit-learn
    '''

    results = []
    names = []

    for name, model in models:
        # Print name of model
        print(f'{name}:')
        print()

        # Setup fold options from the function's own parameters (previously the global
        # seed and shuffle=True were used regardless of what the caller passed).
        # KFold rejects a random_state when shuffle is off, so drop it in that case.
        kfold = KFold(
            n_splits=num_folds,
            shuffle=shuffle,
            random_state=random_state if shuffle else None)

        # Fit on all data; this fitted model is what the validation step below uses
        model.fit(X, Y)

        # Get kfold results: out-of-fold predictions for every row
        cv_results = cross_val_predict(
            model,
            X,
            Y,
            cv=kfold,
            #scoring=scoring, 
            n_jobs=-1) # use all cores = faster

        # Add results and name of each algorithm to the model array
        results.append(cv_results)
        names.append(name)

        # Validation step: Predict class probabilities in labeled data, compare to actual labels
        try:
            tqdm.pandas(desc="Computing predictions")
            df[['prediction', 'prediction_prob_yes', 'prediction_prob_no']] = df[text_col].progress_apply(
                lambda text: pd.Series(compute_predictions(sent_tokenize(text), vectorizer, model)))
            print("Distribution of predicted labels:\n", df['prediction'].value_counts()) # show predicted distribution, compare to labeled distribution
            print()
        except Exception as err:
            # Best effort (e.g. models without predict_proba): report and carry on
            # with the cross-validation summary instead of failing silently.
            print(f'Skipped predictions on labeled data for {name}: {type(err).__name__}: {err}')
            print()

        # Print results
        # cv_results holds predicted labels, so this is their mean/std, not a score
        print(f'Mean predicted label (std):\t {round(cv_results.mean(),4)} ({round(cv_results.std(),4)})')
        print('Accuracy:\t', round(accuracy_score(Y, cv_results), 4))
        print()
        print('Confusion matrix:\n', confusion_matrix(Y, cv_results))
        print()
        print('Report:\n', classification_report(Y, cv_results))
        print()

    # Return arrays
    return results, names

## Evaluate algorithms

### Prepare training and validation data

In [None]:
# Split the labeled data into a training set and a final validation set.
# The target (Y) is the second column of df_is_myth, cast to float; the
# predictors are the rows of the document-term matrix X_myth.
# test_size = 0.2 holds out 20% of the labeled data as
# the validation set (X_validate, Y_validate).

test_size = 0.2
seed = 3

valueArray = df_is_myth.values
Y = valueArray[:, 1].astype('float')

split_parts = train_test_split(
    X_myth,
    Y,
    test_size=test_size,
    random_state=seed)
X_train, X_validate, Y_train, Y_validate = split_parts

train_class_counts = Counter(Y_train).most_common()
print(f'Y_train Distribution: {train_class_counts}')

In [None]:
######################################################
# Oversample to desirable ratio
######################################################

undersample = False # whether to undersample or oversample
sampling_ratio = 0.5 # ratio of minority to total cases

# Resampling is applied to the full labeled set (X_myth, Y),
# not to the training split (X_train, Y_train)
X_balanced, Y_balanced = resample_data(
    X_myth,
    Y,
    sampling_ratio=sampling_ratio,
    undersample=undersample)

### 10-Fold Cross Validation

In [None]:
# Use different algorithms to build models: (display name, estimator) pairs
models = [
    ('K-Nearest Neighbors (KNN)', KNeighborsClassifier()),
    ('Random Forest (RF)', RandomForestClassifier(n_estimators=40, random_state=seed)),
    ('Decision Tree (DT)', DecisionTreeClassifier(random_state=seed)),
    ('Multinomial Naive Bayes (MNB)', MultinomialNB()),
    ('Logistic Regression (LR)', LogisticRegression(random_state=seed)),
    ('Support Vector Machine (SVM)', SVC(gamma='auto')),
    ('Multi-Layer Perceptron (MLP)', MLPClassifier(max_iter=100, activation='relu')),
]

# Baseline: distribution of labeled data (after resampling).
# The label uses MYTH instead of a hard-coded topic so it matches the focal myth.
print(f'Distribution of labeled {MYTH} tweets: {Counter(Y_balanced).most_common()}')
print()

# Evaluate algorithms using 10-fold cross validation
# (myth_df receives the prediction columns written by show_kfold_output)
results, names = show_kfold_output(models=models, 
                                   X=X_balanced,
                                   Y=Y_balanced, 
                                   df=myth_df, 
                                   text_col='text_cleaned', 
                                   vectorizer=myth_vectorizer)

In [None]:
# Save best model

# Output file name: classifier_<myth>_<model suffix>_<MMDDYY>.joblib
best_model_suffix = "LR" # this model has best accuracy
thisday = date.today().strftime("%m%d%y")
MYTH_NAME_mod_fp = f'classifier_{MYTH}_{best_model_suffix}_{thisday}.joblib'

# Train on the training split (X_train, Y_train), not on the
# resampled X_balanced/Y_balanced used for cross-validation
best_model = LogisticRegression(random_state=seed)
best_model.fit(X_train, Y_train)

joblib.dump(best_model, MYTH_NAME_mod_fp)

### Inspect featured keywords (Random Forest)

In [None]:
# Feature names from the fitted vectorizer
feature_names = myth_vectorizer.get_feature_names_out()

# Fit a random forest on the resampled data
rf_model = RandomForestClassifier(n_estimators=40, random_state=seed)
rf_model.fit(X_balanced, Y_balanced)

# Forest-level feature importances, plus their spread across the individual trees
importances = rf_model.feature_importances_
per_tree_importances = [tree.feature_importances_ for tree in rf_model.estimators_]
std = np.std(per_tree_importances, axis=0)

# Sanity checks: types and shapes of the feature names and the training matrix
print(type(feature_names))
print(type(X_balanced))

print(X_balanced.shape)
print(feature_names.shape)

# One row per feature, most important first (ties broken by std)
forest_importances = pd.DataFrame(
    {"importances": importances, "std": std},
    index=feature_names,
).sort_values(by=["importances", "std"], ascending=False)

# See top k important features
top_k = 10
top_k_forest_importances = forest_importances.head(top_k)
print(list(top_k_forest_importances.index))

In [None]:
# Bar chart of the top-k importances, with the across-tree std as error bars
fig, ax = plt.subplots()
top_k_forest_importances["importances"].plot(
    kind="bar",
    yerr=top_k_forest_importances["std"],
    ax=ax)
ax.set(title="Feature importances", ylabel="Mean decrease in impurity")
fig.tight_layout()