# TF-IDF and Cosine Similarity

In [1]:
import warnings
warnings.filterwarnings('ignore')
import sys
sys.path.append('/home/elenaruiz/Documents/FNC')
import pandas as pd 
import numpy as np 
from src.utils import io

from src.fake_news_detector.core.encoders import tfidf as tf 
from src.fake_news_detector.core.classificators import SupportVectorMachine as svm_controller

## 1. Import `dataset_content.json`

In [2]:
articles = io.read_json_file('/home/elenaruiz/Documents/FNC/src/data/dataset_content.json')
df = pd.DataFrame(data=articles['articles']) # Put in pandas dataframe

In [38]:
def join_lists(dataset, word_lists):
    result = []
    for _, row in dataset.iterrows():
        text_join = ""
        for feature in word_lists:
            doc_list = row[feature]
            text_join += ' '.join(doc_list)
        result.append(text_join)
    return result  

In [4]:
dataset = pd.DataFrame()
dataset['text'] = join_lists(df, ['all_word'])
dataset['label'] = df['fake']*1
dataset.head()

Unnamed: 0,text,label
0,find corpse vegetarian restaurant Bangkok find...,1
1,switzerland warn authorize extradition politic...,1
2,navarre censor Songs Amaral Shakira song Madma...,1
3,woman pretend blind years greet people Now tru...,1
4,arrested ejaculate boss coffee last four years...,1


## 2. Split datasets

In [5]:
from sklearn.model_selection import train_test_split
df_train, df_test = train_test_split(dataset, test_size=0.2, random_state=42)

## 3. Create vocabulary

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
cv_values = cv.fit_transform(dataset['text'].values)
cv_values.shape

(137, 6713)

In [9]:
# 1. Split datasets in real and false
df_train_real = df_train.loc[df_train['label'] == 0]
df_train_fake = df_train.loc[df_train['label'] == 1]

In [10]:
# 2. Transform each group from text to vocabulary
cv_train_real = cv.transform(df_train_real['text'])
cv_train_fake = cv.transform(df_train_fake['text'])
print(cv_train_real.shape,cv_train_fake.shape)

(52, 6713) (57, 6713)


## 4. Create TF-IDF models

In [8]:
from sklearn.feature_extraction.text import TfidfTransformer

In [11]:
# 3. Create models for each
tfidf_model_real = TfidfTransformer(use_idf=True).fit(cv_train_real)
tfidf_model_fake = TfidfTransformer(use_idf=True).fit(cv_train_fake)

In [39]:
def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """get the feature names and tf-idf score of top n items"""
    
    #use only topn items from vector
    sorted_items = sorted_items[:topn]
 
    score_vals = []
    feature_vals = []
    
    # word index and corresponding tf-idf score
    for idx, score in sorted_items:
        
        #keep track of feature name and its corresponding score
        score_vals.append(round(score, 3))
        feature_vals.append(feature_names[idx])
 
    #create a tuples of feature,score
    #results = zip(feature_vals,score_vals)
    results= {}
    for idx in range(len(feature_vals)):
        results[feature_vals[idx]]=score_vals[idx]
    
    return results, feature_vals

def sort_coo(coo_matrix):
    tuples = zip(coo_matrix.col, coo_matrix.data)
    return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)

### 4.1 Fake news relevant words
We want to get the top relevant words of Fake News documents by the TF-IDF create, so we compute from fake news text to tfidf weights.

In [21]:
tf_words_fake_train = tfidf_model_fake.transform(cv_train_fake)

Now, we will sort the words of dictionary by weitgh and get the top_n more relevant words:

In [22]:
topn = 200
results_fake, top_fake_words = tf.get_topn_relevant_words(cv, tf_words_fake_train, topn)

In [40]:
sorted_items=sort_coo(tf_words_fake_train.tocoo())
feature_names =cv.get_feature_names()
results, top_fake_words = extract_topn_from_vector(feature_names, sorted_items, 600)

In [41]:
results

{'rice': 0.689,
 'cheese': 0.637,
 'dog': 0.624,
 'restaurant': 0.5,
 'ikea': 0.5,
 'switzerland': 0.481,
 'crocodiles': 0.48,
 'sexual': 0.458,
 'foreign': 0.442,
 'songs': 0.438,
 'cor': 0.435,
 'wax': 0.428,
 'guitar': 0.424,
 'day': 0.155,
 'water': 0.152,
 'beach': 0.41,
 'echenique': 0.41,
 'libya': 0.409,
 'semen': 0.405,
 'store': 0.4,
 'prisoners': 0.379,
 'alejandro': 0.395,
 'blind': 0.39,
 'museum': 0.385,
 'pastor': 0.384,
 'cold': 0.383,
 'women': 0.202,
 'drivers': 0.192,
 'use': 0.149,
 'military': 0.373,
 'crush': 0.371,
 'feminist': 0.37,
 'ugly': 0.368,
 'pp': 0.366,
 'guindos': 0.36,
 'melilla': 0.359,
 'police': 0.153,
 'forest': 0.355,
 'minimum': 0.354,
 'control': 0.206,
 'vallecas': 0.346,
 'office': 0.344,
 'abedi': 0.341,
 'children': 0.161,
 'feminism': 0.34,
 'cents': 0.337,
 'airline': 0.337,
 'muslim': 0.324,
 'que': 0.163,
 'wage': 0.322,
 'madrid': 0.14,
 'families': 0.321,
 'extradition': 0.321,
 'flag': 0.314,
 'vagina': 0.314,
 'parent': 0.314,
 'ger

### 4.2 Real news relevant words
It needs to do the same process but with real news articles.

In [46]:
tf_words_real_train = tfidf_model_real.transform(cv_train_real)
topn = 200
results_real, top_real_words = tf.get_topn_relevant_words(cv, tf_words_real_train, topn)

In [47]:
results_real

{'bitcoin': 0.766,
 'mw': 0.578,
 'columbus': 0.495,
 'attack': 0.478,
 'purchase': 0.264,
 'bbva': 0.406,
 'maroto': 0.442,
 'vox': 0.439,
 'education': 0.437,
 'bettong': 0.426,
 'degree': 0.423,
 'valeria': 0.418,
 'frb': 0.414,
 'burst': 0.414,
 'passengers': 0.409,
 'calviño': 0.406,
 'manicure': 0.4,
 'listen': 0.399,
 'operations': 0.382,
 'sesame': 0.379,
 'netflix': 0.355,
 'price': 0.237,
 'catalonia': 0.353,
 'borrell': 0.351,
 'waistcoats': 0.351,
 'liceu': 0.349,
 'conservatory': 0.349,
 'statue': 0.347,
 'bet': 0.344,
 'liter': 0.341,
 'abascal': 0.334,
 'choke': 0.333,
 'congress': 0.244,
 'death': 0.328,
 'survivors': 0.326,
 'political': 0.324,
 'motion': 0.324,
 'washington': 0.323,
 'panda': 0.323,
 'giant': 0.323,
 'syria': 0.321,
 'ambulances': 0.321,
 'cuba': 0.32,
 'movistar': 0.319,
 'workers': 0.318,
 'yellow': 0.318,
 'authority': 0.317,
 'january': 0.315,
 'unemployment': 0.311,
 'artists': 0.305,
 'disinformation': 0.305,
 'hathaway': 0.303,
 'moreno': 0.303

### 4.4 Transform to TF-IDF test data for future

In [None]:
# But first translate to tfdif encoding test data
# 1. Split
df_test_real = df_test.loc[df_test['label'] == 0]
df_test_fake = df_test.loc[df_test['label'] == 1]
# 2. To count vectorizer
cv_test_real = cv.transform(df_test_real['text'].values)
cv_test_fake = cv.transform(df_test_fake['text'].values)

# 3. To tfidf
tf_words_real_test = tfidf_model_real.transform(cv_test_real)
tf_words_fake_test = tfidf_model_fake.transform(cv_test_fake)

## 5 Compute cosine similarity

In [26]:
from sklearn.metrics.pairwise import cosine_similarity as cs

def get_cosine_similarity(q1_csc, q2_csc):
    cosine_sim = []
    for i,j in zip(q1_csc, q2_csc):
        sim = cs(i,j)
        cosine_sim.append(sim[0][0])
    
    return cosine_sim
    

In [43]:
top_real_words_coded = cv.transform([' '.join(top_real_words)])
top_fake_words_coded = cv.transform([' '.join(top_fake_words)])
df_train['cv_value'] = cv.transform(df_train['text'])

In [44]:
df_train['cos_fake'] = df_train['label']*0.000000000005
df_train['cos_real'] = df_train['label']*0.000000000005
for index, row in df_train.iterrows():
    to_number = cv.transform([row['text']])
    cosine_sim_fake = get_cosine_similarity(top_fake_words_coded, to_number)
    cosine_sim_real = get_cosine_similarity(top_real_words_coded, to_number)
    df_train.at[index,'cos_fake'] = cosine_sim_fake[0]
    df_train.at[index,'cos_real'] = cosine_sim_real[0]

In [45]:
df_train[['cos_real', 'cos_fake']].head()

Unnamed: 0,cos_real,cos_fake
10,0.030054,0.154907
130,0.089502,0.147831
51,0.01913,0.108463
11,0.024804,0.142764
114,0.035079,0.141631


## 6. Classification
### 6.1 Get optimal top_n words to classify with SVM

After show the process of how create two list of relevant words for each type of news, it is now implement on function `get_relevant_word_lists` which return both lists, from real and from fake. The important input for the function is `top_n` where will be variable and test on SVM models to see with which value the model can classify better the articles.

Now we process to implement a loop of SVM trainings:

In [None]:
top_n_values = list(range(50, 4000, 50))

In [None]:

y_train = df_train['label'].values
y_test = df_test['label'].values
X_train = df_train[['cos_real', 'cos_fake']].values
X_test = df_test[['cos_real', 'cos_fake']].values
    
all_scores = []
for top_n in top_n_values:
    res, top_real_words = t.get_topn_relevant_words(cv, tf_train_real, top_n)
    res, top_fake_words = t.get_topn_relevant_words(cv, tf_train_fake, top_n)
    # Encode top words
    cv_top_real_words = cv.transform([' '.join(top_real_words)])
    cv_top_fake_words = cv.transform([' '.join(top_fake_words)])
    
    # Cosine similarity of train
    compute_similarity(df_train, cv, cv_top_real_words, cv_top_fake_words)
    # Cosine similarity of text
    compute_similarity(df_test, cv, cv_top_real_words, cv_top_fake_words)
    
    # Update X values
    X_train = df_train[['cos_real', 'cos_fake']].values
    X_test = df_test[['cos_real', 'cos_fake']].values
    # Run SVM
    scores = svm_controller.run_optimals_models(X_train, y_train, X_test, y_test, False)
    scores.append(top_n) 
    all_scores.append(scores)
    


In [None]:
df_scores = pd.DataFrame(data=all_scores, columns=['kernel_training', 'training', 'kernel_validation', 'validation', 'top_n'])    

In [None]:
df_scores.sort_values(['validation'], ascending=[True])

In [None]:
import matplotlib.pyplot as plt
from  matplotlib.colors import ListedColormap
X_set, y_set = X_test, Y_test

aranged_pc1 = np.arange(start = X_set[:, 0].min(), stop = X_set[:, 0].max(), step = 0.01)
aranged_pc2 = np.arange(start = X_set[:, 1].min(), stop = X_set[:, 1].max(), step = 0.01)

X1, X2 = np.meshgrid(aranged_pc1, aranged_pc2)
plt.contourf(X1, X2, models['rbf'].predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
alpha = 0.5, cmap = ListedColormap(('orange', 'green')))

plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                c = ListedColormap(('red', 'green'))(i), label = j)
plt.title('SVM prediction from LDA transformation')
plt.xlabel('LD1')
plt.ylabel('LD2')
plt.legend()
plt.show()