In [46]:
import openai
import os
import pandas as pd # data manipulation and analysis
import numpy as np # python lib for working with arrays, linear algebra
from sklearn.model_selection import train_test_split # create 2 subsets of the data (training and testing)
from sklearn.metrics import accuracy_score # accuracy of correctly classified among all samples.
from sklearn.metrics import classification_report # generating a human-readable text report
from sklearn.feature_extraction.text import TfidfVectorizer
import string 
import re # determine if a given text fits the given regular expression.
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [34]:
data_fake = pd.read_csv('Datasets/Fake.csv')
data_true = pd.read_csv('Datasets/True.csv')
data_fake.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [35]:
data_true.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


Add a label column to the data and concatenate the two datasets

In [36]:
data_fake['label'] = 0
data_true['label'] = 1

Drop all other columns except text and label

In [37]:
data_true = data_true[['text','label']]
data_fake = data_fake[['text','label']]

Concat and merge the 2 datasets

In [38]:
data = pd.concat([data_fake, data_true])

In [39]:
data.tail()

Unnamed: 0,text,label
21412,BRUSSELS (Reuters) - NATO allies on Tuesday we...,1
21413,"LONDON (Reuters) - LexisNexis, a provider of l...",1
21414,MINSK (Reuters) - In the shadow of disused Sov...,1
21415,MOSCOW (Reuters) - Vatican Secretary of State ...,1
21416,JAKARTA (Reuters) - Indonesia will buy 11 Sukh...,1


In [40]:
data_fake.shape, data_true.shape # returns the shape of an array which is a tuple of integers containing entries 

((23481, 2), (21417, 2))

data_fake contains 23481 rows and 2 columns
data_true contains 21417 rows and 2 columns

In [41]:
print(data_fake.index)
print(data_true.index)

RangeIndex(start=0, stop=23481, step=1)
RangeIndex(start=0, stop=21417, step=1)


Clean datasets

In [42]:
data.isnull().sum() # returns the number of missing values in each column

text     0
label    0
dtype: int64

In [43]:
data['label'].value_counts() # returns the number of unique values (0 and 1) in the column

0    23481
1    21417
Name: label, dtype: int64

Shuffles the data to avoid bias

In [44]:
data = data.sample(frac = 1)

- Remove stopwords 
- Remove non-alphabetic characters with (" ")
- convert to lower case
- Convert to base words

In [48]:
lemmatizer = WordNetLemmatizer() # convert all the words to their base word or root word. eg. running -> run
stopwords = stopwords.words('english')

Function to preprocess the data

In [49]:
def clean_data(text):
    text = text.lower() # convert all the words to lower case
    text = re.sub('[^a-zA-Z]' , ' ' , text) # remove all the special characters
    token = row.split() # split the text into words
    token = [lemmatizer.lemmatize(word) for word in token if not word in stopwords]  # remove all the stopwords
    clean_news = ' '.join(news) # join the words to form a sentence
    
    return clean_news 

In [None]:
data['text'] = data['text'].apply(lambda x : clean_data(x)) # apply the clean_data function to the text column

# For testing purposes later:
#for i in range(len(data['text'])):
#    data['text'][i] = clean_data(data['text'][i])

- Count vectorization (frequency of words in the text) followed by the Tf-Idf transformation
- Term Frequency (TF) is the number of times a term/word occurs in the document divided by the number of words in the document.
- Inverse Document Frequency (IDF) is the number of documents divided by the number of documents containing the word with log applied to it.
- The top 5000 most frequent features will be included in the output matrix.
- The value is the product of Term Frequency and the Inverse Document Frequency.
- ngram_range sets the range of n-grams to be included in the feature matrix. In this case, it is set to include both unigrams (single words) and bigrams (pairs of consecutive words) in the output.


In [None]:
vectorizer = TfidfVectorizer(max_features  =  5000, lowercase=False, ngram_range=(1,2) 

Split the data into training and testing sets
- data: the dataset to split into training and testing sets
- test_size: the proportion of the dataset to use for testing (e.g. 0.2 for 20%)
- random_state: the seed used by the random number generator for reproducibility

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    data['text'],  # the feature (text) to be used for training and testing
    data['label'],  # the target (label) to be predicted for training and testing
    test_size=0.2,  # use 20% of the data for testing
    random_state=42  # use a fixed seed for reproducibility
)

Train the TF-IDF vectorizer

Set up OpenAI API

In [None]:
openai.api_key = os.environ["OPENAI_API_KEY"]

Prompts to generate a news article based on user input which asks for what the news should be about.

In [None]:
about = input("What would you like news about?\n")
article = openai.Completion.create(
    engine="text-davinci-002",
    prompt= "Genrate a news article about the following " + about,
    max_tokens=2048,
    n=1,
    stop=None,
    temperature=0.7,
)

Vectorize the generated article and test data

- Vectorization is a process of converting text data into a format that can be used for machine learning.
- We are converting the article's text into a numerical format that the computer can work with.
- Vectorize the test data, to convert the text of the news articles in the test set into numerical format.

In [None]:
article_vect = vectorizer.transform([article.choices[0].text])
X_test_vect = vectorizer.transform(X_test)

The dot product helps measure the similarity between the generated article and each article in the test data by multiplying their corresponding numerical vectors together and adding up the results.

In [None]:
similarities = article_vect.dot(X_test_vect.T)

Get the index of the most similar article

In [None]:
most_similar_index = similarities.argmax()

Get the label of the most similar article

In [None]:
y_pred = y_test.iloc[most_similar_index]

- Print the generated article
- Test the accuracy of the generated article
- Print the classification report

In [None]:
print("\nGenerated news: \n", article.choices[0].text)

# predict the label based on the similarity score
y_pred = [1 if i == similarities.argmax() else 0 for i in range(len(X_test))]

# calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
threshold = 0.5
accuracy_bool = accuracy > threshold
print("Accuracy: ", accuracy_bool)

print("\nClassification report: \n", classification_report(y_test, y_pred))