In [None]:
import openai
import os
import pandas as pd # data manipulation and analysis
import numpy as np # python lib for working with arrays, linear algebra
import seaborn as sns # data exploration and visualisation
import matplotlib.pyplot as plt # data visulisation and graphical charting
from sklearn.model_selection import train_test_split # create 2 subsets of the data (training and testing)
from sklearn.metrics import accuracy_score # accuracy of correctly classified among all samples.
from sklearn.metrics import classification_report # generating a human-readable text report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import string 
import re # determine if a given text fits the given regular expression 

In [None]:
# Read the fake-news and real-news corpora from the local Datasets/ folder.
data_fake, data_true = (
    pd.read_csv(csv_path)
    for csv_path in ('Datasets/Fake.csv', 'Datasets/True.csv')
)
# Preview the first rows of the fake-news frame.
data_fake.head()

In [None]:
# Preview the first rows of the real-news frame.
data_true.head()

Add a label column to the data and concatenate the two datasets

In [None]:
# Tag every article with its class before merging: 0 = fake, 1 = true.
data_fake['label'] = 0
data_true['label'] = 1
# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it both
# source frames contribute their own 0-based row labels, so index values repeat and
# label-based lookups such as data.loc[i] return more than one row.
data = pd.concat([data_fake, data_true], ignore_index=True)

In [None]:
data_fake.shape, data_true.shape # (rows, columns) tuple for each of the two frames

data_fake contains 23481 rows and 5 columns
data_true contains 21417 rows and 5 columns

In [None]:
# Show the row index of each source frame, fake first and then true.
for frame in (data_fake, data_true):
    print(frame.index)

Split the data into training and testing sets
- data: the dataset to split into training and testing sets
- test_size: the proportion of the dataset to use for testing (e.g. 0.2 for 20%)
- random_state: the seed used by the random number generator for reproducibility

In [None]:
# Hold out 20% of the articles for testing. The article text is the feature and the
# 0/1 label is the target; the fixed seed makes the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    data['text'], data['label'],
    random_state=42,
    test_size=0.2,
)

Train the TF-IDF vectorizer

In [None]:
# Build a TF-IDF vectorizer with default settings.
vectorizer = TfidfVectorizer()
# Learn the vocabulary and IDF weights from the training texts only, and encode
# those texts as a sparse document-term matrix in the same pass.
X_train_vect = vectorizer.fit_transform(X_train)

Set up OpenAI API

In [None]:
# Read the key from the environment instead of hard-coding it in the notebook.
# Raises KeyError if OPENAI_API_KEY is not set.
openai.api_key = os.environ["OPENAI_API_KEY"]

Ask the user what the news should be about, then prompt the OpenAI completion model to generate a news article on that topic.

In [None]:
# Ask the user for a topic; strip stray whitespace so it does not leak into the prompt.
about = input("What would you like news about?\n").strip()
# Request a single completion. The ": " separator after "following" keeps the topic
# from being glued directly onto the instruction text.
article = openai.Completion.create(
    engine="text-davinci-002",
    prompt="Generate a news article about the following: " + about,
    max_tokens=2048,  # upper bound on the length of the generated text
    n=1,  # one candidate article is enough
    stop=None,
    temperature=0.7,
)

Vectorize the generated article and test data

- Vectorization is a process of converting text data into a format that can be used for machine learning.
- We are converting the article's text into a numerical format that the computer can work with.
- Vectorize the test data, to convert the text of the news articles in the test set into numerical format.

In [None]:
# Encode the held-out articles and the generated article with the vectorizer fitted
# on the training split (transform only, so the vocabulary is not refitted).
X_test_vect = vectorizer.transform(X_test)
# transform expects an iterable of documents, hence the one-element list.
article_vect = vectorizer.transform([article.choices[0].text])

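The dot product measures the similarity between the generated article and each article in the test data: the corresponding entries of the two numerical vectors are multiplied together and the results are added up. Because `TfidfVectorizer` L2-normalises every vector by default, this dot product is the cosine similarity between the two articles.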

In [None]:
# One similarity score per test article: matrix product of the generated article's
# TF-IDF row with the transposed test matrix.
similarities = article_vect @ X_test_vect.T

Get the index of the most similar article

In [None]:
# Position, within the test split, of the article with the highest similarity score.
most_similar_index = similarities.argmax()

Get the label of the most similar article

In [None]:
# Positional lookup of that nearest test article's label (0 = fake, 1 = true).
y_pred = y_test.iloc[most_similar_index]

- Print the generated article
- Test the accuracy of the generated article
- Print the classification report

The line similarities.argmax() returns the index of the test article that is most similar to the generated article.

If i is equal to the index of the most similar article, it sets the prediction to 1 (indicating that the article is true), otherwise, it sets the prediction to 0 (indicating that the article is fake).

In [75]:
print("\nGenerated news: \n", article.choices[0].text)

# Verdict for the generated article: it inherits the label of the single most
# similar test article (0 = fake, 1 = true).
generated_label = y_test.iloc[similarities.argmax()]
print("\nPredicted label for generated article: ", "true" if generated_label == 1 else "fake")

# There is no ground-truth label for the generated article, so accuracy is measured
# for the same nearest-neighbour rule on held-out data: every test article is given
# the label of its most similar *training* article. The similarity matrix is built
# in row batches so that only one slice of it is in memory at a time.
train_labels = y_train.to_numpy()
batch_size = 256
y_pred = []
for start in range(0, X_test_vect.shape[0], batch_size):
    batch_sims = X_test_vect[start:start + batch_size] @ X_train_vect.T
    # argmax(axis=1) on a sparse matrix yields a column matrix; flatten to 1-D.
    nearest_train = np.asarray(batch_sims.argmax(axis=1)).ravel()
    y_pred.extend(train_labels[nearest_train].tolist())

print("\nAccuracy: ", accuracy_score(y_test, y_pred))

print("\nClassification report: \n", classification_report(y_test, y_pred))



Generated news: 
 

Dolphins are transforming into other species, according to a new study.

Scientists have long known that dolphins are some of the most intelligent animals on the planet. But a new study has found that they may be even more flexible than we thought, with the ability to change their appearance to match other species.

The study, published in the journal BMC Evolutionary Biology, looked at two groups of dolphins in the Gulf of Mexico. One group had been exposed to other dolphin species for a long time, while the other had been isolated from them.

The researchers found that the dolphins in the first group had changed their appearance to match the other species, while the dolphins in the second group had not. This suggests that dolphins have the ability to change their appearance to fit in with their surroundings.

The study's lead author, Dr. Arianna Di Loreto, said that the findings could have important implications for conservation. "If dolphins can change their app