# Importing the libs

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import nltk
import contractions
from textblob import TextBlob
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, ConfusionMatrixDisplay, roc_curve
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB, ComplementNB
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
import pickle

# !pip install contractions
# Download the NLTK data used by the preprocessing helpers below:
#   stopwords       -> remove_stopwords
#   punkt/punkt_tab -> word_tokenize (which of the two is needed depends on
#                      the installed NLTK release -- TODO confirm and trim)
#   wordnet         -> WordNetLemmatizer
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet'):
    nltk.download(nltk_resource)

# Defined methods

In [None]:
def clean_text(text):
    """Strip URLs, non-ASCII characters, HTML tags and punctuation.

    The steps run in this order:
      1. remove every "http..." run of non-whitespace characters,
      2. drop all characters that cannot be encoded as ASCII (emojis etc.),
      3. remove HTML tags,
      4. remove everything that is neither a word character nor whitespace.

    Args:
        text: the raw string to clean.

    Returns:
        The cleaned string. Whitespace is left untouched, so removed
        fragments may leave consecutive spaces behind.
    """
    # \S matches any character that is not a space, tab or newline.
    text = re.sub(r'http\S+', '', text)
    # 'ignore' silently skips characters that have no ASCII representation.
    text = text.encode('ascii', 'ignore').decode('ascii')
    # Remove tags one at a time: [^>]* stops at the first '>', so the text
    # between an opening and a closing tag is kept. A greedy '<.*>' would
    # delete everything from the first '<' to the last '>' on the line.
    text = re.sub(r'<[^>]*>', '', text)
    # Remove punctuation.
    text = re.sub(r'[^\w\s]', '', text)
    return text

def remove_stopwords(text):
    """Return ``text`` without English stop words.

    The text is split with ``word_tokenize``; a token is dropped when its
    lower-cased form is in NLTK's English stop-word list. The remaining
    tokens keep their original casing and are joined with single spaces.
    """
    tokens = word_tokenize(text)
    stop_set = set(stopwords.words('english'))
    kept = [token for token in tokens if token.lower() not in stop_set]
    return ' '.join(kept)

def spell_check_and_correction(text):
    """Return ``text`` after ``TextBlob.correct()``, converted to ``str``."""
    return str(TextBlob(text).correct())

def to_lower(s):
    """Return a lower-cased copy of ``s``."""
    lowered = s.lower()
    return lowered

def expanding_contractions(text):
    """Return ``text`` with its contractions expanded by ``contractions.fix``."""
    expanded = contractions.fix(text)
    return expanded

def lemmatization(text):
    """Lemmatize every token of ``text``.

    The text is tokenized with ``nltk.word_tokenize``, each token is passed
    through ``WordNetLemmatizer.lemmatize`` and the results are joined with
    single spaces.
    """
    tokens = nltk.word_tokenize(text)
    wnl = WordNetLemmatizer()
    return ' '.join([wnl.lemmatize(token) for token in tokens])

In [None]:
def preprocess_text(text):
    """Run ``text`` through the preprocessing pipeline and return the result.

    Steps, in order: clean_text, expanding_contractions, remove_stopwords,
    to_lower, lemmatization. Each step receives the previous step's output.
    spell_check_and_correction is currently not applied.
    """
    pipeline = (
        clean_text,
        expanding_contractions,
        remove_stopwords,
        to_lower,
        lemmatization,
    )
    for step in pipeline:
        text = step(text)
    return text

# Testing the methods
# print(preprocess_text("print they should pay all the back all the money plus interest the entire family and everyone who came in with them need to be deported asap why didn't it take two years to bust them here we go again another group stealing from the government and taxpayers a group of somalis stole over four million in government benefits over just  months weve reported on numerous cases like this one where the muslim refugeesimmigrants commit fraud by scamming our systemits way out of control more related"))

# Data analysis

In [None]:
# Load the raw dataset. Raw string: the backslashes are Windows path
# separators, not escapes ("\." is an invalid escape sequence and triggers a
# warning on recent Python versions). The path value itself is unchanged.
df = pd.read_csv(r"..\..\dataset.csv", encoding="latin1")
df

In [None]:
# print pandas' summary of the DataFrame
df.info()

In [None]:
# count how many rows carry each distinct value of the label column
df["label"].value_counts()

In [None]:
# count the missing (NaN) values in each column
df.isna().sum()

In [None]:
# count the duplicated rows
df.duplicated().sum()

In [None]:
# report the smallest and largest number of whitespace-separated words per row
word_count = df['text'].str.split().str.len()
for extreme, value in (("minimum", word_count.min()), ("maximum", word_count.max())):
    print(f"The {extreme} number of words: {int(value)}")

# Data preprocessing

In [None]:
# drop every row that contains a missing (NaN) value, then confirm none remain
# (note: dropna is called with its defaults here; nothing in this cell handles +/-inf)
df = df.dropna()
df.isna().sum()

In [None]:
# cast the label column to int, then show the per-label row counts
df['label'] = df['label'].astype(int)
df['label'].value_counts()

In [None]:
# drop the duplicated rows, then confirm none remain
df = df.drop_duplicates()
df.duplicated().sum()

In [None]:
# shuffle the rows and renumber the index.
# The fixed seed makes the row order reproducible; the train/test split
# further down uses shuffle=False, so the row order decides which rows end up
# in the test set.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
df

In [None]:
# run the preprocessing pipeline on every article
df["text"] = df["text"].apply(preprocess_text)
# drop any row that contains NaN
df = df.dropna()
df

In [None]:
# keep only the rows whose word count lies between 100 and 580 (inclusive)
word_count = df['text'].str.split().str.len()
print(f"The minimum number of words before filtering: {int(word_count.min())}")
# between() is inclusive on both ends, i.e. the same rows as gt(99) & lt(581)
df = df[word_count.between(100, 580)]
word_count = df['text'].str.split().str.len()
print(f"The minimum number of words after filtering: {int(word_count.min())}")

In [None]:
# horizontal boxplot of the per-row word counts
plt.figure(figsize=(15, 5))
box_artists = plt.boxplot(df['text'].str.split().str.len(), labels=["Series 1"], vert=False, patch_artist=True)
# the plot is horizontal (vert=False): values run along x, the series name sits on y
plt.xlabel("Value")
plt.ylabel("Series Name")
plt.title("Boxplot of Series Data")
plt.xticks(rotation=45)  # Rotate x-axis labels for better readability (optional)
plt.grid(True)  # Add grid lines (optional)
# Customize box appearance (optional): colour the boxes through the artists
# returned by boxplot rather than relying on the order of the axes' children
for box in box_artists['boxes']:
    box.set_facecolor('lightblue')
plt.show()

In [None]:
import plotly.express as px

# Pie chart of how the rows are distributed over the label values.
label_counts = df['label'].value_counts()
label_values = label_counts.index.to_list()  # the distinct label values
value_counts = label_counts.to_list()        # how often each one occurs
fig = px.pie(names=label_values, values=value_counts)

# Slice styling: two fixed colours with a black outline, percentages as text.
slice_marker = dict(
    colors=['gold', 'mediumturquoise'],
    line=dict(color='#000000', width=2),
)
fig.update_traces(
    hoverinfo='label+percent',
    textinfo='percent',
    textfont_size=20,
    marker=slice_marker,
)

# White text on a black background.
layout_options = {
    "title_text": "Label column pie chart",
    "title_font_color": "white",
    "paper_bgcolor": "black",
    "font_color": "white",
}
fig.update_layout(**layout_options)
fig.show()

In [None]:
# display the current DataFrame
df

In [None]:
# Save the processed frame. Raw string: the backslashes are Windows path
# separators, not escapes ("\." is an invalid escape sequence); the path value
# itself is unchanged.
df.to_csv(r"..\..\df_processed.csv", index=False)

In [None]:
# Reload the processed frame. Raw string: the backslashes are Windows path
# separators, not escapes ("\." is an invalid escape sequence); the path value
# itself is unchanged.
df = pd.read_csv(r"..\..\df_processed.csv")
# df = df.dropna()
df

# TF-IDF

In [7]:
# Build the TF-IDF matrix over unigrams and bigrams.
# TfidfVectorizer combines CountVectorizer and TfidfTransformer in one object,
# so the parameters below reproduce the former two-step setup. Keeping the
# fitted object in `vect` matters: the cells further down inspect
# vect.vocabulary_, pickle `vect` and call vect.transform() in fake_news, and
# they raise NameError when `vect` is never defined.
# NOTE(review): the vectorizer is fitted on the full dataset before the
# train/test split, so the vocabulary and IDF weights also see the test rows;
# consider fitting on the training rows only.
vect = TfidfVectorizer(ngram_range=(1, 2), smooth_idf=False)
tfidf = vect.fit_transform(df['text'].values)
print(tfidf)

  (0, 6612036)	0.0619587354084424
  (0, 6611593)	0.06429091619755256
  (0, 6611432)	0.04724602950799684
  (0, 6585801)	0.02574934413884835
  (0, 6585176)	0.011437539463616937
  (0, 6529870)	0.06649099809187176
  (0, 6528747)	0.06795006078307725
  (0, 6525786)	0.031018958332892397
  (0, 6402655)	0.06649099809187176
  (0, 6400758)	0.025813994387668692
  (0, 6211160)	0.05511065908561874
  (0, 6210215)	0.023115318086619978
  (0, 6159496)	0.02875546972828243
  (0, 6155127)	0.013609568528639909
  (0, 6053597)	0.07701458614993596
  (0, 6053560)	0.05759201758990124
  (0, 5985533)	0.07701458614993596
  (0, 5981079)	0.043281211088827556
  (0, 5977559)	0.023006644621119527
  (0, 5966832)	0.06717991604005859
  (0, 5964733)	0.019127101425529987
  (0, 5946362)	0.06264765335662922
  (0, 5945633)	0.024172464334061027
  (0, 5828282)	0.07701458614993596
  (0, 5828237)	0.042671688236287725
  :	:
  (95938, 320111)	0.0264313033197721
  (95938, 308328)	0.04406000709609513
  (95938, 308024)	0.061854135642587

In [None]:
# inspect the vocabulary_ attribute of the fitted vectorizer `vect`
vect.vocabulary_

# Splitting into training set and test set

In [8]:
# hold out 20% of the rows as the test set; shuffle=False keeps the existing row order
x_train, x_test, y_train, y_test = train_test_split(tfidf, df['label'], test_size=0.2 , shuffle=False)

In [9]:
def train(model, name):
    """Fit ``model`` on the training split and report its scores.

    Reads the module-level ``x_train``, ``y_train``, ``x_test`` and ``y_test``.

    Args:
        model: an estimator exposing ``fit(X, y)`` and ``score(X, y)``.
        name: label used in the printed messages.

    Returns:
        The score on the test split as a percentage rounded to 3 decimals.
    """
    model.fit(x_train, y_train)
    train_acc = round(model.score(x_train, y_train) * 100, 3)
    # Score the test split once and reuse the value for both the message and
    # the return value; scoring twice doubles the prediction cost.
    test_acc = round(model.score(x_test, y_test) * 100, 3)
    print(f"Training accuracy of {name} is {train_acc}%")
    print(f"Testing accuracy of {name} is {test_acc}%")
    print()
    return test_acc
    
def conf_matrix(model):
    """Draw the confusion matrix of ``model`` on the module-level test split."""
    ConfusionMatrixDisplay.from_estimator(estimator=model, X=x_test, y=y_test)
    
def class_report(model):
    """Print the classification report of ``model`` on the module-level test split."""
    predictions = model.predict(x_test)
    report = classification_report(y_test, predictions)
    print(report)

# Passive Aggressive Classifier

In [None]:
# fit the Passive Aggressive classifier (max_iter=1000), keep its test score,
# then show its confusion matrix and classification report
pac = PassiveAggressiveClassifier(max_iter = 1000)
pac_acc = train(pac, "Passive Aggresive Classifier")
conf_matrix(pac)
class_report(pac)

# Logistic Regression

In [None]:
# fit Logistic Regression with default settings, keep its test score,
# then show its confusion matrix and classification report
lr = LogisticRegression()
lr_acc = train(lr, "Logistic Regression")
conf_matrix(lr)
class_report(lr)

# Stochastic Gradient Descent

In [None]:
# fit the SGD classifier (modified Huber loss, alpha=0.0001, max_iter=1000),
# keep its test score, then show its confusion matrix and classification report
sgd = SGDClassifier(loss='modified_huber', alpha=0.0001, max_iter=1000)
sgd_acc = train(sgd, "Stochastic Gradient Descent")
conf_matrix(sgd)
class_report(sgd)

# Naive Bayes

### 1- Multinomial NB

In [None]:
# fit Multinomial Naive Bayes (alpha=0.8), keep its test score,
# then show its confusion matrix and classification report
mnb = MultinomialNB(alpha=0.8, fit_prior=True, force_alpha=True)
mnb_acc = train(mnb, "Multinomial NB")
conf_matrix(mnb)
class_report(mnb)

### 2- Complement NB

In [None]:
# fit Complement Naive Bayes with default settings, keep its test score,
# then show its confusion matrix and classification report
cnb = ComplementNB()
cnb_acc = train(cnb, "Complement NB")
conf_matrix(cnb)
class_report(cnb)

# Decision Tree

In [None]:
# fit a Decision Tree with default settings, keep its test score,
# then show its confusion matrix and classification report
dt = DecisionTreeClassifier()
dt_acc = train(dt, "Decision Tree")
conf_matrix(dt)
class_report(dt)

# Support Vector Classification

In [None]:
# fit an RBF-kernel SVC (gamma=0.1, C=0.1), keep its test score,
# then show its confusion matrix and classification report
svc = SVC(kernel="rbf", gamma=0.1, C=0.1)
svc_acc = train(svc, "Support Vector Classification")
conf_matrix(svc)
class_report(svc)

# Random Forest

In [None]:
# fit a Random Forest of 5 trees (fixed random_state), keep its test score,
# then show its confusion matrix and classification report
rf = RandomForestClassifier(n_estimators=5, random_state=42)
rf_acc = train(rf, "Random Forest")
conf_matrix(rf)
class_report(rf)

# Choosing the best model

In [None]:
# Bar chart comparing the test accuracy of every trained model.
models = [
    "Passive Aggressive", "Logistic Regression", "Decision Tree", "Random Forest",
    "SGD", "SVC", "Multinomial NB", "Complement NB",
]
# Commented-out hard-coded accuracies, kept for reference:
# dt_acc = 82.9
# rf_acc = 84
# svc_acc = 80
# pac_acc = 81.5
# lr_acc = 85.8
# sgd_acc = 86.2
# mnb_acc = 78.8
# cnb_acc = 78.63
accuracies = [pac_acc, lr_acc, dt_acc, rf_acc, sgd_acc, svc_acc, mnb_acc, cnb_acc]
colors = ["blue", "green", "orange", "yellow", "red", "violet", "black", "gold", "mediumturquoise"]

plt.figure(figsize=(12.5, 5))
plt.bar(models, accuracies, color=colors)
plt.xlabel("Models")
plt.ylabel("Accuracy")
plt.title("Model Accuracy Comparison")
# write each accuracy value at the top of its bar
for position, accuracy in enumerate(accuracies):
    plt.text(position, accuracy, accuracy, ha='center')
plt.tight_layout()
plt.show()

### Clearly SGD has the highest accuracy, so we will save it for deployment

# Save the TF-IDF and model objects

In [None]:
# save the fitted vectorizer, then load it back from disk.
# `with` closes each file handle as soon as the dump/load is done.
with open('vect.pkl', 'wb') as vect_file:
    pickle.dump(vect, vect_file)
with open('vect.pkl', 'rb') as vect_file:
    vect = pickle.load(vect_file)

In [None]:
# save the SGD classifier -- the model picked for deployment -- and load it
# back as `model`. Dumping `lr` here would store the logistic-regression model
# under the name sgd.pkl.
# `with` closes each file handle as soon as the dump/load is done.
with open('sgd.pkl', 'wb') as model_file:
    pickle.dump(sgd, model_file)
with open('sgd.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

# Classifying a single news article

In [None]:
def fake_news(article):
    """Classify one raw news article and return the predicted label.

    The article is preprocessed with ``preprocess_text``, vectorized with the
    module-level ``vect`` and passed to the module-level ``model``; the single
    resulting prediction is returned. The calling cells in this file print
    "FALSE NEWS!!!" when the result equals 0.
    """
    cleaned = preprocess_text(article)
    features = vect.transform([cleaned])
    return model.predict(features)[0]

In [None]:
ans = fake_news("""Andrea Tantaros, a former Fox News host, charged in a lawsuit filed Monday that top executives at the network, including the man who replaced Roger Ailes, punished her for complaining about sexual harassment by Mr. Ailes. The suit by Ms. Tantaros, filed in New York State Supreme Court in Manhattan, is the latest round in a contentious volley that began in late winter, when Fox claimed she had breached her employment contract by writing a book without receiving network approval. â€œFox News masquerades as a defender of traditional family values, but behind the scenes, it operates like a   Playboy   cult, steeped in intimidation, indecency and misogyny,â€ Ms. Tantarosâ€™s suit says. Fox News said it would not comment on pending litigation. Mr. Ailes, the networkâ€™s founding chairman and guiding force for two decades, resigned last month after a former anchor, Gretchen Carlson, said in a suit that she was fired for refusing his sexual advances. Mr. Ailes has denied all allegations of harassment. In April, the chief lawyer for Fox charged that Ms. Tantaros had concocted sexual harassment claims to gain leverage in the contract dispute her lawyer, Judd Burstein, said the book dispute was a pretext that Fox was using to silence her. During arbitration, Mr. Burstein said, Fox News offered to pay her a sum â€œin the seven figuresâ€ if she renounced claims against Mr. Ailes and others at the network, including the host Bill Oâ€™Reilly. According to the lawsuit, Ms. Tantaros said she had been subjected to unwelcome advances from Mr. Oâ€™Reilly, whom she had regarded as a friend and adviser. â€œAiles did not act alone,â€ the lawsuit states. â€œHe may have been the primary culprit, but his actions were condoned by his most senior lieutenants, who engaged in a concerted effort to silence Tantaros by threats, humiliation and retaliation. â€ Ms. 
Tantaros also claimed in the lawsuit that she was the subject of humiliating posts by pseudonymous accounts on Twitter known as â€œsock puppetsâ€ that she says were instigated by the Fox News publicity department. Ms. Tantaros joined Fox as a contributor in 2010, and a year later was named   of â€œThe Five,â€ which aired at 5 p. m. She said in the suit that she was repeatedly told by Fox executives that she could not wear pants on the air because â€œRoger wants to see your legs. â€ The lawsuit goes on to say that on Aug. 12, 2014, Mr. Ailes called her into his office and asked if she was planning to marry and have children. â€œAiles then started complaining about marriage in general, and also made   jokes about being married,â€ the lawsuit states. It describes Mr. Ailes as speculating on the sexual habits and preferences of 10 Fox News personalities. He asked Ms. Tantaros to turn around â€œso I can get a good look at you,â€ the lawsuit charges, adding that Ms. Tantaros refused. Soon after, she was moved from â€œThe Fiveâ€ to a   show, â€œOutnumbered,â€ that aired at midday. Mr. Ailes called her back for similar sessions in December 2014 and February 2015, the lawsuit charges, and when she continued to rebuff him, she encountered hostility from the Fox News publicity department. In the February meeting, she said, Mr. Ailes talked about how she would look in a bikini, and accused her of ending a   relationship because she had been merely using the man. The episode brought her to tears, the lawsuit states. She said the sole interview arranged by the publicity department during that period was with a writer for a blog controlled by Fox, who asked about her breasts and if she was difficult to work with. In April 2015, the lawsuit states, Ms. Tantaros met with Bill Shine, then a senior news executive and close aide to Mr. Ailes. She said that she told him about the meetings with Mr. 
Ailes and asked if he had told the head of publicity for Fox News, Irena Briganti, to go after her. The lawsuit claims that Mr. Shine â€œtold Tantaros that Briganti is like a rabid dog on a chain that we canâ€™t control. Sometimes that dog gets off the chain. â€ Then, pointing to a picture of Mr. Ailes on a magazine cover, the lawsuit charges, Mr. Shine told her that â€œthis powerful man has faith in Irena Brigantiâ€ and that Ms. Tantaros â€œneeds to let this one go. â€ Mr. Shine, through a spokeswoman, has said that Ms. Tantaros never approached him about Mr. Ailes harassing her. Mr. Shine was named   of Fox News after Mr. Ailes departed.""")
# print the verdict: label 0 is reported as fake, anything else as true
print("FALSE NEWS!!!" if ans == 0 else "TRUE NEWS")
# false

In [None]:
ans = fake_news("""House Dem Aide: We Didnâ€™t Even See Comeyâ€™s Letter Until Jason Chaffetz Tweeted It By Darrell Lucus on October 30, 2016 Subscribe Jason Chaffetz on the stump in American Fork, Utah ( image courtesy Michael Jolley, available under a Creative Commons-BY license) 
With apologies to Keith Olbermann, there is no doubt who the Worst Person in The World is this weekâ€“FBI Director James Comey. But according to a House Democratic aide, it looks like we also know who the second-worst person is as well. It turns out that when Comey sent his now-infamous letter announcing that the FBI was looking into emails that may be related to Hillary Clintonâ€™s email server, the ranking Democrats on the relevant committees didnâ€™t hear about it from Comey. They found out via a tweet from one of the Republican committee chairmen. 
As we now know, Comey notified the Republican chairmen and Democratic ranking members of the House Intelligence, Judiciary, and Oversight committees that his agency was reviewing emails it had recently discovered in order to see if they contained classified information. Not long after this letter went out, Oversight Committee Chairman Jason Chaffetz set the political world ablaze with this tweet. FBI Dir just informed me, "The FBI has learned of the existence of emails that appear to be pertinent to the investigation." Case reopened 
â€” Jason Chaffetz (@jasoninthehouse) October 28, 2016 
Of course, we now know that this was not the case . Comey was actually saying that it was reviewing the emails in light of â€œan unrelated caseâ€â€“which we now know to be Anthony Weinerâ€™s sexting with a teenager. But apparently such little things as facts didnâ€™t matter to Chaffetz. The Utah Republican had already vowed to initiate a raft of investigations if Hillary winsâ€“at least two yearsâ€™ worth, and possibly an entire termâ€™s worth of them. Apparently Chaffetz thought the FBI was already doing his work for himâ€“resulting in a tweet that briefly roiled the nation before cooler heads realized it was a dud. 
But according to a senior House Democratic aide, misreading that letter may have been the least of Chaffetzâ€™ sins. That aide told Shareblue that his boss and other Democrats didnâ€™t even know about Comeyâ€™s letter at the timeâ€“and only found out when they checked Twitter. â€œDemocratic Ranking Members on the relevant committees didnâ€™t receive Comeyâ€™s letter until after the Republican Chairmen. In fact, the Democratic Ranking Members didnâ€™ receive it until after the Chairman of the Oversight and Government Reform Committee, Jason Chaffetz, tweeted it out and made it public.â€ 
So letâ€™s see if weâ€™ve got this right. The FBI director tells Chaffetz and other GOP committee chairmen about a major development in a potentially politically explosive investigation, and neither Chaffetz nor his other colleagues had the courtesy to let their Democratic counterparts know about it. Instead, according to this aide, he made them find out about it on Twitter. 
There has already been talk on Daily Kos that Comey himself provided advance notice of this letter to Chaffetz and other Republicans, giving them time to turn on the spin machine. That may make for good theater, but there is nothing so far that even suggests this is the case. After all, there is nothing so far that suggests that Comey was anything other than grossly incompetent and tone-deaf. 
What it does suggest, however, is that Chaffetz is acting in a way that makes Dan Burton and Darrell Issa look like models of responsibility and bipartisanship. He didnâ€™t even have the decency to notify ranking member Elijah Cummings about something this explosive. If that doesnâ€™t trample on basic standards of fairness, I donâ€™t know what does. 
Granted, itâ€™s not likely that Chaffetz will have to answer for this. He sits in a ridiculously Republican district anchored in Provo and Orem; it has a Cook Partisan Voting Index of R+25, and gave Mitt Romney a punishing 78 percent of the vote in 2012. Moreover, the Republican House leadership has given its full support to Chaffetzâ€™ planned fishing expedition. But that doesnâ€™t mean we canâ€™t turn the hot lights on him. After all, he is a textbook example of what the House has become under Republican control. And he is also the Second Worst Person in the World. About Darrell Lucus 
Darrell is a 30-something graduate of the University of North Carolina who considers himself a journalist of the old school. An attempt to turn him into a member of the religious right in college only succeeded in turning him into the religious right's worst nightmare--a charismatic Christian who is an unapologetic liberal. His desire to stand up for those who have been scared into silence only increased when he survived an abusive three-year marriage. You may know him on Daily Kos as Christian Dem in NC . Follow him on Twitter @DarrellLucus or connect with him on Facebook . Click here to buy Darrell a Mello Yello. Connect""")
# print the verdict: label 0 is reported as fake, anything else as true
print("FALSE NEWS!!!" if ans == 0 else "TRUE NEWS")
# true