# Importing the libs

In [1]:
import numpy as np
import pandas as pd
import re
from textblob import TextBlob
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import accuracy_score , ConfusionMatrixDisplay , classification_report , roc_curve
from lazypredict.Supervised import LazyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
import pickle
import nltk
import contractions
# !pip install lazypredict
# !pip install contractions
nltk.download('stopwords')
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\alihi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Defined methods

In [2]:
def clean_text(text):
    # \S matches any character that is not a space tab newline
    text = re.sub(r'http\S+', '', text)
    #remove emojis
    text = text.encode('ascii', 'ignore').decode('ascii')
    # igore => specifies how to handle characters that cannot be represented in ASCII
    # remove htmk tags  => . any char except newline
    text = re.sub(r'<.*>', '', text)
    # remove punctiuations
    text = re.sub(r'[^\w\s]', '', text)
    return text

def remove_stopwords(text):
    words = word_tokenize(text)
    english_stopwords = set(stopwords.words('english'))
    filtered_words = []
    for word in words:
        if word.lower() not in english_stopwords:
            filtered_words.append(word)
    filtered_text = ' '.join(filtered_words)
    return filtered_text

def spell_check_and_correction(text):
    blob = TextBlob(text)
    corrected_text = str(blob.correct())
    return corrected_text

def to_lower(s):
    return s.lower()

def expanding_contractions(text):
#     text = contractions.fix(text)
    return contractions.fix(text)

def lemmatization(text):
    text_tokenized = []
    
    words = nltk.word_tokenize(text) #Tokenize

    for word in words:
        text_tokenized.append(lemmatizer.lemmatize(word)) #lemmatize
    
    text_tokenized = ' '.join(text_tokenized)
    
    return text_tokenized

In [3]:
def preprocess_text(text):
    text = clean_text(text)
#     text = spell_check_and_correction(text)
    text = expanding_contractions(text)
    text = remove_stopwords(text)
    text = to_lower(text)
    text = lemmatization(text)
    return text

# Testing the methods
# print(preprocess_text("print they should pay all the back all the money plus interest the entire family and everyone who came in with them need to be deported asap why didn't it take two years to bust them here we go again another group stealing from the government and taxpayers a group of somalis stole over four million in government benefits over just  months weve reported on numerous cases like this one where the muslim refugeesimmigrants commit fraud by scamming our systemits way out of control more related"))

# Data analysis

In [23]:
df = pd.read_csv("..\\dataset.csv",encoding = "latin1")
df.head(10)

Unnamed: 0,text,label
0,Print They should pay all the back all the mon...,0.0
1,Why Did Attorney General Loretta Lynch Plead T...,0.0
2,Red State : \nFox News Sunday reported this mo...,0.0
3,Email Kayla Mueller was a prisoner and torture...,0.0
4,Email HEALTHCARE REFORM TO MAKE AMERICA GREAT ...,0.0
5,Print Hillary goes absolutely berserk! She exp...,0.0
6,BREAKING! NYPD Ready To Make Arrests In Weiner...,0.0
7,BREAKING! NYPD Ready To Make Arrests In Weiner...,0.0
8,\nLimbaugh said that the revelations in the Wi...,0.0
9,Email \nThese people are sick and evil. They w...,0.0


In [24]:
# specify a small information about the data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156790 entries, 0 to 156789
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   text    153247 non-null  object 
 1   label   151791 non-null  float64
dtypes: float64(1), object(1)
memory usage: 2.4+ MB


In [25]:
# display the number of each category with its data type
df["label"].value_counts()

label
0.00    84308
1.00    67483
Name: count, dtype: int64

In [26]:
# display the number of the nan values in each column
df.isna().sum()

text     3543
label    4999
dtype: int64

In [27]:
# display the number of the duplicated rows
df.duplicated().sum()

16621

In [28]:
word_count = int(df['text'].str.split().str.len())
print(f"The minimum number of words: {word_count.min()}")
print(f"The maximum number of words: {word_count.max()}")

The minimum number of words: 0.0
The maximum number of words: 5965.0


In [29]:
# true = np.count_nonzero(df['label'] == 1)
# false = len(df['label']) - true
# plt.pie([true , false], labels= ["TRUE" , "FALSE"], autopct="%1.1f%%" )
# plt.show(block=False)

# Data preprocssing

In [30]:
# remove nan and -inf values
df = df.dropna()
df.isna().sum()

text     0
label    0
dtype: int64

In [31]:
# change the type of the label column to int
df['label'] = df['label'].astype(int)
df['label'].value_counts()

label
0    84232
1    67401
Name: count, dtype: int64

In [32]:
# drop the duplicates
df = df.drop_duplicates()
df.duplicated().sum()

0

In [33]:
# shuffle the rows
df = df.sample(frac=1)
df = df.reset_index(drop=True)
df

Unnamed: 0,text,label
0,"In 2014, I wrote an article about New Hampshi...",0
1,War on Christmas update: Some very clever peop...,0
2,trump possible dhs pick threaten destroy medic...,0
3,sense giant earthquake happen today report cop...,0
4,trump putin discuss support un peace process s...,1
...,...,...
138665,unprecedented endorsement nonpartisan humane s...,0
138666,WASHINGTON (Reuters) - President Donald Trump ...,1
138667,Republicans are now gunning for the Ninth Circ...,0
138668,Reuters \nU.S. Defense Secretary Ash Carter on...,1


In [34]:
df["text"] = df["text"].apply(preprocess_text)
df = df.dropna()
df

Unnamed: 0,text,label
0,2014 wrote article new hampshire swingy congre...,0
1,war christmas update clever people asking join...,0
2,trump possible dhs pick threaten destroy medic...,0
3,sense giant earthquake happen today report cop...,0
4,trump putin discus support un peace process sy...,1
...,...,...
138665,unprecedented endorsement nonpartisan humane s...,0
138666,washington reuters president donald trump said...,1
138667,republican gunning ninth circuit court effort ...,0
138668,reuters u defense secretary ash carter wednesd...,1


In [35]:
# filter the rows with less than 20 words
print(f"The minimum number of words before filtering: {word_count.min()}")
df = df[word_count >= 200]
word_count = df['text'].str.split().str.len() # int(df['text'].str.split().str.len())
print(f"The minimum number of words before filtering: {word_count.min()}")

The minimum number of words before filtering: 0.0
The minimum number of words before filtering: 0


In [36]:
import plotly.express as px
# Count the occurrences of 0 and 1 in the 'label' column
label_counts = df['label'].value_counts()
# Extract the actual values (0 and 1) as a list
label_values = label_counts.index.to_list()
# Extract the counts (occurrences) as a list
value_counts = label_counts.to_list()
# Create the pie chart
fig = px.pie(values=value_counts, names=label_values)
fig.update_traces(hoverinfo='label+percent',
                  textinfo='percent', 
                  textfont_size=20,
                  marker=dict(colors=['gold', 'mediumturquoise'], 
                              line=dict(color='#000000', width=2)))
fig.update_layout(
    title_text="Label column",
    title_font_color="white",
    paper_bgcolor="black",
    font_color="white") 
# Add data labels with percentages (optional)
# fig.update_traces(textposition='inside', textinfo='percent+n')  # Adjust position and content
fig.show()

In [37]:
df.to_csv("..\\df_processed.csv", index=False)

In [38]:
df = pd.read_csv("..\\df_processed.csv")
df = df.dropna()
df

Unnamed: 0,text,label
0,war christmas update clever people asking join...,0
1,trump possible dhs pick threaten destroy medic...,0
2,trump putin discus support un peace process sy...,1
3,lumberton n c brittany graham confident hurric...,0
4,daily traditionalist calais cancer metastasize...,0
...,...,...
91408,jerusalem reuters israeli defence minister avi...,1
91409,radioactive fukushima wood becomes power germa...,0
91410,republican gunning ninth circuit court effort ...,0
91411,reuters u defense secretary ash carter wednesd...,1


In [39]:
print(len(df["text"]))
print(len(df["label"]))

91413
91413


In [40]:
# print(newdf.iloc[9,:])
# #newdf['label'] = pd.to_numeric(newdf['label'], errors='coerce')
# #newdf.iloc[9,:]
# newdf = newdf.dropna()
# print(newdf.iloc[9,:])

# TF-IDF

In [41]:
# transformer = TfidfTransformer(smooth_idf=True)
# count_vectorizer = CountVectorizer(ngram_range=(1, 2))
# counts = count_vectorizer.fit_transform(df['text'].values)
# tfidf = transformer.fit_transform(counts)
vect = TfidfVectorizer()
tfidf = vect.fit_transform(df['text'].values)
print(tfidf)

ValueError: np.nan is an invalid document, expected byte or unicode string.

In [None]:
vect.vocabulary_

# Spliting to training set and test set

In [None]:
x_train, x_test, y_train, y_test = train_test_split(tfidf, df['label'], test_size=0.2 , shuffle=False)

In [None]:
def train(model , model_name):
    model.fit(x_train,y_train)
    print(f"Training accuracy of {model_name} is {round(model.score(x_train,y_train) * 100, 2)}%")
    print(f"testing accuracy of {model_name} is {round(model.score(x_test,y_test) * 100, 2)}")
    
def conf_matrix(model):
    ConfusionMatrixDisplay.from_estimator(model, x_test, y_test)
    
def class_report(model):
    print(classification_report(y_test, model.predict(x_test)))

# Passive Aggresive Classifier

In [None]:
from sklearn.linear_model import PassiveAggressiveClassifier
pac = PassiveAggressiveClassifier(max_iter = 50)
# Fitting on the training set
pac.fit(x_train, y_train)
# Predicting on the test set
y_pred = pac.predict(x_test)
score = accuracy_score(y_test, y_pred)
print(f"Accuracy: {round(score * 100, 2)}%")

# Logistic Regression

In [None]:
model_lr = LogisticRegression()
train(model_lr, "Logistic Regression")
conf_matrix(model_lr)
class_report(model_lr)

# Decision Tree

In [None]:
DT = DecisionTreeClassifier()
train(DT, "Decision Tree")
conf_matrix(DT)
class_report(DT)

# SGD

In [None]:
from sklearn.linear_model import SGDClassifier
SGDC = SGDClassifier(loss='modified_huber', alpha=0.01, max_iter=1000, random_state=42)
train(SGDC, "SGDC")
conf_matrix(SGDC)
class_report(SGDC)

# Support Vector Classification

In [None]:
# from sklearn.svm import SVC
# from sklearn.pipeline import make_pipeline
# from sklearn.preprocessing import StandardScaler
# clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))
# clf.fit(x_train, y_train)
from sklearn.svm import SVC
SVC = SVC(kernel="rbf", gamma=0.5, C=1.0)
train(SVC, "SVC")
conf_matrix(SVC)
class_report(SVC)

# Naive Bayes

### MultinomialNB

In [None]:
from sklearn.naive_bayes import MultinomialNB, GaussianNB
mnb = MultinomialNB(alpha=0.8, fit_prior=True, force_alpha=True)
train(mnb, "mnb")
conf_matrix(mnb)
class_report(mnb)

### GaussianNB

In [None]:
gnb = GaussianNB()
train(gnb, "gnb")
conf_matrix(gnb)
class_report(gnb)

In [None]:
# clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)

# models , predictions = clf.fit(x_train, x_test, y_train, y_test)

# print(models)

# Save the vectorizor and model

In [None]:
pickle.dump(vect, open('vect.pkl', 'wb'))
vect = pickle.load(open('vect.pkl', 'rb'))

In [None]:
pickle.dump(pac, open('pac.pkl', 'wb'))
model = pickle.load(open('pac.pkl', 'rb'))

# Classifing a single news article

In [None]:
def fake_news(article):
    article = preprocess_text(article)
    article = [article]
    tfidf = vect.transform(article)
    prediction = model.predict(tfidf)
    return prediction[0] # or return prediction[0]

In [None]:
ans = fake_news("""In these trying times, Jackie Mason is the Voice of Reason. [In this week’s exclusive clip for Breitbart News, Jackie discusses the looming threat of North Korea, and explains how President Donald Trump could win the support of the Hollywood left if the U. S. needs to strike first.  “If he decides to bomb them, the whole country will be behind him, because everybody will realize he had no choice and that was the only thing to do,” Jackie says. “Except the Hollywood left. They’ll get nauseous. ” “[Trump] could win the left over, they’ll fall in love with him in a minute. If he bombed them for a better reason,” Jackie explains. “Like if they have no transgender toilets. ” Jackie also says it’s no surprise that Hollywood celebrities didn’t support Trump’s strike on a Syrian airfield this month. “They were infuriated,” he says. “Because it might only save lives. That doesn’t mean anything to them. If it only saved the environment, or climate change! They’d be the happiest people in the world. ” Still, Jackie says he’s got nothing against Hollywood celebs. They’ve got a tough life in this country. Watch Jackie’s latest clip above.   Follow Daniel Nussbaum on Twitter: @dznussbaum """)
if(ans == 0):
    print("FALSE NEWS!!!")
else:
    print("TRUE NEWS")

In [None]:
ans = fake_news("""Why the Truth Might Get You Fired October 29, 2016 
The tension between intelligence analysts and political policymakers has always been between honest assessments and desired results, with the latter often overwhelming the former, as in the Iraq War, writes Lawrence Davidson. 
By Lawrence Davidson 
For those who might wonder why foreign policy makers repeatedly make bad choices, some insight might be drawn from the following analysis. The action here plays out in the United States, but the lessons are probably universal. 
Back in the early spring of 2003, George W. Bush initiated the invasion of Iraq. One of his key public reasons for doing so was the claim that the countryâ€™s dictator, Saddam Hussein, was on the verge of developing nuclear weapons and was hiding other weapons of mass destruction. The real reason went beyond that charge and included a long-range plan for â€œregime changeâ€ in the Middle East. President George W. Bush and Vice President Dick Cheney receive an Oval Office briefing from CIA Director George Tenet. Also present is Chief of Staff Andy Card (on right). (White House photo) 
For our purposes, we will concentrate on the belief that Iraq was about to become a hostile nuclear power. Why did President Bush and his close associates accept this scenario so readily? 
The short answer is Bush wanted, indeed needed, to believe it as a rationale for invading Iraq. At first he had tried to connect Saddam Hussein to the 9/11 attacks on the U.S. Though he never gave up on that stratagem, the lack of evidence made it difficult to rally an American people, already fixated on Afghanistan, to support a war against Baghdad. 
But the nuclear weapons gambit proved more fruitful, not because there was any hard evidence for the charge, but because supposedly reliable witnesses, in the persons of exiled anti-Saddam Iraqis (many on the U.S. governmentâ€™s payroll ), kept telling Bush and his advisers that the nuclear story was true. 
What we had was a U.S. leadership cadre whose worldview literally demanded a mortally dangerous Iraq, and informants who, in order to precipitate the overthrow of Saddam, were willing to tell the tale of pending atomic weapons. The strong desire to believe the tale of a nuclear Iraq lowered the threshold for proof . Likewise, the repeated assertions by assumed dependable Iraqi sources underpinned a nationwide U.S. campaign generating both fear and war fever. 
So the U.S. and its allies insisted that the United Nations send in weapons inspectors to scour Iraq for evidence of a nuclear weapons program (as well as chemical and biological weapons). That the inspectors could find no convincing evidence only frustrated the Bush administration and soon forced its hand. 
On March 19, 2003, Bush launched the invasion of Iraq with the expectation was that, once in occupation of the country, U.S. inspectors would surely find evidence of those nukes (or at least stockpiles of chemical and biological weapons). They did not. Their Iraqi informants had systematically lied to them. 
Social and Behavioral Sciences to the Rescue? 
The various U.S. intelligence agencies were thoroughly shaken by this affair, and today, 13 years later, their directors and managers are still trying to sort it out â€“ specifically, how to tell when they are getting â€œtrueâ€ intelligence and when they are being lied to. Or, as one intelligence worker has put it, we need â€œ help to protect us against armies of snake oil salesmen. â€ To that end the CIA et al. are in the market for academic assistance. Ahmed Chalabi, head of the Iraqi National Congress, a key supplier of Iraqi defectors with bogus stories of hidden WMD. 
A â€œpartnershipâ€ is being forged between the Office of the Director of National Intelligence (ODNI), which serves as the coordinating center for the sixteen independent U.S. intelligence agencies, and the National Academies of Sciences, Engineering and Medicine . The result of this collaboration will be a â€œ permanent Intelligence Community Studies Boardâ€ to coordinate programs in â€œsocial and behavioral science research [that] might strengthen national security .â€ 
Despite this effort, it is almost certain that the â€œsocial and behavioral sciencesâ€ cannot give the spy agencies what they want â€“ a way of detecting lies that is better than their present standard procedures of polygraph tests and interrogations. But even if they could, it might well make no difference, because the real problem is not to be found with the liars. It is to be found with the believers. 
The Believers 
It is simply not true, as the ODNI leaders seem to assert, that U.S. intelligence agency personnel cannot tell, more often than not, that they are being lied to. This is the case because there are thousands of middle-echelon intelligence workers, desk officers, and specialists who know something closely approaching the truth â€“ that is, they know pretty well what is going on in places like Afghanistan, Iraq, Syria, Libya, Israel, Palestine and elsewhere. Director of National Intelligence James Clapper (right) talks with President Barack Obama in the Oval Office, with John Brennan and other national security aides present. (Photo credit: Office of Director of National Intelligence) 
Therefore, if someone feeds them â€œsnake oil,â€ they usually know it. However, having an accurate grasp of things is often to no avail because their superiors â€“ those who got their appointments by accepting a pre-structured worldview â€“ have different criterion for what is â€œtrueâ€ than do the analysts. 
Listen to Charles Gaukel, of the National Intelligence Council â€“ yet another organization that acts as a meeting ground for the 16 intelligence agencies. Referring to the search for a way to avoid getting taken in by lies, Gaukel has declared, â€œ Weâ€™re looking for truth. But weâ€™re particularly looking for truth that works. â€ Now what might that mean? 
I can certainly tell you what it means historically. It means that for the power brokers, â€œtruthâ€ must match up, fit with, their worldview â€“ their political and ideological precepts. If it does not fit, it does not â€œwork.â€ So the intelligence specialists who send their usually accurate assessments up the line to the policy makers often hit a roadblock caused by â€œgroup think,â€ ideological blinkers, and a â€œwe know betterâ€ attitude. 
On the other hand, as long as what youâ€™re selling the leadership matches up with what they want to believe, you can peddle them anything: imaginary Iraqi nukes, Israel as a Western-style democracy, Saudi Arabia as an indispensable ally, Libya as a liberated country, Bashar al-Assad as the real roadblock to peace in Syria, the Strategic Defense Initiative (SDI) aka Star Wars, a world that is getting colder and not warmer, American exceptionalism in all its glory â€“ the list is almost endless. 
What does this sad tale tell us? If you want to spend millions of dollars on social and behavioral science research to improve the assessment and use of intelligence, forget about the liars. What you want to look for is an antidote to the narrow-mindedness of the believers â€“ the policymakers who seem not to be able to rise above the ideological presumptions of their class â€“ presumptions that underpin their self-confidence as they lead us all down slippery slopes. 
It has happened this way so often, and in so many places, that it is the source of Shakespeareâ€™s determination that â€œwhat is past, is prelude.â€ Our elites play out our destinies as if they have no free will â€“ no capacity to break with structured ways of seeing. Yet the middle-echelon specialists keep sending their relatively accurate assessments up the ladder of power. Hope springs eternal.""")
if(ans == 0):
    print("FALSE NEWS!!!")
else:
    print("TRUE NEWS")