In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score #is used to evaluate the accuracy of a classification model
from sklearn.metrics import classification_report #generates a comprehensive report to evaluate the performance of a classification model. 
#It provides key metrics such as precision, recall, F1-score, and support for each class

import re#re module in Python stands for "regular expressions". 
#It provides support for working with regular expressions, which are powerful tools for pattern matching and text manipulation.

import string #string module in Python provides a collection of constants (like strings of ASCII characters) 
#and useful classes (like Template) that are commonly used in various string operations

In [3]:
# Load the two raw CSV files from the working directory:
# "Fake.csv" becomes data_fk and "True.csv" becomes data_tr.
data_fk = pd.read_csv(filepath_or_buffer="Fake.csv")
data_tr = pd.read_csv(filepath_or_buffer="True.csv")

In [None]:
# Show the first rows of data_fk, then the same trailing blank lines as before.
print(data_fk.head(), '\n', sep='\n')

In [None]:
# Show the first rows of data_tr, then the same trailing blank lines as before.
print(data_tr.head(), '\n', sep='\n')

In [None]:
# Add the target column to both frames: 0 on data_fk rows, 1 on data_tr rows.
data_fk['class'], data_tr['class'] = 0, 1

In [None]:
# DataFrame.shape is a (rows, columns) tuple; print it for both frames on one line.
print(f"{data_fk.shape} {data_tr.shape}", '\n', sep='\n')

In [None]:
# Hold out the last 10 rows of each frame for manual testing, then remove
# those same rows from the frames that go on to training.
#
# The rows are dropped by the index labels of the held-out slice rather than
# by hard-coded row numbers. The previous loop for data_tr used
# range(21416, 23406, -1), which is empty (start < stop with a negative step),
# so the held-out true rows were never removed and leaked into training.
#
# .copy() detaches the held-out frames from the originals so later column
# assignments on them do not raise SettingWithCopyWarning.
#
# NOTE: this cell is not idempotent; each re-run removes 10 more rows.
data_fk_manual_testing = data_fk.tail(10).copy()
data_fk.drop(data_fk_manual_testing.index, axis=0, inplace=True)

data_tr_manual_testing = data_tr.tail(10).copy()
data_tr.drop(data_tr_manual_testing.index, axis=0, inplace=True)


In [None]:
# Print both (rows, columns) shapes again after the hold-out rows were dropped.
print(f"{data_fk.shape} {data_tr.shape}", '\n', sep='\n')

In [None]:
# Make sure the held-out frames carry the target label (0 = fake, 1 = true).
# .assign() returns a new frame instead of writing into what may be a slice
# of the original data, which avoids pandas' SettingWithCopyWarning.
# 'class' is a Python keyword, so it is passed through a dict.
data_fk_manual_testing = data_fk_manual_testing.assign(**{'class': 0})
data_tr_manual_testing = data_tr_manual_testing.assign(**{'class': 1})

In [None]:
# Show the first rows of the held-out true-news frame.
print(data_tr_manual_testing.head(), '\n', sep='\n')

In [None]:
# Show the first rows of the held-out fake-news frame.
print(data_fk_manual_testing.head(), '\n', sep='\n')

In [None]:
# Stack the fake rows on top of the true rows (axis=0 concatenates along rows),
# then preview the combined frame.
data_merge = pd.concat(objs=[data_fk, data_tr], axis=0)
print(data_merge.head(), '\n', sep='\n')

In [None]:
# List the column labels of the merged frame.
print(data_merge.columns, '\n', sep='\n')

In [None]:
# Drop the 'title', 'subject' and 'date' columns; every other column is kept.
data = data_merge.drop(columns=['title', 'subject', 'date'])

In [None]:
# Count missing values per column (isna is the same check as isnull).
print(data.isna().sum(), '\n', sep='\n')

In [None]:
# Shuffle the rows. DataFrame.sample draws rows at random; frac=1 returns
# 100% of them, i.e. the same rows in a random order.
# random_state pins the permutation so that Restart & Run All reproduces the
# same ordering (and therefore the same downstream metrics).
data = data.sample(frac=1, random_state=42)

In [None]:
# Preview the first rows after shuffling.
print(data.head(), '\n', sep='\n')

In [None]:
# Replace the shuffled index with a fresh 0..n-1 index in place.
# drop=True discards the old index instead of first materialising it as an
# 'index' column and then deleting that column.
data.reset_index(drop=True, inplace=True)

In [None]:
# List the remaining column labels.
print(data.columns, '\n', sep='\n')

In [None]:
# Preview the first rows after the index reset.
print(data.head(), '\n', sep='\n')

In [None]:
def wordopt(text):
    """Normalise a raw article string into lowercase, space-separated words.

    Steps, in order:
      1. lowercase the text;
      2. remove square-bracketed spans such as "[photo]";
      3. remove URLs (http://..., https://..., www....);
      4. remove HTML-style tags such as "<b>";
      5. turn every run of non-alphanumeric characters (punctuation,
         underscores, newlines, other whitespace) into a single space;
      6. remove any token that contains a digit;
      7. collapse repeated whitespace and strip the ends.

    URLs and tags are removed *before* punctuation is stripped, because their
    patterns rely on characters like ':', '/', '.', '<' and '>'. Separators
    are replaced by a space rather than deleted, so word boundaries survive
    and a downstream tokenizer still sees individual words.

    Args:
        text: the string to clean (must support ``.lower()``).

    Returns:
        The cleaned string; may be empty.
    """
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    # \W does not cover '_', so it is listed explicitly.
    text = re.sub(r'[\W_]+', ' ', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return re.sub(r'\s+', ' ', text).strip()
                 

In [None]:
# Run the text cleaner over every article, element by element.
data['text'] = data['text'].map(wordopt)

In [None]:
# x holds the cleaned article text (features); y holds the 0/1 class label (target).
x, y = data['text'], data['class']

In [None]:
# Hold out 25% of the rows for evaluation. random_state fixes the split so
# the reported scores are reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=42
)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the TF-IDF vectorizer on the training text only, then reuse the fitted
# vectorizer to transform the test text.
vectorization = TfidfVectorizer()
xv_train, xv_test = (
    vectorization.fit_transform(x_train),
    vectorization.transform(x_test),
)

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with default settings, trained on the TF-IDF features.
lr = LogisticRegression()
lr.fit(X=xv_train, y=y_train)

In [None]:
# Predicted class labels from the logistic-regression model on the test features.
pred_lr = lr.predict(X=xv_test)

In [None]:
# Score of the logistic-regression model on the held-out test set.
print(lr.score(xv_test, y_test), '\n', sep='\n')

In [None]:
# Classification report for the logistic-regression predictions.
print(classification_report(y_test, pred_lr), '\n', sep='\n')

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree trained on the TF-IDF features. random_state=0 matches the
# seed used for the gradient-boosting and random-forest models, so the fitted
# tree and its score are repeatable between runs.
dt = DecisionTreeClassifier(random_state=0)
dt.fit(xv_train, y_train)

In [None]:
# Predicted class labels from the decision-tree model on the test features.
pred_dt = dt.predict(X=xv_test)

In [None]:
# Score of the decision-tree model on the held-out test set.
print(dt.score(xv_test, y_test), '\n', sep='\n')

In [None]:
# Classification report for the decision-tree predictions.
print(classification_report(y_test, pred_dt), '\n', sep='\n')

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient-boosting classifier with a fixed seed, trained on the TF-IDF features.
gb = GradientBoostingClassifier(random_state=0)
gb.fit(X=xv_train, y=y_train)

In [None]:
# Predicted class labels from the gradient-boosting model on the test features.
pred_gb = gb.predict(X=xv_test)

In [None]:
# Score of the gradient-boosting model on the held-out test set.
print(gb.score(xv_test, y_test), '\n', sep='\n')

In [None]:
# Classification report for the gradient-boosting predictions.
print(classification_report(y_test, pred_gb), '\n', sep='\n')

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random-forest classifier with a fixed seed, trained on the TF-IDF features.
rf = RandomForestClassifier(random_state=0)
rf.fit(X=xv_train, y=y_train)

In [None]:
# Predicted class labels from the random-forest model on the test features.
pred_rf = rf.predict(X=xv_test)

In [None]:
# Score of the random-forest model on the held-out test set.
print(rf.score(xv_test, y_test), '\n', sep='\n')

In [None]:
# Classification report for the random-forest predictions.
print(classification_report(y_test, pred_rf), '\n', sep='\n')

In [None]:
def output_lable(n):
    """Translate a numeric class label into its display text.

    Returns "Fake News" when ``n == 0``, "Real News" when ``n == 1``,
    and ``None`` for any other value.
    """
    return "Fake News" if n == 0 else ("Real News" if n == 1 else None)

def manual_testing(news):
    """Classify one piece of news text with all four trained models and print the verdicts.

    The text is wrapped in a one-row frame, cleaned with ``wordopt``,
    vectorised with the already-fitted ``vectorization`` object, and passed
    to ``lr``, ``dt``, ``gb`` and ``rf`` in that order. Each model's first
    (only) prediction is converted to a label with ``output_lable``.

    Args:
        news: the raw news text to classify.

    Returns:
        None (the return value of ``print``); the result is written to stdout.
    """
    cleaned_text = pd.DataFrame({"text": [news]})["text"].apply(wordopt)
    features = vectorization.transform(cleaned_text)
    verdicts = [
        output_lable(model.predict(features)[0])
        for model in (lr, dt, gb, rf)
    ]
    return print("\n\nLR Prediction:{} \nDT Prediction: {} \nGBC Prediction: {} \nRF Prediction: {}".format(*verdicts))

In [None]:
# Read one article from the user (input() already returns a str) and classify it.
news = input()
manual_testing(news)