In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib 
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score,classification_report


In [2]:
# Read both source files from the working directory; each becomes its own DataFrame.
True_news, Fake_news = (
    pd.read_csv(csv_name) for csv_name in ('True.csv', 'Fake.csv')
)

In [3]:
# Attach the class target used for training: 0 marks rows from True.csv,
# 1 marks rows from Fake.csv.
True_news = True_news.assign(label=0)
Fake_news = Fake_news.assign(label=1)


In [4]:
# Keep only the article body and its class label; every other column is dropped.
true_news, fake_news = (
    frame[['text', 'label']] for frame in (True_news, Fake_news)
)

In [5]:
# Stack the two labelled frames vertically: all true_news rows, then all fake_news rows.
# Each frame keeps its own row labels, so index labels repeat across the two sources;
# use positional access (.iloc) rather than label lookups on the combined frame.
dataset = pd.concat((true_news, fake_news), axis=0)

In [6]:
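# Shuffle step, currently disabled. While it stays commented out, rows remain in
# concatenation order (true_news first, then fake_news), and the later
# `.iloc[:35000]` slice is taken from that unshuffled order.
# dataset = dataset.sample(frac = 1)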

In [7]:
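# One-time setup: uncomment and run these on a machine that does not yet have the
# NLTK data used below (the English stop-word list and the WordNet lemmatizer data).
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('omw-1.4')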

In [8]:
# WordNet lemmatizer shared by cleaning_data (the name `ps` is kept because
# cleaning_data refers to it).
ps = WordNetLemmatizer()

# English stop words, stored under the name `stopwords` that cleaning_data looks up.
# The list is read through the fully qualified `nltk.corpus.stopwords` reader rather
# than the bare imported name: this assignment rebinds `stopwords` to the word
# collection, so a second run of `stopwords.words(...)` would hit the collection
# instead of the corpus reader and raise AttributeError. A set is used because the
# collection is only ever used for per-token membership tests.
stopwords = set(nltk.corpus.stopwords.words('english'))


In [9]:
def cleaning_data(row):
    """Normalise one news text into a space-separated string of lemmatized tokens.

    The text is lower-cased, every character that is not an ASCII letter is
    replaced by a space, the result is split on whitespace, tokens found in the
    module-level ``stopwords`` collection are dropped, and the remaining tokens
    are passed through the module-level lemmatizer ``ps``.

    Parameters
    ----------
    row : str
        Raw text of one article. ``None`` and float NaN (the value pandas uses
        for an empty CSV cell) are treated as empty text instead of raising.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; ``''`` when nothing is left.
    """
    # Missing values have no .lower(); map them to empty text so that a
    # column-wide .apply() does not abort on a single blank cell.
    if row is None or (isinstance(row, float) and row != row):
        return ''

    lowered = row.lower()
    letters_only = re.sub('[^a-zA-Z]', ' ', lowered)
    kept = [
        ps.lemmatize(word)
        for word in letters_only.split()
        if word not in stopwords
    ]
    return ' '.join(kept)

In [10]:
# Replace every article body with its cleaned form (see cleaning_data); the
# function is handed to apply directly, one call per row.
dataset['text'] = dataset['text'].apply(cleaning_data)

In [11]:
# TF-IDF features over unigrams and bigrams, with the vocabulary capped at 50,000 terms.
# lowercase=False because cleaning_data has already lower-cased the text.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=50000,
    lowercase=False,
)

In [12]:
# Features and target for the first 35,000 rows, selected by column name.
# No shuffle runs before this point (the sample() call above is commented out),
# so these are the first 35,000 rows in concatenation order: true_news rows
# first, followed by fake_news rows.
X = dataset['text'].iloc[:35000]
y = dataset['label'].iloc[:35000]

In [13]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
train_data, test_data, train_label, test_label = train_test_split(
    X,
    y,
    random_state=0,
    test_size=0.2,
)

In [14]:
# Learn the vocabulary and IDF weights from the training split only, then apply
# the same fitted transform to the test split.
vec_train_data = vectorizer.fit_transform(train_data)
vec_test_data = vectorizer.transform(test_data)

# The vectorizer returns SciPy sparse matrices and MultinomialNB accepts them
# directly for fit/predict, so they are passed through unchanged. Converting to
# dense arrays and wrapping them in DataFrames would materialise one float64 per
# (row, vocabulary term) pair -- with up to 50,000 terms that is many gigabytes --
# without changing what the classifier learns.
training_data = vec_train_data
testing_data = vec_test_data

In [15]:
# Fit a multinomial Naive Bayes classifier on the training features (fit returns
# the estimator itself), then predict labels for the held-out split.
clf = MultinomialNB().fit(training_data, train_label)
y_pred = clf.predict(testing_data)

In [16]:
# Spot check: push one hand-written sentence through the same cleaning and
# TF-IDF steps as the training data, then show the predicted label and the
# per-class probabilities.
news = cleaning_data("Imposters posing as army personnel on the social media have been called out by the Indian Army as false news and disinformation.")

# Vectorise once and reuse the features for both calls.
news_features = vectorizer.transform([news]).toarray()
single_prediction = clf.predict(news_features)
print(single_prediction)
print(clf.predict_proba(news_features))

[0]
[[0.78965017 0.21034983]]




In [18]:
# clf.predict_proba(vectorizer.transform([news]).toarray())


In [19]:
# True_news.head()

In [20]:
# Score the model on the data it was trained on, to compare against the
# held-out report for signs of overfitting.
y_pred_train = clf.predict(training_data)
train_report = classification_report(y_true=train_label, y_pred=y_pred_train)
print(train_report)

              precision    recall  f1-score   support

           0       0.96      0.97      0.96     17203
           1       0.95      0.93      0.94     10797

    accuracy                           0.96     28000
   macro avg       0.95      0.95      0.95     28000
weighted avg       0.96      0.96      0.96     28000



In [21]:
# Per-class precision / recall / F1 on the held-out split.
test_report = classification_report(y_true=test_label, y_pred=y_pred)
print(test_report)


              precision    recall  f1-score   support

           0       0.95      0.96      0.96      4214
           1       0.94      0.93      0.93      2786

    accuracy                           0.95      7000
   macro avg       0.95      0.94      0.95      7000
weighted avg       0.95      0.95      0.95      7000



In [22]:
# Overall accuracy on both splits, rounded to three decimals.
# The built-in round() is used instead of the `.round()` method: the method exists
# only on NumPy scalars, whereas round() works whether accuracy_score hands back a
# NumPy scalar or a plain Python float.
train_accuracy = round(accuracy_score(train_label, y_pred_train), 3)
test_accuracy = round(accuracy_score(test_label, y_pred), 3)
print(f"Training data accuracy score: {train_accuracy}")
print(f"Testing data accuracy score: {test_accuracy}")




Training data accuracy score: 0.955
Testing data accuracy score: 0.948



