# FAKE_NEWS_DETECTION(NLP) 

In [None]:
import pandas as pd 

In [None]:
# Load the news dataset from the CSV file in the working directory.
data = pd.read_csv('news.csv')

In [None]:
# Preview the first 10 rows of the loaded frame.
data.head(10)

In [None]:
# (number of rows, number of columns) of the loaded frame.
data.shape

## Cleaning data

In [None]:
# The 'Unnamed: 0' column is of no use, so it is dropped.
# The 'title' column is dropped as well, since it is just the heading of the text.

In [None]:
# Remove the two columns that are not used as model input.
data.drop(columns=['Unnamed: 0', 'title'], inplace=True)

In [None]:
# Preview the frame after dropping the unused columns.
data.head()

In [None]:
# Now check for NaN (missing) values.

In [None]:
# Count the missing values in each column.
data.isnull().sum()

In [None]:
# The label is categorical, so it has to be converted to a numeric value (0, 1).
# An encoder such as LabelEncoder or OneHotEncoder could be used, but because the label is binary
# the two values can simply be substituted directly.

In [None]:
# Encode the categorical target as integers: FAKE -> 0, REAL -> 1.
# The result is assigned back to the column rather than calling
# ``replace(..., inplace=True)`` on ``data.label``: an in-place method on a
# column accessor is chained assignment, which pandas deprecates and which no
# longer updates ``data`` when Copy-on-Write is enabled.
# ``map`` yields an integer column; any label other than FAKE/REAL becomes NaN
# and will then show up in a missing-value check instead of staying a string.
data['label'] = data['label'].map({'FAKE': 0, 'REAL': 1})

In [None]:
# Preview the frame after encoding the label column.
data.head()

In [None]:
# Separate the target column from the remaining column(s).
Y = data['label']
X = data.drop(columns='label')

### visualization

In [None]:
import seaborn as sns

# Bar chart of the number of rows per label value.
sns.countplot(data=data, x='label')

###### The labels are evenly distributed, so it is fine to go ahead.

In [None]:
# Now apply natural language processing (NLP) to the text.

In [None]:
import re  # regular expressions: used to match patterns and strip out characters that are not needed

In [None]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

ps = PorterStemmer()

# Load the stop-word list once and keep it as a set. Calling
# ``stopwords.words('english')`` inside the comprehension re-read the list for
# every single word and then did a linear scan over it.
english_stopwords = set(stopwords.words('english'))

# Anything that is not an ASCII letter becomes a space. The earlier pattern
# '[^a-z-A-Z]' contained a stray '-' that let hyphens through, so hyphenated
# compounds were neither split, stop-word filtered nor stemmed per word.
non_letters = re.compile(r'[^a-zA-Z]')

# Build one cleaned string per article: letters only, lower-cased, stop words
# removed, remaining words reduced to their Porter stem.
corpus = []
# Iterate over the column values directly instead of ``data['text'][i]``,
# which is a label lookup and only works while the index is exactly 0..n-1.
for text in data['text']:
    words = non_letters.sub(' ', text).lower().split()
    review = ' '.join(
        ps.stem(word) for word in words if word not in english_stopwords
    )
    corpus.append(review)

In [None]:
# Display the list of cleaned article strings.
corpus

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features over word 1-grams to 3-grams, vocabulary capped at 5000 terms.
tfidf_v = TfidfVectorizer(ngram_range=(1, 3), max_features=5000)
# Fit on the whole cleaned corpus and convert the sparse result to a dense array.
# NOTE(review): the vectorizer is fitted before the train/test split, so the
# test articles contribute to the vocabulary and IDF weights - confirm this is
# acceptable for the reported accuracy.
X = tfidf_v.fit_transform(raw_documents=corpus).toarray()

In [None]:
# Display the dense TF-IDF feature array.
X

### splitting dataset by train_test_split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 25% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=0,
)

In [None]:
# Preview the first held-out labels.
Y_test.head()

# First 20 terms of the fitted vocabulary.
# ``get_feature_names()`` was removed in scikit-learn 1.2;
# ``get_feature_names_out()`` is its replacement (returns an ndarray of str).
tfidf_v.get_feature_names_out()[:20]

In [None]:
# Show the configuration parameters of the vectorizer.
tfidf_v.get_params()

In [None]:
import warnings

# Silence every warning from this point on in the session.
warnings.simplefilter(action="ignore")

In [None]:
# Training features as a DataFrame with one column per vocabulary term.
# ``get_feature_names()`` was removed in scikit-learn 1.2;
# ``get_feature_names_out()`` is its replacement.
count_data = pd.DataFrame(X_train, columns=tfidf_v.get_feature_names_out())

In [None]:
# Preview the training feature table.
count_data.head()

In [None]:
import matplotlib.pyplot as plt

In [None]:
def plot_confusion_matrix(cm, classes, normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Print a confusion matrix and draw it as an annotated heat map.

    Parameters
    ----------
    cm : 2-D numpy array
        Confusion matrix; rows are treated as true labels and columns as
        predicted labels (see the axis labels set below).
    classes : sequence
        Tick labels, one per row/column of ``cm``.
    normalize : bool, default False
        When true, every row is divided by its row sum before the matrix is
        printed and drawn, so each cell is a fraction of its true class.
    title : str
        Title of the plot.
    cmap : matplotlib colormap
        Colormap used for the heat map.

    Returns
    -------
    None
        The matrix is printed and drawn on the current matplotlib figure.
    """
    # Normalise BEFORE drawing so that the colours, the colour bar and the
    # cell annotations all describe the same values. Previously the image was
    # drawn from the raw counts and only the text was normalised.
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Cells darker than the midpoint get white text so the label stays readable.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        value = cm[i, j]
        # Fractions are shortened to two decimals; raw counts are shown as-is.
        label = format(value, '.2f') if normalize else value
        plt.text(j, i, label,
                 horizontalalignment="center",
                 color="white" if value > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

# Multinomial Naive-Bayes Algorithm

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Baseline multinomial Naive Bayes model with the library's default settings.
classifier = MultinomialNB()

In [None]:
import numpy as np
import itertools
from sklearn.metrics import accuracy_score,confusion_matrix

In [None]:
# Train the baseline model, predict the held-out rows and report accuracy (in %).
pred = classifier.fit(X_train, Y_train).predict(X_test)
score = 100 * accuracy_score(Y_test, pred)
print("accuracy:   %0.3f" % score)

# Print and plot the confusion matrix of the predictions.
cm = confusion_matrix(Y_test, pred)
plot_confusion_matrix(cm, classes=["FAKE", "REAL"])

In [None]:
# Multinomial Naive Bayes classifier with hyperparameter tuning (alpha)

In [None]:
# Starting model for the alpha search, with smoothing parameter alpha = 0.1.
classifier = MultinomialNB(alpha=0.1)

In [None]:
# Sweep alpha over 0.0, 0.1, ..., 0.9 and keep the best-scoring model in
# ``classifier``.
# NOTE(review): the score is measured on the test split, so the chosen alpha
# is tuned on the same data that is later used to report accuracy.
previous_score = 0
for alpha in np.arange(0, 1, 0.1):
    sub_classifier = MultinomialNB(alpha=alpha)
    sub_classifier.fit(X_train, Y_train)
    y_pred = sub_classifier.predict(X_test)
    score = accuracy_score(Y_test, y_pred)
    if score > previous_score:
        classifier = sub_classifier
        # Remember the best score so far. Without this update every model
        # with a non-zero score replaced ``classifier``, leaving the last
        # alpha tried instead of the best one.
        previous_score = score
    print("Alpha: {},Score: {}".format(alpha, score))

In [None]:
# Apply Naive Bayes with alpha=0

In [None]:
# Refit multinomial Naive Bayes with alpha=0 and evaluate it on the held-out rows.
# This rebinds ``classifier``, replacing whichever model was stored in it before.
classifier = MultinomialNB(alpha=0)
pred = classifier.fit(X_train, Y_train).predict(X_test)
score = 100 * accuracy_score(Y_test, pred)
print("accuracy:   %0.3f" % score)

# Print and plot the confusion matrix of the predictions.
cm = confusion_matrix(Y_test, pred)
plot_confusion_matrix(cm, classes=["FAKE", "REAL"])

## Passive Aggressive classifier algorithm

In [None]:
from sklearn.linear_model import PassiveAggressiveClassifier

# Linear Passive Aggressive model; n_iter_no_change is set to 50.
linear_clf = PassiveAggressiveClassifier(n_iter_no_change=50)

In [None]:
# Train the Passive Aggressive model, predict the held-out rows and report accuracy (in %).
pred = linear_clf.fit(X_train, Y_train).predict(X_test)
score = 100 * accuracy_score(Y_test, pred)
print("accuracy:   %0.3f" % score)

# Print and plot the confusion matrix of the predictions.
cm = confusion_matrix(Y_test, pred)
plot_confusion_matrix(cm, classes=["FAKE DATA", "REAL DATA"])

In [None]:
# Vocabulary terms in column order of the TF-IDF matrix.
# ``get_feature_names()`` was removed in scikit-learn 1.2;
# ``get_feature_names_out()`` is its replacement.
feature_names = tfidf_v.get_feature_names_out()

In [None]:
#most fake
sorted(zip(classifier.coef_[0],feature_names))[:20]

In [None]:
# save_model
# Persist the trained Passive Aggressive model to disk.
# NOTE(review): only the classifier is saved; predicting on new text also
# needs the fitted ``tfidf_v`` vectorizer - confirm it is stored elsewhere.
import pickle

filename = 'fake_news.pkl'
# The context manager closes (and flushes) the file even if pickling fails;
# the bare ``open(...)`` passed to ``pickle.dump`` was never closed.
with open(filename, 'wb') as model_file:
    pickle.dump(linear_clf, model_file)