<a href="https://colab.research.google.com/github/Areeff10/Fake-news-prediction/blob/main/Fake_news_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing the Libraries

In [None]:
import pandas as pd
from nltk.tokenize import word_tokenize,sent_tokenize
from string import punctuation
from nltk.stem import SnowballStemmer,PorterStemmer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB,BernoulliNB,MultinomialNB
from sklearn.metrics import confusion_matrix,accuracy_score,precision_score,f1_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

# Loading the dataset

In [None]:
# Read the news CSV from the Colab working directory and preview its first rows.
dataset = pd.read_csv('/content/news_dataset.csv')
dataset.head()

Unnamed: 0,id,title,text,label
0,0.0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1.0
1,1.0,,Did they post their votes for Hillary already?,1.0
2,2.0,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1.0
3,3.0,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0.0
4,4.0,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1.0


# Exploratory data analysis (EDA)

In [None]:
# (rows, columns) of the loaded frame.
dataset.shape

(1107, 4)

In [None]:
# Per-column dtype and non-null count, used to spot missing values.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1107 entries, 0 to 1106
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   id      1106 non-null   float64
 1   title   1094 non-null   object 
 2   text    1106 non-null   object 
 3   label   1105 non-null   float64
dtypes: float64(2), object(2)
memory usage: 34.7+ KB


In [None]:
# Number of missing values in each column before cleaning.
dataset.isna().sum()

Unnamed: 0,0
id,1
title,13
text,1
label,2


In [None]:
# Rows without article text cannot be vectorised, so drop them.
# The explicit .copy() makes `dataset` an independent frame, so the column
# assignments in the following cells operate on a real DataFrame rather than
# on a filtered view (which can raise pandas' SettingWithCopyWarning).
dataset = dataset.dropna(subset=['text']).copy()

In [None]:
# Fill the remaining missing labels with the most frequent label value.
# NOTE(review): this imputes the prediction target itself; dropping rows with a
# missing label may be the safer choice — confirm intent.
dataset['label'] = dataset['label'].fillna(dataset['label'].mode()[0])

In [None]:
# Cast the label column (float64 in the dataset.info() output) to integers.
dataset['label'] = dataset['label'].astype(int)

In [None]:
# Re-check missing values after cleaning; only `title` should still have gaps.
dataset.isna().sum()

Unnamed: 0,0
id,0
title,12
text,0
label,0


# Feature engineering and preprocessing.

In [None]:
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, reading the corpus once."""
    # Loaded lazily rather than at definition time so that the stopwords corpus
    # can still be downloaded after this cell runs but before the first call.
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def _is_punctuation(token):
    """Return True when every character of ``token`` is ASCII punctuation."""
    # Character-wise test: `token in punctuation` would be a *substring* check
    # against the punctuation string, which lets multi-character tokens such as
    # "``", "''", "..." or "--" slip through.
    return all(ch in punctuation for ch in token)


def transform_text(text):
    """Normalise a raw document into a space-separated string of stemmed tokens.

    The pipeline is: lowercase -> ``word_tokenize`` -> drop tokens made up
    solely of ASCII punctuation -> drop English stop words -> Porter stemming.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The surviving stemmed tokens joined by single spaces (an empty string
        when no token survives).
    """
    stop_words = _english_stopwords()  # set lookup instead of re-reading the list per token
    stemmer = PorterStemmer()

    tokens = word_tokenize(text.lower())
    kept = [
        token for token in tokens
        if not _is_punctuation(token) and token not in stop_words
    ]
    return " ".join(stemmer.stem(token) for token in kept)

In [None]:
# Fetch the punkt tokenizer data; presumably required by word_tokenize in transform_text.
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Fetch the stop-word corpus that transform_text reads via stopwords.words('english').
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# NOTE(review): this repeats the stopwords download of the preceding cell and
# looks redundant — confirm it can be removed.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Run the text-normalisation function over every article body and keep the
# result in a new column next to the raw text.
dataset['transform_text'] = dataset['text'].map(transform_text)

# Converting the dataset from textual data to numerical representation.

In [None]:
# Bag-of-words features: one column per vocabulary term, values are term counts.
cv=CountVectorizer()
# .toarray() turns the vectorizer output into a dense NumPy array.
# NOTE(review): the vocabulary is fitted on all rows before the train/test split
# in the next cell, so test documents influence the feature space — confirm this
# is acceptable or fit on the training portion only.
input_data=cv.fit_transform(dataset['transform_text']).toarray()
# Target vector: the integer label column.
output_data=dataset['label']

# Partitioning the dataset into training and testing

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    input_data,
    output_data,
    test_size=0.2,
    random_state=31,
)

# Logistic regression

In [None]:
# Fit a logistic-regression classifier on the bag-of-words features.
lr = LogisticRegression()
lr.fit(x_train, y_train)

# Predict the held-out set once and reuse the predictions for every test metric
# instead of re-running inference for each print statement.
y_test_pred = lr.predict(x_test)
test_accuracy = accuracy_score(y_test, y_test_pred)

# A large gap between training and testing accuracy points to overfitting.
print("training accuracy:", lr.score(x_train, y_train), "\ntesting accuracy:", test_accuracy)
print("accuracy:", test_accuracy)
print("f1_score:", f1_score(y_test, y_test_pred))

training accuracy: 1.0 
testing accuracy: 0.8648648648648649
accuracy: 0.8648648648648649
f1_score: 0.8800000000000001


-----------------------------------------------------------------------------

## Initially, we imported the necessary libraries, loaded the dataset, and conducted exploratory data analysis (EDA). Subsequently, we cleaned and preprocessed the text and transformed it into a numerical (bag-of-words) representation. The dataset was then partitioned into training and testing sets, followed by the implementation of the following model:



### * Logistic Regression: This supervised machine learning model is employed for classification tasks, where the objective is to predict the probability of an instance belonging to a particular class. In the run shown above, this model reached an accuracy of **1.00** on the training data and **0.86** on the testing data (F1 score **0.88**); the gap between training and testing accuracy indicates that the model overfits the training set.

