<a href="https://colab.research.google.com/github/Abishethvarman/FakeNewsDetection/blob/classifier/3.4-FakeNewsClassfier_Random_Forest_20k.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

!ln -s /content/gdrive/My\ Drive/ /mydrive
!ls /mydrive

In [2]:
# Work from the dataset folder so data files can be opened by relative path.
%cd /mydrive/dataset/

/content/gdrive/My Drive/dataset


In [3]:
import gensim
import nltk
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK corpora used by the text-cleaning step: the stop-word list
# and the WordNet data.
for corpus_name in ('stopwords', 'wordnet'):
    nltk.download(corpus_name)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [4]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix

In [5]:
# Read the news corpus CSV from the current working directory.
DATASET_CSV = 'fake-news-category-PES.csv'
df = pd.read_csv(DATASET_CSV)

In [6]:
# Display the loaded frame (pandas abbreviates the middle rows of long frames).
df

Unnamed: 0,corpus,category
0,JOHN CONYERS’ LAWYER Threatens ‘If This Nonsen...,Politics
1,"(CNN Student News) -- December 20, 2013 . In o...",Entertainment
2,"ATLANTA, Georgia (CNN) -- So much for Southern...",Entertainment
3,George Soros-Financed Groups Scheme to Stop Tr...,Politics
4,NOT HILLARY’S TURN: LIB PUBLICATIONS ARE SAYIN...,Politics
...,...,...
59995,hahahahaha we missed 123455323 goals haha,Sports
59996,By . Katherine Faulkner and Nick Fagge . PUBLI...,Entertainment
59997,Trump Attacks New Mexico’s GOP Governor After...,Politics
59998,Trump Once Again Demonstrates Why His Russia ...,Politics


In [7]:
# Encode each news category as an integer class label for the classifier.
category_mapping = {'Politics': 1, 'Entertainment': 2, 'Sports': 3}
df['label'] = df['category'].map(category_mapping)

# `.map` silently produces NaN for any category that is missing from the
# mapping; surface that here instead of failing later during model training.
unmapped_categories = df.loc[df['label'].isna(), 'category'].unique()
assert len(unmapped_categories) == 0, f"Unmapped categories: {list(unmapped_categories)}"

In [8]:
# Re-display the frame to check the newly added `label` column.
df

Unnamed: 0,corpus,category,label
0,JOHN CONYERS’ LAWYER Threatens ‘If This Nonsen...,Politics,1
1,"(CNN Student News) -- December 20, 2013 . In o...",Entertainment,2
2,"ATLANTA, Georgia (CNN) -- So much for Southern...",Entertainment,2
3,George Soros-Financed Groups Scheme to Stop Tr...,Politics,1
4,NOT HILLARY’S TURN: LIB PUBLICATIONS ARE SAYIN...,Politics,1
...,...,...,...
59995,hahahahaha we missed 123455323 goals haha,Sports,3
59996,By . Katherine Faulkner and Nick Fagge . PUBLI...,Entertainment,2
59997,Trump Attacks New Mexico’s GOP Governor After...,Politics,1
59998,Trump Once Again Demonstrates Why His Russia ...,Politics,1


In [9]:
from nltk.corpus import stopwords

# NLTK's English stop words plus a few extra corpus-specific terms.
# NOTE: `preprocess` builds its own stop-word set and does not read this list,
# so the extra terms below are not applied by it.
extra_stop_words = ['from', 'subject', 're', 'edu', 'use','would']
stop_words = stopwords.words('english') + extra_stop_words

In [10]:
# Lazily-built resources shared by every `preprocess` call (filled on first use).
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return ``(stop_word_set, lemmatizer)``, creating them on the first call."""
    if not _PREPROCESS_RESOURCES:
        # Union of the NLTK English list and gensim's built-in list, so a single
        # membership test replaces the two separate look-ups.
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(
            stopwords.words('english')
        ).union(gensim.parsing.preprocessing.STOPWORDS)
        _PREPROCESS_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_RESOURCES['stop_words'], _PREPROCESS_RESOURCES['lemmatizer']


def preprocess(text):
    """Clean one raw document into a space-separated string of lemmatized tokens.

    The text is tokenized with ``gensim.utils.simple_preprocess``. Tokens that
    are stop words (NLTK English list or gensim's ``STOPWORDS``) or that have
    three characters or fewer are dropped; the remaining tokens are lemmatized
    with ``WordNetLemmatizer``.

    Note: the stop-word set is built here from NLTK and gensim only; the
    notebook-level ``stop_words`` list is not consulted.

    Parameters
    ----------
    text : str or missing value
        Raw document. ``None``/``NaN`` is treated as an empty document.

    Returns
    -------
    str
        Cleaned tokens joined by single spaces; ``''`` when nothing survives
        or the input is missing.
    """
    # Missing values short-circuit before any NLP resource is touched.
    if pd.isna(text):
        return ''

    # Built once and reused: re-creating the stop-word set and the lemmatizer
    # for every row is loop-invariant work.
    stop_words, lemmatizer = _get_preprocess_resources()
    return ' '.join(
        lemmatizer.lemmatize(token)
        for token in gensim.utils.simple_preprocess(text)
        if len(token) > 3 and token not in stop_words
    )

In [11]:
# Run the cleaning function over every document and keep the result in a new column.
df['clean_corpus'] = df['corpus'].map(preprocess)

In [12]:
# Spot-check the cleaned text of the row labelled 0.
print(df.loc[0, 'clean_corpus'])

john conyers lawyer threatens nonsense continues harbinger come video john conyers lawyer arnold reed threatened come press conference today lady gentleman promise nonsense continues harbinger come reed said defending congressman press conference detroit reed trying blame accuser conyers ready name marion brown staffer conyers revealed week accuser thursday interview today alleged conyers violated body frequently proposition barrister ball lawyer john conyersreed push reed pushed allegation showed signed statement conyers staffer stated witness harassment sexual misconduct congressman reed head urban dictionary said victim conyers mean kissing public relationship campbell indicates verified congressman hired accuser daughter problem hell broke loose congressman fired daughter sudden sexual harassment sexual allegation problem predator talking barrister ball barrister ball reed said held photograph appeared conyers brown posing picture brown said took opportunity hour harass animal barr

In [13]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df['clean_corpus'],
    df['label'],
    test_size=0.2,
    random_state=42,
)

In [14]:
# Turn the cleaned text into TF-IDF features, with the vocabulary capped at 5000 terms.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
# Fit on the training split only, then reuse the fitted vectorizer for the test
# split so no information from the test documents enters the features.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [15]:
# Train a random forest of 100 trees on the TF-IDF features; the fixed
# random_state makes the fitted model reproducible.
classifier = RandomForestClassifier(n_estimators=100, random_state=42)
classifier.fit(X_train_tfidf, y_train)

In [16]:
# Predict a class label for every document in the held-out test split.
y_pred = classifier.predict(X_test_tfidf)

In [17]:
# Share of test documents whose predicted label equals the true label,
# printed to two decimal places.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.96


In [18]:
# Per-class precision / recall / F1 on the test split.
report_text = classification_report(y_test, y_pred)
print("Classification Report:", report_text, sep="\n")

# Counts of true label (rows) versus predicted label (columns).
confusion = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:", confusion, sep="\n")

Classification Report:
              precision    recall  f1-score   support

           1       0.94      0.93      0.94      3958
           2       0.94      0.94      0.94      4020
           3       1.00      1.00      1.00      4022

    accuracy                           0.96     12000
   macro avg       0.96      0.96      0.96     12000
weighted avg       0.96      0.96      0.96     12000

Confusion Matrix:
[[3696  258    4]
 [ 225 3788    7]
 [   0    0 4022]]


In [25]:
# Sample sentence used to try the trained classifier on text outside the dataset.
# Alternative input (disabled):
# phrase_1 = 'Henry Dunant came for a delegation in Italy to discuss about the guyana issue'
phrase_1 = 'Bill Clinton for a delegation in Italy to discuss about the guyana issue'

In [26]:
# Classify one unseen phrase with the same pipeline used for training:
# clean the text, vectorize it with the already-fitted TF-IDF model, then predict.
cleaned_new_phrase = preprocess(phrase_1)

# `transform` takes a collection of documents, hence the one-element list.
new_phrase_tfidf = tfidf_vectorizer.transform([cleaned_new_phrase])

# One prediction per input row; take the single result.
predicted_class = classifier.predict(new_phrase_tfidf)[0]

# Invert `category_mapping` instead of repeating it by hand, so the label names
# shown here cannot drift from the encoding used to build the training labels.
label_to_category = {label: name for name, label in category_mapping.items()}
predicted_class_name = label_to_category.get(predicted_class, 'Unknown')

print(f"The predicted class for the phrase '{phrase_1}' is: {predicted_class_name}")

The predicted class for the phrase 'Bill Clinton for a delegation in Italy to discuss about the guyana issue' is: Politics


In [24]:
# Overall test-set scores. Precision, recall and F1 use the 'weighted' average
# across classes.
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    metric(y_test, y_pred, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)

print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

Accuracy: 0.96
Precision: 0.96
Recall: 0.96
F1 Score: 0.96


-------------------------------------