# Machine Learning Intern at Infopillar Solutions for November 2021 Batch

# Author : Anand Bhausaheb Kharabe

# Task 1: Fake News Detection Project

Project idea – Fake news spreads like wildfire, and this is a serious problem today. In this project you learn how to distinguish fake news from real news, using supervised learning to build a classification model.

Dataset: https://bit.ly/3FxCSC4

# Importing Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Reading Dataset

In [None]:
# Load the news dataset from news.csv (path is relative to the working directory).
t2 = pd.read_csv('news.csv')
# Preview the first rows to sanity-check the load.
t2.head()

# EDA on the dataset variable t2

In [None]:
# (rows, columns) of the raw frame.
t2.shape 

In [None]:
# Column names, dtypes and non-null counts.
t2.info()

In [None]:
# Discard the "Unnamed: 0" column in place
# (presumably a leftover row-index column from the CSV export -- confirm).
t2.drop(columns="Unnamed: 0", inplace=True)

In [None]:
t2.isnull().sum()  # Count missing (NaN) values per column

In [None]:
t2.duplicated().sum()  # Count rows that are exact duplicates of an earlier row

In [None]:
t2.drop_duplicates(keep='first', inplace=True)  # Drop duplicate rows, keeping the first occurrence

In [None]:
# Bar chart of how many rows carry each value of the 'label' column.
sns.countplot(data=t2, x='label')
plt.show()

In [None]:
import nltk
from nltk.corpus import stopwords
from string import punctuation
from nltk.stem import PorterStemmer

# Shared stemmer instance used by the text-cleaning function below.
ps = PorterStemmer()
# English stop words as a set for O(1) membership tests.
# NOTE(review): this needs the NLTK 'stopwords' corpus to be installed locally
# (NLTK raises LookupError otherwise) -- confirm the environment provides it.
STOPWORDS = set(stopwords.words('english'))

In [None]:
def tranform_text(text):
    """Normalise a raw string into space-separated, stemmed tokens.

    The text is lower-cased and tokenised with ``nltk.word_tokenize``. Tokens
    that are not purely alphanumeric, or that appear in ``STOPWORDS``, are
    discarded; the remaining tokens are reduced to their Porter stem.

    Args:
        text: Raw document text. Non-string values (for example the float
            NaN that pandas uses for an empty CSV cell) are treated as
            empty text instead of raising ``AttributeError``.

    Returns:
        The cleaned tokens joined by single spaces, or ``""`` when nothing
        survives the filtering or ``text`` is not a string.
    """
    if not isinstance(text, str):
        # A missing cell would otherwise crash on ``.lower()``.
        return ""

    tokens = nltk.word_tokenize(text.lower())

    # ``isalnum()`` already rejects every token that contains punctuation,
    # so no separate punctuation test is needed.
    return " ".join(
        ps.stem(token)
        for token in tokens
        if token.isalnum() and token not in STOPWORDS
    )

In [None]:
# Clean both free-text columns with the same normalisation function,
# title first and then text, overwriting the raw strings.
for column in ('title', 'text'):
    t2[column] = t2[column].apply(tranform_text)

t2.head()

# Using TF-IDF for Vectorizing

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Separate vectorizers so titles and article bodies get their own vocabularies.
# max_features caps each vocabulary at its most frequent terms.
tfidf_title = TfidfVectorizer(max_features=3000)
tfidf_text = TfidfVectorizer(max_features=20000)

In [None]:
# Learn each vocabulary and its IDF weights, then convert the sparse TF-IDF
# matrices to dense NumPy arrays.
# NOTE(review): both vectorizers are fitted on the whole dataset, before the
# train/test split further down, so test documents influence the vocabulary
# and IDF weights. Consider fitting on the training rows only.
# NOTE(review): .toarray() materialises up to 3000 / 20000 columns per row as
# dense floats; this can be memory-hungry on large corpora.
title = tfidf_title.fit_transform(t2['title']).toarray()
text = tfidf_text.fit_transform(t2['text']).toarray()

In [None]:
# Report the (documents, features) shape of each TF-IDF matrix.
for caption, matrix in (
    ("Shape of Transformed Title :-> ", title),
    ("Shape of Transformed Text :-> ", text),
):
    print(caption, matrix.shape)

In [None]:
# Wrap the dense feature arrays in DataFrames; rows and columns receive
# default integer labels starting at 0.
df_title = pd.DataFrame(title)
df_text = pd.DataFrame(text)

In [None]:
# Join the title and text features side by side into one feature matrix.
# ignore_index=True renumbers the combined columns 0..N-1. Without it both
# halves keep their own labels 0, 1, 2, ..., so the low-numbered labels would
# appear twice and selecting a column by label would be ambiguous.
t = pd.concat([df_title, df_text], axis=1, ignore_index=True)
t.head()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Map each distinct value of the 'label' column to an integer class code.
le = LabelEncoder()
# Read the labels from t2, the de-duplicated frame the features were built
# from, so that target[i] corresponds to row i of the feature matrix.
target = le.fit_transform(t2['label'])

# Train-Test-Split

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; random_state fixes the shuffle so
# the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(t, target, test_size=0.2, random_state=10)

In [None]:
# Inspect the training feature matrix.
X_train

In [None]:
# Inspect the encoded training labels.
y_train

# Building Model

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 100 trees. random_state is pinned to the same seed as the
# train/test split; without it the forest is re-randomised on every run, so
# the reported metrics would not be reproducible.
clf = RandomForestClassifier(n_estimators=100, random_state=10)

# Fit on the training rows, then predict class codes for the held-out rows.
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [None]:
# Side-by-side table of the encoded true labels and the model's predictions
# for the held-out rows.
compare = pd.DataFrame({'Actual Values': y_test, 'Predicted Values': y_pred})  
compare

In [None]:
from sklearn.metrics import accuracy_score

# Fraction of held-out rows whose predicted class matches the true class.
accuracy_score(y_test, y_pred)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix  
# Print the confusion matrix first, then the per-class
# precision/recall/F1 report, both computed on the held-out rows.
for metric in (confusion_matrix, classification_report):
    print(metric(y_test, y_pred))