In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from sklearn.preprocessing import LabelBinarizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize,sent_tokenize
from sklearn.utils import shuffle
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
fake = pd.read_csv('/content/drive/MyDrive/Datasets/Fake.csv', delimiter = ',') 
true = pd.read_csv('/content/drive/MyDrive/Datasets/True.csv', delimiter = ',')

In [4]:
fake['target'] = 0 
true['target'] = 1 

df = pd.DataFrame() 
df = true.append(fake)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 44898 entries, 0 to 23480
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    44898 non-null  object
 1   text     44898 non-null  object
 2   subject  44898 non-null  object
 3   date     44898 non-null  object
 4   target   44898 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 2.1+ MB


In [6]:
# Shuffle the data 
df = shuffle(df).reset_index(drop=True) 
df.head()

Unnamed: 0,title,text,subject,date,target
0,This One Tweet Explains How Close Trump Is To...,Americans like to sit back and smugly announce...,News,"March 1, 2016",0
1,"Trump outlines plans for first day in office, ...",NEW YORK/WASHINGTON (Reuters) - U.S. President...,politicsNews,"November 21, 2016",1
2,WATCH: Trump Surrogate Tries To Defend Trump ...,It s a trap! That s the messaging that the T...,News,"July 31, 2016",0
3,WIKILEAKS EMAIL: Hillary Camp Calls Conservati...,"In another Wikileaks email dump, the Hillary c...",politics,"Oct 12, 2016",0
4,WATCH: PRESIDENT TRUMP Hilariously Exposes Hyp...,"Before Trump, Democrats and their allies in th...",left-news,"May 11, 2017",0


In [7]:
df['text']=df['text']+" "+df['title']
df.drop(['title','subject','date'],axis=1,inplace=True) 
df.head()

Unnamed: 0,text,target
0,Americans like to sit back and smugly announce...,0
1,NEW YORK/WASHINGTON (Reuters) - U.S. President...,1
2,It s a trap! That s the messaging that the T...,0
3,"In another Wikileaks email dump, the Hillary c...",0
4,"Before Trump, Democrats and their allies in th...",0


In [8]:
from collections import defaultdict 
# define a function to get list of words according to the target class 
def create_words(target): 
  words = [] 
  for x in df[df['target']==target]['text'].str.split(): 
    for i in x: 
      words.append(i) 
  return words

In [11]:
import re
import string

def clean_text(text): 
  lemmatizer = WordNetLemmatizer() 
  stopwords_english = stopwords.words('english') 
  text= re.sub('\[[^]]*\]', '', text) 
  # remove stock market tickers like $GE
  text = re.sub(r'\$\w*', '', text) 
  #removal of html tags 
  review =re.sub(r'<.*?>',' ',text) 
  # remove old style retweet text "RT" 
  text = re.sub(r'^RT[\s]+', '', text) 
  # remove hyperlinks 
  text = re.sub(r'https?:\/\/.*[\r\n]*', '', text) 
  # remove hashtags 
  # only removing the hash # sign from the word 
  text = re.sub(r'#', '', text) 
  text = re.sub("["
                 u"\U0001F600-\U0001F64F" # removal of emoticons 
                 u"\U0001F300-\U0001F5FF" # symbols & pictographs 
                 u"\U0001F680-\U0001F6FF" # transport & map symbols 
                 u"\U0001F1E0-\U0001F1FF" # flags (iOS) 
                 u"\U00002702-\U000027B0" 
                 u"\U000024C2-\U0001F251" 
                 "]+",' ',text) 
  text = re.sub('[^a-zA-Z]',' ',text)
  
  text = text.lower() 
  text_tokens =word_tokenize(text) 
  
  text_clean = [] 
  for word in text_tokens: 
    if (word not in stopwords_english and # remove stopwords
          word not in string.punctuation): # remove punctuation 
        lem_word =lemmatizer.lemmatize(word) # lemmitiging word 
        text_clean.append(lem_word) 
  text_mod=[i for i in text_clean if len(i)>2] 
  text_clean=' '.join(text_mod) 
  return text_clean

In [12]:
df['clean_text']=df['text'].apply(lambda x: clean_text(x))

In [13]:
df.head()

Unnamed: 0,text,target,clean_text
0,Americans like to sit back and smugly announce...,0,american like sit back smugly announce kind se...
1,NEW YORK/WASHINGTON (Reuters) - U.S. President...,1,new york washington reuters president elect do...
2,It s a trap! That s the messaging that the T...,0,trap messaging trump campaign decided asked tr...
3,"In another Wikileaks email dump, the Hillary c...",0,another wikileaks email dump hillary camp caug...
4,"Before Trump, Democrats and their allies in th...",0,trump democrat ally leftist medium rarely worr...


In [23]:
my_tags = []
my_tags.append('fake')
my_tags.append('true')
my_tags

['fake', 'true']

In [15]:
X = df['clean_text']
y = df['target']

In [16]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 42)

In [17]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

logreg = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', LogisticRegression(n_jobs=1, C=1e5)),
               ])

In [18]:
logreg.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf', LogisticRegression(C=100000.0, n_jobs=1))])

In [19]:
y_pred = logreg.predict(X_test)

In [26]:
print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=my_tags))

accuracy 0.9953229398663697
              precision    recall  f1-score   support

        fake       0.99      1.00      1.00      4713
        true       1.00      0.99      1.00      4267

    accuracy                           1.00      8980
   macro avg       1.00      1.00      1.00      8980
weighted avg       1.00      1.00      1.00      8980

