# UNDERSTANDING DATASET

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('train.csv')
df

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1
...,...,...,...,...,...
20795,20795,Rapper T.I.: Trump a ’Poster Child For White S...,Jerome Hudson,Rapper T. I. unloaded on black celebrities who...,0
20796,20796,"N.F.L. Playoffs: Schedule, Matchups and Odds -...",Benjamin Hoffman,When the Green Bay Packers lost to the Washing...,0
20797,20797,Macy’s Is Said to Receive Takeover Approach by...,Michael J. de la Merced and Rachel Abrams,The Macy’s of today grew from the union of sev...,0
20798,20798,"NATO, Russia To Hold Parallel Exercises In Bal...",Alex Ansary,"NATO, Russia To Hold Parallel Exercises In Bal...",1


### Dataset Columns for Fake News Detection

1. **id** - Unique identifier for each news article.
2. **title** - Title or headline of the news article.
3. **author** - Author of the article.
4. **text** - Full text content of the article.
5. **label** - Target variable indicating the authenticity of the article (e.g., 0 for fake, 1 for real).


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20800 entries, 0 to 20799
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      20800 non-null  int64 
 1   title   20242 non-null  object
 2   author  18843 non-null  object
 3   text    20761 non-null  object
 4   label   20800 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 812.6+ KB


In [4]:
df.describe()

Unnamed: 0,id,label
count,20800.0,20800.0
mean,10399.5,0.500625
std,6004.587135,0.500012
min,0.0,0.0
25%,5199.75,0.0
50%,10399.5,1.0
75%,15599.25,1.0
max,20799.0,1.0


In [5]:
df.columns

Index(['id', 'title', 'author', 'text', 'label'], dtype='object')

In [6]:
df.count()

id        20800
title     20242
author    18843
text      20761
label     20800
dtype: int64

In [7]:
df['label'].value_counts()

1    10413
0    10387
Name: label, dtype: int64

In [8]:
df.author.unique()

array(['Darrell Lucus', 'Daniel J. Flynn', 'Consortiumnews.com', ...,
       'D. Samuelson', 'Judge Andrew Napolitano',
       'Michael J. de la Merced and Rachel Abrams'], dtype=object)

In [9]:
df.shape

(20800, 5)

In [10]:
# Filter authors whose label is 1
authors_label_1 = df[df['label'] == 1]['author'].unique().tolist()
authors_label_1

['Darrell Lucus',
 'Consortiumnews.com',
 'Jessica Purkiss',
 'Howard Portnoy',
 nan,
 'Amando Flavio',
 'Jason Ditz',
 'AnotherAnnie',
 'Starkman',
 'The Doc',
 'Ira Helfand',
 'Amanda Shea',
 'greanfinisher .',
 'b',
 'Alexandria Laredo',
 'Gordon Duff, Senior Editor',
 'Lance Schuttler',
 'Dairy✓ᵀᴿᵁᴹᴾ',
 'Madeline',
 'M.R. Islam',
 'Anonymous Coward (UID 12781064)',
 'Truth Broadcast Network',
 'Admin',
 'Jason Easley',
 'Chris Black',
 'Ivan the Stakhanovets',
 'Redflag Newsdesk',
 'Steve Sailer',
 'Scott Osborn',
 'Heather Callaghan',
 'Kayla Brandon',
 'Mac Slavo',
 'Natural News Editors',
 'Activist Post',
 'The Anti-Media',
 'Amy Moreno',
 'admin',
 'Anonymous',
 'Contributing Author',
 'Eric Striker',
 'SeekSearchDestory',
 'David Stockman',
 'wmw_admin',
 'Editor',
 'beforeitsnews.com',
 'Geoffrey Grider',
 'T Steelman',
 'Zero Hedge',
 'The European Union Times',
 'Andrew Anglin',
 'Colin Taylor',
 'Ari Lieberman',
 'Nathan J. Robinson',
 'Dikran Arakelian (noreply@blogger.c

In [11]:
# Filter authors whose label is 0
authors_label_0 = df[df['label'] == 0]['author'].unique().tolist()
authors_label_0

['Daniel J. Flynn',
 'Daniel Nussbaum',
 'Alissa J. Rubin',
 nan,
 'Megan Twohey and Scott Shane',
 'Aaron Klein',
 'Chris Tomlinson',
 'Jack Williams',
 'Michael Corkery and Stacy Cowley',
 'Jeff Poor',
 'Jerome Hudson',
 'Pam Key',
 'Donald G. McNeil Jr. and Pam Belluck',
 'Aaron Klein and Ali Waked',
 'Jim Dwyer',
 'Mark Landler',
 'Ian Hanchett',
 'Steven Erlanger',
 'Clifford Krauss',
 'Leslie Picker, Danny Hakim and Michael J. de la Merced',
 'Andrew Higgins',
 'AWR Hawkins',
 'Aaron E. Carroll',
 'Dr. Susan Berry',
 'Katie Rogers',
 'Ian Mason',
 'Jim Rutenberg, Ben Protess and Emily Steel',
 'Brian X. Chen',
 'Virginia Hale',
 'Nate Church',
 'Jeff Gordinier',
 'Neil Irwin',
 'Kenneth Chang and Sewell Chan',
 'A. O. Scott',
 'Kristina Wong',
 'Jeremy W. Peters',
 'Jacey Fortin',
 'Brett Anderson',
 'Jane Perlez',
 'Josh Katz',
 'Andrea Kannapell and Sandra Stevenson',
 'Breitbart London',
 'Nicholas Kulish, Vivian Yee, Caitlin Dickerson, Liz Robbins, Fernanda Santos and Jennife

In [None]:
authors_label_1 = set(df[df['label'] == 1]['author'].unique())

authors_label_0 = set(df[df['label'] == 0]['author'].unique())

common_authors = authors_label_1.intersection(authors_label_0)

print("Authors in both label 1 and label 0:")
print(common_authors)


Authors in both label 1 and label 0:
{nan, 'Pamela Geller', 'Ann Coulter', 'AFP', 'Reuters', 'Pam Key'}
