In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Download the NLTK resources this notebook relies on: the English stop-word
# list (read through nltk.corpus.stopwords) and the WordNet corpus (needed by
# WordNetLemmatizer). The last call's return value is shown as the cell output.
nltk.download('stopwords')
nltk.download('wordnet')



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dimpi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\dimpi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Step 1: Load and explore the Datasets

##### Load the Datasets

In [2]:
# Read the CSV into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='15_fake_news_detection.csv')
df.head(n=5)

Unnamed: 0,title,text,label
0,Government Announces New Education Reforms,The education ministry has proposed reforms to...,real
1,Economy Shows Signs of Recovery,The new study conducted by international resea...,real
2,Aliens Land in Central Park,Sources claim that extraterrestrial beings wer...,fake
3,Aliens Land in Central Park,The celebrity stated that secret documents rev...,fake
4,New Study Reveals Health Benefits of Walking,The education ministry has proposed reforms to...,real


#### Check for Missing Values

In [3]:
# Per-column count of missing values (isna is the same check as isnull).
df.isna().sum()

title    0
text     0
label    0
dtype: int64

In [4]:
# Class balance: number of rows for each distinct value of 'label'.
df.loc[:, 'label'].value_counts()

label
real    532
fake    468
Name: count, dtype: int64

### Step 2: Preprocess the Dataset

In [5]:
def _clean_text_column(column, stop_words, token_normalizers):
    """Return a cleaned copy of one text column (lowercase, letters only, no stopwords)."""
    # Missing values become '' so that every entry is a plain string.
    cleaned = column.fillna('').astype(str).str.lower()
    # Anything that is not an ASCII letter or whitespace is deleted outright
    # (not replaced by a space), so "well-known" becomes "wellknown".
    cleaned = cleaned.apply(lambda doc: re.sub(r'[^a-zA-Z\s]', '', doc))
    cleaned = cleaned.apply(
        lambda doc: ' '.join(word for word in doc.split() if word not in stop_words)
    )
    # Normalizers run one after another, each over the re-split output of the
    # previous one (stemming first, then lemmatization, when both are enabled).
    for normalize in token_normalizers:
        cleaned = cleaned.apply(
            lambda doc, normalize=normalize: ' '.join(normalize(word) for word in doc.split())
        )
    return cleaned


def _tfidf_frame(column):
    """Fit a new TfidfVectorizer on `column` and return its dense matrix as a DataFrame."""
    vectorizer = TfidfVectorizer()
    matrix = vectorizer.fit_transform(column)
    # Reuse the column's index: pd.concat(axis=1) aligns on index labels, so a
    # default RangeIndex here would misalign rows for any non-default input index.
    return pd.DataFrame(
        matrix.toarray(),
        columns=vectorizer.get_feature_names_out(),
        index=column.index,
    )


def preprocess_text(df, use_stemming=False, use_lemmatization=True):
    """
    Preprocesses the 'title' and 'text' columns of a given DataFrame.

    Steps:
    1. Convert text to lowercase (missing values are treated as '')
    2. Remove punctuation & special characters (everything except ASCII
       letters and whitespace)
    3. Remove English stopwords
    4. Apply stemming and/or lemmatization (stemming runs first if both
       flags are set)
    5. Convert text into numerical form using TF-IDF, with one vectorizer
       fitted on 'title' and a separate one fitted on 'text'

    The input DataFrame is not modified; all work happens on a copy.

    Parameters:
    - df: pandas DataFrame containing 'title' and 'text' columns.
    - use_stemming: Boolean flag to apply stemming (default: False)
    - use_lemmatization: Boolean flag to apply lemmatization (default: True)

    Returns:
    - A new DataFrame with the same index as `df`: the original columns
      (with 'title' and 'text' replaced by their cleaned versions) followed
      by the title TF-IDF columns and then the text TF-IDF columns.
      Feature columns are named after the vocabulary terms, so a term that
      occurs in both vocabularies yields two columns with the same name.
    """
    stop_words = set(stopwords.words('english'))

    token_normalizers = []
    if use_stemming:
        token_normalizers.append(PorterStemmer().stem)
    if use_lemmatization:
        token_normalizers.append(WordNetLemmatizer().lemmatize)

    # Work on a copy so the caller's DataFrame keeps its raw text.
    processed = df.copy()
    for column_name in ('title', 'text'):
        processed[column_name] = _clean_text_column(
            processed[column_name], stop_words, token_normalizers
        )

    title_tfidf_df = _tfidf_frame(processed['title'])
    text_tfidf_df = _tfidf_frame(processed['text'])

    # All three frames share one index, so this is a pure column-wise append.
    return pd.concat([processed, title_tfidf_df, text_tfidf_df], axis=1)



In [6]:
# Run the preprocessing function with its default flags written out
# (no stemming, lemmatization on), then preview the first five rows.
df = preprocess_text(df, use_stemming=False, use_lemmatization=True)
df.head(n=5)

Unnamed: 0,title,text,label,aging,alien,announces,benefit,celebrity,central,cure,...,source,spaceship,stated,steadily,stepping,study,tea,two,village,walking
0,government announces new education reform,education ministry proposed reform modernize c...,real,0.0,0.0,0.488463,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,economy show sign recovery,new study conducted international researcher s...,real,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.323667,0.0,0.0,0.0,0.323667
2,alien land central park,source claim extraterrestrial being seen stepp...,fake,0.0,0.5,0.0,0.0,0.0,0.5,0.0,...,0.389794,0.389794,0.0,0.0,0.389794,0.0,0.0,0.0,0.0,0.0
3,alien land central park,celebrity stated secret document reveal shocki...,fake,0.0,0.5,0.0,0.0,0.0,0.5,0.0,...,0.0,0.0,0.362895,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,new study reveals health benefit walking,education ministry proposed reform modernize c...,real,0.0,0.0,0.0,0.443386,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# # 1. Converting text to lowercase
# df['title']=df['title'].str.lower()
# df['text']=df['text'].str.lower()
# df.head()

In [8]:
# #  2.Remove Punctuation & Special Characters
# df['title'] = df['title'].apply(lambda x: re.sub(r'[^a-zA-Z\s]', '', x))
# df['text'] = df['text'].apply(lambda x: re.sub(r'[^a-zA-Z\s]', '', x))
# df.head()

In [9]:
# # 3. Remove Stopwords

# #Load English Stopwords
# stop_words=set(stopwords.words('english'))
# df['title']=df['title'].apply(lambda x:x.split())
# df['text']=df['text'].apply(lambda x:x.split())
# df['title']=df['title'].apply(lambda words: [word for word in words if word.lower() not in stop_words])
# df['text']=df['text'].apply(lambda words:[word for word in words if word.lower() not in stop_words] )
# df.head()


In [10]:
# # 4. Perform Stemming or Lemmatization
# #Stemming
# stemmer=PorterStemmer()
# df['title']=df['title'].apply(lambda words:[stemmer.stem(word) for word in words])
# df['text']=df['text'].apply(lambda words:[stemmer.stem(word) for word in words])

# #Lemmatization
# lemmatizer=WordNetLemmatizer()
# df['title']=df['title'].apply(lambda words:[lemmatizer.lemmatize(word) for word in words])
# df['title']=df['text'].apply(lambda words:[lemmatizer.lemmatize(word) for word in words])
# df.head()

In [11]:
# from sklearn.feature_extraction.text import TfidfVectorizer

# corpus = ["Fake news is spreading", "Real news provides facts"]
# vectorizer = TfidfVectorizer()
# X = vectorizer.fit_transform(corpus)

# print(vectorizer.get_feature_names_out())  # Vocabulary
# print(X.toarray())  # TF-IDF matrix
# ['fake' 'facts' 'is' 'news' 'provides' 'real' 'spreading']



In [12]:
# # 5. Tokenization
# df['title']=df['title'].apply(lambda x:word_tokenize(str(x)))
# df['text']=df['text'].apply(lambda x: word_tokenize(str(x)))
# df.head()
# # # 6. Convert text into numerical form --> using TF-IDF (Term Frequency-Inverse Document Frequency)
# vectorizer_title = TfidfVectorizer()
# vectorizer_text = TfidfVectorizer()

# # Ensure text data is clean
# df['title'] = df['title'].fillna('').astype(str)
# df['text'] = df['text'].fillna('').astype(str)

# # Apply TF-IDF separately
# title_tfidf_matrix = vectorizer_title.fit_transform(df['title'])
# text_tfidf_matrix = vectorizer_text.fit_transform(df['text'])

# # Convert to DataFrame
# title_tfidf_df = pd.DataFrame(title_tfidf_matrix.toarray(), columns=vectorizer_title.get_feature_names_out())
# text_tfidf_df = pd.DataFrame(text_tfidf_matrix.toarray(), columns=vectorizer_text.get_feature_names_out())

# # Merge with original DataFrame
# df = pd.concat([df, title_tfidf_df, text_tfidf_df], axis=1)

# # Display transformed data
# df.head()
# # df['title']=vectorizer.fit_transform(df['title'])


### Step 3: Split Data into Training and Testing Sets

In [13]:
#1.  Define Feature(X) and target (y)
X=df.drop(columns=['label'])
Y=df['label']
#2. Perform Train-Test Split
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2,random_state=42,stratify=Y)

### Step 4: Train the SVM Model

### Step 5: Evaluate the Model

### Step 6: Optimize the Model

### Step 7: Deploy the Model (Optional)