# TEXT VECTORIZATION — COUNT VECTORIZER & TF-IDF

### LIBRARIES

In [1]:
import pandas as pd                           # FOR DATA HANDLING
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer  # FOR TEXT VECTORIZATION
from sklearn.model_selection import train_test_split   # FOR TRAIN-TEST SPLIT
import matplotlib.pyplot as plt               # FOR VISUAL CHECKS

### LOAD CLEANED DATASET 

In [2]:
# READ THE CLEANED CSV (EXPECTED TO HOLD 'clean_text' AND 'label' COLUMNS)
csv_path = "../DATA/DATA[C].csv"
df = pd.read_csv(csv_path)

# REPORT DIMENSIONS AND COLUMN NAMES, THEN PREVIEW THE FIRST ROWS
for caption, value in (("SHAPE:", df.shape), ("COLUMNS:", df.columns)):
    print(caption, value)
df.head()

SHAPE: (38647, 3)
COLUMNS: Index(['text', 'label', 'clean_text'], dtype='object')


Unnamed: 0,text,label,clean_text
0,"21st Century Wire says Ben Stein, reputable pr...",0,century wire say ben stein reputable professor...
1,WASHINGTON (Reuters) - U.S. President Donald T...,1,washington reuters president donald trump remo...
2,(Reuters) - Puerto Rico Governor Ricardo Rosse...,1,reuters puerto rico governor ricardo rossello ...
3,"On Monday, Donald Trump once again embarrassed...",0,monday donald trump embarrassed country accide...
4,"GLASGOW, Scotland (Reuters) - Most U.S. presid...",1,glasgow scotland reuters presidential candidat...


### SPLIT FEATURES (TEXT) AND TARGET (LABEL)

In [3]:
# SET INDEPENDENT VARIABLE: CLEANED TEXT
# MISSING ENTRIES ARE REPLACED WITH EMPTY STRINGS HERE, BEFORE ANY SPLIT, SO
# EVERY DOWNSTREAM CONSUMER OF X (TRAIN AND TEST ALIKE) RECEIVES STRINGS ONLY.
# THE TEXT VECTORIZERS EXPECT STRING DOCUMENTS; NaN IS NOT A VALID DOCUMENT.
# NOTE(REVIEW): PRESUMABLY ROWS WHOSE CLEANED TEXT ENDED UP EMPTY ARE READ BACK
# FROM THE CSV AS NaN — CONFIRM AGAINST THE CLEANING STEP.
X = df['clean_text'].fillna('')

# SET TARGET VARIABLE: LABEL (0 = FAKE, 1 = REAL)
y = df['label']

### TRAIN-TEST SPLIT

In [4]:
# HOLD OUT 20% OF THE ROWS FOR TESTING; STRATIFY ON y AND FIX THE SEED AT 42
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# REPORT THE SIZE OF EACH TEXT SPLIT
for caption, split in (("X_train SHAPE:", X_train), ("X_test SHAPE:", X_test)):
    print(caption, split.shape)

X_train SHAPE: (30917,)
X_test SHAPE: (7730,)


### COUNT VECTORIZER — WORD FREQUENCY BASED

In [7]:
# SWAP NaN ENTRIES FOR EMPTY STRINGS IN BOTH TEXT SPLITS
X_train, X_test = X_train.fillna(''), X_test.fillna('')

# WORD-COUNT VECTORIZER CAPPED AT 5000 FEATURES
count_vectorizer = CountVectorizer(max_features=5000)

# FIT ON THE TRAINING TEXT ONLY, THEN REUSE THE FITTED VECTORIZER ON THE TEST TEXT
X_train_count = count_vectorizer.fit_transform(X_train)
X_test_count = count_vectorizer.transform(X_test)

# REPORT THE SHAPE OF BOTH TRANSFORMED MATRICES
for caption, matrix in (
    ("COUNT VECTORIZER - X_train SHAPE:", X_train_count),
    ("COUNT VECTORIZER - X_test SHAPE:", X_test_count),
):
    print(caption, matrix.shape)

COUNT VECTORIZER - X_train SHAPE: (30917, 5000)
COUNT VECTORIZER - X_test SHAPE: (7730, 5000)


### TF-IDF VECTORIZER — TERM FREQUENCY-INVERSE DOCUMENT FREQUENCY

In [8]:
# SWAP NaN ENTRIES FOR EMPTY STRINGS IN BOTH TEXT SPLITS
X_train, X_test = X_train.fillna(''), X_test.fillna('')

# TF-IDF VECTORIZER CAPPED AT 5000 FEATURES
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

# FIT ON THE TRAINING TEXT ONLY, THEN REUSE THE FITTED VECTORIZER ON THE TEST TEXT
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# REPORT THE SHAPE OF BOTH TRANSFORMED MATRICES
for caption, matrix in (
    ("TF-IDF VECTORIZER - X_train SHAPE:", X_train_tfidf),
    ("TF-IDF VECTORIZER - X_test SHAPE:", X_test_tfidf),
):
    print(caption, matrix.shape)

TF-IDF VECTORIZER - X_train SHAPE: (30917, 5000)
TF-IDF VECTORIZER - X_test SHAPE: (7730, 5000)
